commit cd5554d22f1d7ad2212e6299f85d564c1ce1b35b Author: CI Date: Fri Aug 22 08:50:18 2025 +0000 Build branch main with version main (173327cc) Build pipeline: vsh-ci-build-template-k4qzr Source commit: https://github.com/openpipelines-bio/openpipeline/commit/173327cc5670aa8bd5cf473827de80b602c90092 Source message: Cellranger multi conversion: fix combined AB + CB probe experiments (#1062) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..0f532372 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# Apply ruff as linter for python code +9e87b864f699c371b444b592a19e610a3c9d3286 + \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..8c270bd8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +src/mapping/bd_rhapsody*/*.cwl linguist-generated +src/query/cellxgene_census linguist-generated \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..16fe6e22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Jupyter notebooks +.ipynb_checkpoints + +# pycache +*__pycache__* +.nfs* + +# R related +.Rhistory +*.Rproj +.Rproj.user + +# Python virtual environments +.venv + +# temporary files related +temp + +# NextFlow +work/ +.nextflow.log +flowchart.* +.nextflow* +out/ + +# Macos +.DS_Store + +# viash related +.viash_log* +log.txt +check_results/ +out/ +output* +output_log/ +resources_test +/viash_tools/ + +# vscode +.vscode/launch.json +.vscode/settings.json \ No newline at end of file diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..827a151b --- /dev/null +++ b/.lintr @@ -0,0 +1,3 @@ +exclusions: list( + "README.qmd" + ) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..90b19c61 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,24 @@ + +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.12.1 + hooks: + - id: ruff-check + args: [ --fix ] + - id: ruff-format +- repo: local + hooks: + - id: run_styler + name: run_styler + language: r + description: style files with {styler} + entry: "Rscript -e 'styler::style_file(commandArgs(TRUE))'" + files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$' + additional_dependencies: + - styler + - knitr +- repo: https://github.com/lorenzwalthert/precommit + rev: v0.4.3.9012 + hooks: + - id: lintr diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..912140e2 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,626 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\' represents the directory delimiter on Windows systems, it +# can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.10 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=BaseException, + Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + line-too-long, + missing-module-docstring, + redefined-outer-name + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: en_AG (hunspell), en_AU +# (hunspell), en_BS (hunspell), en_BW (hunspell), en_BZ (hunspell), en_CA +# (hunspell), en_DK (hunspell), en_GB (hunspell), en_GH (hunspell), en_HK +# (hunspell), en_IE (hunspell), en_IN (hunspell), en_JM (hunspell), en_MW +# (hunspell), en_NA (hunspell), en_NG (hunspell), en_NZ (hunspell), en_PH +# (hunspell), en_SG (hunspell), en_TT (hunspell), en_US (hunspell), en_ZA +# (hunspell), en_ZM (hunspell), en_ZW (hunspell). +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..ee0457d1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,1662 @@ +# openpipelines 3.0.0 + +## BREAKING CHANGES + +* `transfer/publish`: remove component after deprecating it in 2.1.0 (PR #1019). + +* Removed `split_h5mu_train_test` component (PR #1020). + +* `tar_extract` has been deprecated and will be removed in openpipeline 4.0 (PR #1014). Use [vsh://toolbox/bgzip](https://www.viash-hub.com/packages/toolbox/latest/components/bgzip) instead. + +* `compress_h5mu`: rename `compression` argument to `output_compression` (PR #1017, PR #1018). + +* `delimit_fraction`: remove unused `layer` argument (PR #1018). + +* `download_file` has been deprecated and will be removed in openpipeline 3.0 (PR #1015). + +* `scarches`: Loading of legacy models no longer asumes the model to based on SCANVI. An argument (`reference_class`) was added which need to be set in this case (PR #1035). + +* `convert/from_h5mu_to_seurat` has been deprecated and will be removed in openpipeline 4.0. Use `convert/from_h5mu_or_h5ad_to_seurat` instead (PR #1046). + +## NEW FUNCTIONALITY + +* `liana`: enabled jobs to be run in parallel and added two new arguments: `consensus_opts`, `de_method` (PR #1039) + +* `from_h5mu_or_h5ad_to_seurat`: converts an h5ad file or a single modality from an h5mu file to a seurat object (PR #1046). + +## EXPERIMENTAL + +Warning: These experimental features are subject to change in future releases. + +* Added `from_h5mu_or_h5ad_to_tiledb` component (PR #1034). + +* Added `differential_expression/create_pseudobulk`: Generation of pseudobulk samples from single-cell transcriptomics data, + to create bulk-like expression profiles suitable for differential expression analysis with methods designed for bulk differential expression analysis (PR #1042). + +* Added `annotate/singler`: Cell type annotation using SingleR (PR #1051). + +## MAJOR CHANGES + +* `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). + +* `leiden`: bump base container to 3.13 (PR #1030). + +* `scanvi`, `scarches`, `scvi` and `totalvi`: bump scvi-tools to `1.3.1` and base image to `nvcr.io/nvidia/pytorch:25.05-py3` (PR #1035). + +* `lianapy`: update liana to `1.5.0` (PR #1039) + +## MINOR CHANGES + +* `velocyto`: pin base container to `python:3.10-slim-bookworm` (PR #1063). + +* `mapping/cellranger_multi`: The output from Cell Ranger is now displayed as Cell Ranger is running (PR #1045). + +* Remove `workflows` directory (PR #993). The workflows which were at one point in this directory were all deprecated and moved to `src/workflows`. + +* Move output file compression argument for AnnData and MuData files to a base config file (`src/base/h5_compression_argument.yaml`) (PR #1017). + +* Add missing descriptions to components and arguments (PR #1018). + +* Add `scope` to component and workflow configurations (see https://viash.io/reference/config/scope.html) (PR #1013 and #1032). + +* `workflows/multiomics/process_samples`: Add optional `--skip_scrublet_doublet_detection` flag to bypass Scrublet doublet detection. Scrublet doublet detection runs by default and can now be optionally disabled (PR #1049). + +* Nextflow runner: use `resourceLimits` directive in the labels config to set a global limit on the memory (PR #1060). + +## BUG FIXES + +* `cellranger_multi`: Fix error when running Cell Ranger without any computational resources specified (PR #1056) + +* Bump viash to 0.9.4. This adds support for nextflow versions starting major version 25.01 and fixes an issue where an integer being passed to a argument with `type: double` resulted in an error (PR #1016). + +* Fix running `neigbors_leiden_umap` workflow with `-stub` enabled (PR #1026). + +* Add missing CUDA enabled `jaxlib` to components that use `scvi-tools` (`scanvi`, `scarches`, `scvi` and `totalvi`) (PR #1028) + +* `leiden`: fix issue where the logging system was shut down prematurely after the calculations were done (PR #1030) +* Added missing `gpu` label to `scarches` component (PR #1027). + +* `conversion/from_cellranger_multi_to_h5mu`: fix conversion to MuData for experiments that combine probe barcodes with other feature barcodes (e.g. Antibody Capture and CIRSPR Guide Capture) (PR #1062). + +# openpipelines 2.1.2 + +## DOCUMENTATION + +* Update README (PR #1024, backported from #1012). + +# openpipelines 2.1.1 + +## BUG FIXES + +* Add support for nextflow versions starting major version 25.01 (PR #1009). + +* Fix an issue where an interger being passed to a argument with `type: double` resulted in an error (PR #1009). + +# openpipelines 2.1.0 + +## BREAKING CHANGES + +* Deprecation of `metadata/duplicate_obs` and `metadata/duplicate_var` components (PR #952). + +* Deprecation of `workflows/annotation/scgpt_integration_knn` component (PR #952). + +* `annotate/scanvi`: Remove scarches functionality from this component, as it is already covered in `integrate/scarches` (PR #986). + +## NEW FUNCTIONALITY + +* `dataflow/concatenate_h5mu`: add `modality` parameter (PR #977). + +* `filter_with_scrublet`: add `expected_doublet_rate`, `stdev_doublet_rate`, `n_neighbors` and `sim_doublet_ratio` arguments (PR #974). + +* `feature_annotation/aling_query_reference`: Added a component to align a query and reference dataset (PR #948, #958, #972). + +* `workflows/qc/qc` workflow: Added ribosomal gene detection (PR #961). + +* `workflows/rna/rna_singlesample`, `workflows/multiomics/process_samples` workflows: Added ribosomal gene detection (PR #968). + +* `scanvi`: enable CUDA acceleration (PR #969). + +* `workflows/annotation/scvi_knn` workflow: Cell-type annotation based on scVI integration followed by KNN label transfer (PR #954). + +* `convert/from_h5ad_to_seurat`: Add component to convert from h5ad to Seurat (PR #980). + +* `workflows/annotation/scanvi_scarches` workflow: Cell-type annotation based on scANVI integration and annotation with scArches for reference mapping (PR #898). + +* `integrate/scarches`: Implemented functionality to align the query dataset with the model registry and extend functionality to predict labels for scANVI models (PR #898). + +* `workflows/annotation/harmony_knn` workflow: Cell-type annotation based on harmony integration with KNN label transfer (PR #836). + +* `from_cellranger_multi_to_h5mu`: add support for `custom` modality (PR #982). + +* `integrate/scvi`: Enable passing any .var field for gene name information instead of .var index, using the `--var_gene_names` parameter (PR #986). + +## MAJOR CHANGES + +* Several components: when a component processes a single modality, only that modality is read into memory (PR #944) + +* The `transfer/publish` component is deprecated and will be removed in a future major release (PR #941). + + +# MINOR CHANGES + +* Bump viash to `0.9.3` (PR #995). + +* Several workflows: refactor neighbors, leiden and UMAP in a separate subworkflow (PR #942 and PR #949). + +* `grep_annotation_column` and `subset_obsp`: Fix compatibility for SciPy (PR #945). + +* `popv`: Pin numpy<2 after new release of scvi-tools (PR #946). + +* Various components (`scgpt` and `annotate`): Add resource labels (PR #947, PR #950). + +* `feature_annotation/highly_variable_features_scanpy`: Enable calculation of HVG on a subset of genes (PR #957, PR #959). + +* `integrate/scvi`, `integrate/totalvi` and `integrate/scarches`: update base image to nvcr.io/nvidia/pytorch:24.12-py3, pin scvi-tools version to 1.1.5, unpin jax and jaxlib version (PR #970). + +* `annotate/celltypist`: Enable passing any layer with log normalized counts, enforce checking whether counts are log normalized (PR #971). + +* `process_10xh5/filter_10xh5`: update container base to ubuntu 24.04 (PR #983). + +# BUG FIXES + +* Fix `-stub` runs (PR #1000). + +* `cluster/leiden`: Fix an issue where insufficient shared memory (size of `/dev/shm`) causes the processing to hang. + +* `utils/subset_vars`: Convert .var column used for subsetting of dtype "boolean" to dtype "bool" when it doesn't contain NaN values (PR #959). + +* `resources_test_scripts/annotation_test_data.sh`: Add a layer to the annotation reference dataset with log normalized counts (PR #960). + +* `annotate/celltypist`: Fix missing values in annotation column caused by index misalignment (PR #976). + +* `workflows/annotation/scgpt_annotation` and `workflows/integrate/scgpt_leiden`: Parameterization of HVG flavor with default method `cell_ranger` instead of `seurat_v3` (PR #979). + +* `dataflow/merge`: Resolved an issue where merging two MuData objects with overlapping `var` or `obs` columns sometimes resulted in an unsupported nullable dtype (e.g. merging `pd.IntegerDtype` and `pd.FloatDtype`). These columns are now correctly cast to their native numpy dtypes before writing(PR #990). + +* `workflows/annotation/harmony_knn`: Only process RNA modality in the workflow (PR #988). + +# openpipelines 2.0.0 + +## BREAKING CHANGES + +* `velocity/scvelo`: update `scvelo` to `0.3.3`, which also removes support for using `loom` input files. The component now uses a `MuData` object as input. Several arguments were added to support selecting different inputs from the MuData file: `counts_layer`, `modality`, `layer_spliced`, `layer_unspliced`, `layer_ambiguous`. An `output_h5mu` argument was has been added (PR #932). + +* `src/annotate/onclass` and `src/annotate/celltypist`: Input parameter for gene name layers of input datasets has been updated to `--input_var_gene_names` and `reference_var_gene_names` (PR #919). + +* Several components under `src/scgpt` (`cross_check_genes`, `tokenize_pad`, `binning`) now processes the input (query) datasets differently. Instead of subsetting datasets based on genes in the model vocabulary and/or highly variable genes, these components require an input .var column with a boolean mask specifying this information. The results are written back to the original input data, preserving the dataset structure (PR #832). + +* `query/cellxgene_census`: The default output layer has been changed from `.layers["counts"]` to `.X` to be more aligned with the standard OpenPipelines format (PR #933). + Use argument `--output_layer_counts counts` to revert the behaviour to the previous default. + +## NEW FUNCTIONALITY + +* `velocyto_to_h5mu`: now writes counts to `.X` (PR #932) + +* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868). + +* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832). + +* `workflows/annotation/scgpt_integration_knn` workflow: Cell-type annotation based on scGPT integration with KNN label transfer (PR #875). + +* CI: Use `params.resources_test` in test workflows in order to point to an alternative location (e.g. a cache) (PR #889). + +## MINOR CHANGES + +* Pin `scikit-learn` for `labels_transfer/xgboost` to `<1.6` (PR #931). + +* `filter/filter_with_scrublet`: provide cleaner error message when running scrublet on an empty modality (PR #929). + +* Several component (cleanup): remove workaround for using being able to use shared utility functions with Nextflow Fusion (PR #920). + +* `scgpt/cell_type_annotation` component update: Added support for multi-processing (PR #832). + +* Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameteres to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets and implemented logic to allow for subsetting of genes (PR #919). + +* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832). + +* `scgpt/cross_check_genes` component update: Highly variable genes are now cross-checked based on the boolean mask in `var_input`. The filtering information is stored in the `--output_var_filter` .var field instead of subsetting the dataset (PR #832). + +* `scgpt/binning` component update: This component now requires the `--var_input` parameter to provide gene filtering information. Binned data is written to the `--output_obsm_binned_counts` .obsm field in the original input data (PR #832). + +* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832). + +* `resources_test_scripts/scgpt.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926). + +* `workflows/integration/scgpt_leiden` workflow update: Update workflow such that input dataset is not subsetted for HVG but uses boolean masks in .var field instead (PR #875). + +## BUG FIXES + +* `scvi_leiden` workflow: fix the input layer argument of the workflow not being passed to the scVI component (PR #936 and PR #938). + +* `scgpt/embedding`: remove unused argument `dbsn` (PR #875). + +* `scgpt/binning`: update handling of empty rows in sparse matrices (PR #875). + +* `dataflow/split_h5mu`: Update memory label from `lowmem` to `highmem` and cpu label from `singlecpu` to `lowcpu` (PR #930). + +# openpipelines 2.0.0-rc.2 + +## BUG FIXES + +* `annotate/popv`: fix popv raising `ValueError` when an accelerator (e.g. GPU) is unavailable (PR #915). + +## MINOR CHANGES + +* `dataflow/split_h5mu`: Optimize resource usage of the component (PR #913). + +# openpipelines 2.0.0-rc.1 + +## BREAKING CHANGES + +* Added cell multiplexing support to the `from_cellranger_multi_to_h5mu` component and the `cellranger_multi` workflow. For the `from_cellranger_multi_to_h5mu` component, the `output` argument now requires a value containing a wildcard character `*`, which will be replaced by the sample ID to form the final output file names. Additionally, a `sample_csv` argument is added to the `from_cellragner_multi_to_h5mu` component which describes the sample name per output file. No change is required for the `output_h5mu` argument from the `cellranger_multi` workflow, the workflow will just emit multiple events in case of a multiplexed run, one for each sample. The id of the events (and default output file names) are set by `--sample_ids` (in case of cell multiplexing), or (as before) by the user provided `id` for the input (PR #803 and PR #902). + +* `demux/bcl_convert`: update BCL convert from 3.10 to 4.2 (PR #774). + +* `demux/cellranger_mkfastq`, `mapping/cellranger_count`, `mapping/cellranger_multi` and `reference/build_cellranger_reference`: update cellranger to `8.0.1` (PR #774 and PR #811). + +* Removed `--disable_library_compatibility_check` in favour of `--check_library_compatibility` to the `mapping/cellranger_multi` component and the `ingestion/cellranger_multi` workflow (PR #818). + +* `lianapy`: bumped version to `1.3.0` (PR #827 and PR #862). Additionally, `groupby` is now a required argument. + +* `concat`: this component was deprecated and has now been removed, use `concatenate_h5mu` instead (PR #796). + +* The `workflows` folder in the root of the project no longer contains symbolic links to the build workflows in `target`. + Using any workflows that was previously linked in this directory will now result in an error which will indicate + the location of the workflow to be used instead (PR #796). + +* `XGBoost`: bump version to `2.0.3` (PR #646). + +* Several components: update anndata to `0.11.1` and mudata to `0.3.1` (PR #645 and PR #901), and scanpy to `1.10.4` (PR #901). + +* `filter/filter_with_hvg`: this component was deprecated and has now been removed. Use `feature_annotation/highly_variable_features_scanpy` instead (PR #843). + +* `dataflow/concat`: this component was deprecated and has now been removed. Use `dataflow/concatenate_h5mu` instead (PR #857). + +* `convert/from_h5mu_to_seurat`: bump seurat to latest version (PR #850). + +* `workflows/ingestion/bd_rhapsody`: Upgrade BD Rhapsody 1.x to 2.x, thereby changing the interface of the workflow (PR #846). + +* `mapping/bd_rhapsody`: Upgrade BD Rhapsody 1.x to 2.x, thereby changing the interface of the workflow (PR #846). + +* `reference/make_bdrhap_reference`: Upgrade BD Rhapsody 1.x to 2.x, thereby changing the interface of the workflow (PR #846). + +* `reference/build_star_reference`: Rename `mapping/star_build_reference` to `reference/build_star_reference` (PR #846). + +* `reference/cellranger_mkgtf`: Rename `reference/mkgtf` to `reference/cellranger_mkgtf` (PR #846). + +* `labels_transfer/xgboost`: Align interface with new annotation workflow + - Store label probabilities instead of uncertainties + - Take `.h5mu` format as an input instead of `.h5ad` + +* `reference/build_cellranger_arc_reference`: a default value of "output" is now specified for the argument `--genome`, inline with `reference/build_cellranger_reference` component. Additionally, providing a value for `--organism` is no longer required and its default value of `Homo Sapiens` has been removed (PR #864). + +## MAJOR CHANGES + +* Bump popv to `0.4.2` (PR #901) + +## NEW FUNCTIONALITY + +* Added `demux/cellranger_atac_mkfastq` component: demultiplex raw sequencing data for ATAC experiments (PR #726). + +* `process_samples`, `process_batches` and `rna_multisample` workflows: added functionality to scale the log-normalized + gene expression data to unit variance and zero mean. The scaled data will be output to a different layer and the + representation with reduced dimensions will be created and stored in addition to the non-scaled data (PR #733). + +* `transform/scaling`: add `--input_layer` and `--output_layer` arguments (PR #733). + +* CI: added checking of mudata contents for multiple workflows (PR #783). + +* Added multiple arguments to the `cellranger_multi` workflow in order to maintain feature parity with the `mapping/cellranger_multi` component (PR #803). + +* `convert/from_cellranger_to_h5mu`: add support for antigen analysis. + +* Added `demux/cellranger_atac_mkfastq` component: demultiplex raw sequencing data for ATAC experiments (PR #726). + +* Added `reference/build_cellranger_reference` component: build reference file compatible with ATAC and ATAC+GEX experiments (PR #726). + +* `demux/bcl_convert`: add support for no lane splitting (PR #804). + +* `reference/cellranger_mkgtf` component: Added cellranger mkgtf as a standalone component (PR #771). + +* `scgpt/cross_check_genes` component: Added a gene-model cross check component for scGPT (PR #758). + +* `scgpt/embedding`: component: Added scGPT embedding component (PR #761) + +* `scgpt/tokenize_pad`: component: Added scGPT padding and tokenization component (PR #754). + +* `scgpt/binning` component: Added a scGPT pre-processing binning component (PR #765). + +* `workflows/integration/scgpt_leiden` workflow with scGPT integration followed by Leiden clustering (PR #794). + +* `scgpt/cell_type_annotation` component: Added scGPT cell type annotation component (PR #798). + +* `resources_test_scripts/scGPT.sh`: Added script to include scGPT test resources (PR #800). + +* `transform/clr` component: Added the option to set the `axis` along which to apply CLR. Possible to override + on workflow level as well (PR #767). + +* `annotate/celltypist` component: Added a CellTypist annotation component (PR #825). + +* `dataflow/split_h5mu` component: Added a component to split a single h5mu file into multiple h5mu files based on the values of an .obs column (PR #824). + +* `workflows/test_workflows/ingestion` components & `workflows/ingestion`: Added standalone components for integration testing of ingestion workflows (PR #801). + +* `workflows/ingestion/make_reference`: Add additional arguments passed through to the STAR and BD Rhapsody reference components (PR #846). + +* `annotate/random_forest_annotation` component: Added a random forest cell type annotation component (PR #848). + +* `dataflow/concatenate_h5mu`: data from `.uns`, both originating from the global and per-modality slots, is now retained in the final concatenated output object. Additionally, added the `uns_merge_mode` argument in order to tune the behavior when conflicting keys are detected across samples (PR #859). + +* `dimred/densmap` component: Added a densMAP dimensionality reduction component (PR #748). + +* `annotate/scanvi` component: Added a component to annotate cells using scANVI (PR #833). + +* `transform/bpcells_regress_out` component: Added a component to regress out effects of confounding variables in the count matrix using BPCells (PR #863). + +* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863). + +* `workflows/ingestion/make_reference`: add possibility to build CellRanger ARC references. Added `--motifs_file`, `--non_nuclear_contigs` and `--output_cellranger_arc` arguments (PR #864). + +* Test resources (reference_gencodev41_chr1): switch reference genome for CellRanger to ARC variant (PR #864). + +* `transform/bpcells_regress_out` component: Added a component to regress out effects of confounding variables in the count matrix using BPCells (PR #863). + +* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863). + +* Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870). + +* Added `dimred/lsi` component (PR #552). + +* `metadata/duplicate_obs` component: Added a component to make a copy from one .obs field or index to another .obs field within the same MuData object (PR #874, PR #899). + +* `annotate/onclass`: component: Added a component to annotate cell types using OnClass (PR #844). + +* `annotate/svm` component: Added a component to annotate cell types using support vector machine (SVM) (PR #845). + +* `metadata/duplicate_var` component: Added a component to make a copy from one .var field or index to another .var field within the same MuData object (PR #877, PR #899). + +* `filter/subset_obsp` component: Added a component to subset an .obsp matrix by column based on the value of an .obs field. The resulting subset is moved to an .obsm field (PR #888). + +* `labels_transfer/knn` component: Enable using additional distance functions for KNN classification (PR #830) and allow to perform KNN classification based on a pre-calculated neighborhood graph (PR #890). + +## MINOR CHANGES + +* Several components: bump python version (PR #901). + +* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726). + +* `cellbender_remove_background_v0_2`: update base image to `nvcr.io/nvidia/pytorch:23.12-py3` (PR #646). + +* Bump scvelo to `0.3.2` (PR #828). + +* Pin numpy<2 for several components (PR #815). + +* Added `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: download tiny bcl file with an ATAC experiment, download a motifs file, demultiplex bcl files to reads in fastq format (PR #726). + +* `mapping/cellranger_multi` component now outputs logs on failure of the `cellranger multi` process (PR #766). + +* Bump `viash-actions` to `v6` (PR #821). + +* `reference/make_reference`: Do not try to extract genome fasta and transcriptome gtf if they are not gzipped (PR #856). + +* Changes related to syncing the test resources (PR #867): + + - Add `.info.test_resources` to `_viash.yaml` to specify where test resources need to be synced from. + - `download/sync_test_resources`: Use `.info.test_resources` in `_viash.yaml` to detect where test resources need to be synced from. + - Update CI to use `project/sync-and-cache` instead of `project/sync-and-cache-s3`. + +## BUG FIXES + +* Fix failing tests for `ingestion/cellranger_postprocessing`, `ingestion/conversion` and `multiomics/process_batches` (PR #869). + +* `convert/from_10xh5_to_h5mu`: add .uns slot to mdata root when metrics file is provided (PR #887). + +* Fix ingestion components not working when optional arguments are unset (PR #894). + +* `transform/normalize_total` component: pass the `target_sum` argument to `sc.pp.normalize_total()` (PR #823). + +* `from_cellranger_multi_to_h5mu`: fix missing `pytest` dependency (PR #897). + +## DOCUMENTATION + +* Update authorship of components (PR #835). + +# openpipelines 1.0.4 + +## BUG FIXES + +* `scvi_leiden` workflow: fix the input layer argument of the workflow not being passed to the scVI component (PR #939, backported from PR #936 and PR #938). + +# openpipelines 1.0.3 + +## BUG FIXES + +* `qc/calculate_qc_metrics`: increase total counts accuracy with low precision floating dtypes as input layer (PR # , backported from PR #852). + +# openpipelines 1.0.2 + +## BUG FIXES + +* `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) that had their + data dtype (dtype) changed as a result of adding more observations after concatenation, causing `TypeError`. + One notable example of this happening is when one of the samples does not have a multimodal annotation dataframe + which is present in another sample; causing the values being filled with `NA` (PR #842, backported from PR #837). + +# openpipelines 1.0.1 + +## BUG FIXES + +* Bump viash to `0.8.6` (PR #816, backported from #815). This changes the at-runtime generated nextflow process from an in-memory to an on-disk temporary file, which should cause less issues with Nextflow Fusion. + +# openpipelines 1.0.0-rc6 + +## BUG FIXES + +* `dataflow/concatenate_h5mu`: fix regression bug where observations are no longer linked to the correct metadata +after concatenation (PR #807) + +# openpipelines 1.0.0-rc5 + +## BUG FIXES + +* `cluster/leiden`: prevent leiden component from hanging when a child process is killed (e.g. when there is not enough memory available) (PR #805). + +# openpipelines 1.0.0-rc4 + +## BREAKING CHANGES + +* `query/cellxgene_census`: Refactored the interface, documentation and internal workings of this component (PR #621). + - Renamed arguments to align with standard OpenPipelines notations and cellxgene census API: + - `--input_database` became `--input_uri` + - `--cellxgene_release` became `--census_version` + - `--cell_query` became `--obs_value_filter` + - `--cells_filter_columns` became `--cell_filter_grouping` + - `--min_cells_filter_columns` became `--cell_filter_minimum_count` + - `--modality` became `--output_modality` + - Removed `--dataset_id` since it was no longer being used. + - Added `--add_dataset_meta` to add metadata to the output MuData object. + - Documentation of the component and its arguments was improved. + +## BUG FIXES + +* `mapping/cellranger_multi`: Fix the regex for the fastq input files to allow dropping the lane from the input file names (e.g. `_L001`) (PR #778). + +* `workflows/rna/rna_singlesample`: Fix argument passing `top_n_vars` and `obs_name_mitochondrial_fraction` to the `qc` subworkflow (PR #779). + +# openpipelines 1.0.0-rc3 + +## BREAKING CHANGES + +* Docker image names now use `/` instead of `_` between the name of the component and the namespace (PR #712). + +## BUG FIXES + +* `rna_singlesample`: fixed a bug where selecting the column for the filtering with mitochondrial fractions + using `obs_name_mitochondrial_fraction` was done with the wrong column name, causing `ValueError` (PR #743). + +* Fix publishing in `process_samples` and `process_batches` (PR #759). + +## NEW FUNCTIONALITY + +* `dimred/tsne` component: Added a tSNE dimensionality reduction component (PR #742). + +# openpipelines 1.0.0-rc2 + +## BUG FIXES + +* Cellranger multi: Fix using a relative input path for `--vdj_inner_enrichment_primers` (PR #717) + +* `dataflow/split_modalities`: remove unused `compression` argument. Use `output_compression` instead (PR #714). + +* `metadata/grep_annotation_column`: fix calculating fraction when an input observation has no counts, which caused +the result to be out of bounds. + +* Fix `--output` argument not working for several workflows (PR #740). + +## MINOR CHANGES + +* `metadata/grep_annotation_column`: Added more logging output (PR #697). + +* `metadata/add_id` and `metadata/grep_annotation_column`: Bump python to 3.11 (PR #697). + +* Bump viash to 0.8.5 (PR #697) + +* `dataflow/split_modalities`: add more logging output and bump python to 3.12 (PR #714). + +* `correction/cellbender`: Update nextflow resource labels from `singlecpu` and `lowmem` to `midcpu` and `midmem` (PR #736) + +# openpipelines 1.0.0rc1 + +## BREAKING CHANGES + +* Change separator for arguments with multiple inputs from `:` to `;` (PR #700 and #707). Now, _all_ arguments with `multiple: true` will use `;` as the separator. + This change was made to be able to deal with file paths that contain `:`, e.g. `s3://my-bucket/my:file.txt`. Furthermore, the `;` separator will become + the default separator for all arguments with `multiple: true` in Viash >= 0.9.0. + +* This project now uses viash version 0.8.4 to build components and workflows. Changes related to this version update should + be _mostly_ backwards compatible with respect to the results and execution of the pipelines. From a development perspective, + drastic updates have been made to the developemt workflow. + + Development related changes: + * Bump viash version to 0.8.4 (PR #598, PR#638 and #706) in the project configuration. + * All pipelines no longer use the anonymous workflow. Instead, these workflows were given + a name which was added to the viash config as the entrypoint to the pipeline (PR #598). + * Removed the `workflows` folder and moved its contents to new locations: + 1. The `resources_test_scripts` folder now resides in the root of the project (PR #605). + 2. All workflows have been moved to the `src/workflows` folder (PR #605). + This implies that workflows must now be build using `viash (ns) build`, just like with components. + 3. Adjust GitHub Actions to account for new workflow paths (PR #605). + 4. In order to be backwards compatible, the `workflows` folder now contains symbolic + links to the build workflows in `target`. This is not a problem when using the repository for pipeline + execution. However, if a developer wishes to contribute to the project, symlink support should be enabled + in git using `git config core.symlinks=true`. Alternatively, use + `git clone -c core.symlinks=true git@github.com:openpipelines-bio/openpipeline.git` when cloning the + repository. This avoids the symlinks being resolved (PR #628). + 4bis. With PR #668, the workflows have been renamed. This does not hamper the backwards compatibility + of the symlinks that have been described in 4, because they still use the original location + which includes the original name. + * `multiomics/rna_singlesample` has been renamed to `rna/process_single_sample`, + * `multiomics/rna_multisample` has been renamed to `rna/rna_multisample`, + * `multiomics/prot_multisample` became `prot/prot_multisample`, + * `multiomics/prot_singlesample` became `prot/prot_singlesample`, + * `multiomics/full_pipeline` was moved to `multiomics/process_samples`, + * `multiomics/multisample` has been renamed to `multiomics/process_batches`, + * `multiomics/integration/initialize_integration` changed to `multiomics/dimensionality_reduction`, + * finally, all workflows at `multiomics/integration/*` were moved to `integration/*` + + 5. Removed the `workflows/utils` folder. Functionality that was provided by the `DataflowHelper` + and `WorkflowHelper` is now being provided by viash when the workflow is being build (PR #605). + + End-user facing changes: + * The `concat` component had been deprecated and will be removed in a future release. + It's functionality has been copied to the `concatenate_h5mu` component because the name is in + conflict with the `concat` operator from nextflow (PR #598). + * `prot_singlesample`, `rna_singlesample`, `prot_multisample` and `rna_multisample`: QC statistics + are now only calculated once where needed. This means that the mitochondrial gene detection is + performed in the `rna_singlesample` pipeline and the other count based statistics are calculated + during the `prot_multisample` and `rna_multisample` pipelines. In both cases, the `qc` pipeline + is being used, but only parts of that workflow are activated by parametrization. Previously + the count based statistics were calculated in both the `singlesample` and `multisample` pipelines, + with the results from the multisample pipelines overwriting the previous results. What is breaking here + is that the qc statistics are not being added to the results of the singlesample worklows. + This is _not_ an issue when using the `full_pipeline` because in this case the singlesample and + multisample workflows are executed in-tandem. If you wish to execute the singlesample workflows + in a seperate manner and still include count based statistics, please run the `qc` pipeline + on the result of the singlesample workflow (PR #604). + * `filter/filter_with_hvg` has been renamed to `feature_annotation/highly_variable_features_scanpy`, along with the following changes (PR #667). + - `--do_filter` was removed + - `--n_top_genes` has been renamed to `--n_top_features` + * `full_pipeline`, `multisample` and `rna_multisample`: Renamed arguments (PR #667). + - `--filter_with_hvg_var_output` became `--highly_variable_features_obs_batch_key` + - `--filter_with_hvg_obs_batch_key` became `--highly_variable_features_var_output` + * `rna_multisample`: Renamed arguments (PR #667). + - `--filter_with_hvg_n_top_genes` became `--highly_variable_features_n_top_features` + - `--filter_with_hvg_flavor` became `--highly_variable_features_flavor` + +* Renamed `obsm_metrics` to `uns_metrics` for the `cellranger_mapping` workflow because the cellranger metrics are stored in `.uns` and not `.obsm` (PR #610). + +## MAJOR CHANGES + +* `mapping/cellranger_mkfastq`: update from cellranger `6.0.2` to `7.0.1` (PR #675) + +## NEW FUNCTIONALITY + +* `multisample` pipeline: This workflow now works when provided multimple unimodal files or multiple multimodal files, in addition to the previously supported single multimodal file (PR #606). The modalities are processed independently from each other: + - As before, a single multimodal file is split into several unimodal MuData objects, each modality being stored in a file. + - (New) When multiple unimodal files are provided, they can be used used as is. + - (New) Mosaic input (i.e. multiple uni- or multimodal files) are split into unimodal files. + Providing the same modality twice is not supported however, meaning the modalities should be unique. + For example, using `input: ["data1.h5mu", "data2.h5mu"]` with `data1.h5mu` providing data for `rna` and `atac` + and `data2.h5mu` for `rna` and `prot` will not work, because the `rna` modality is present in both input files. + +* `multisample` workflow: throw an error when argument values for the merge component or the `initialize_integration` workflow differ between the inputs (PR #606). + +* Added a `split_modalities` workflow in order to split a multimodal mudata files into several unimodal mudata files. Its behavior is identical to the `split_modalities` component, but it also provides functionality to make sure everything works when nextflow's `-stub` option is enabled (PR #606). + +* All workflow now use `dependencies` to handle includes from other workflows (PR #606). + +* `qc/calculate_qc_metrics`: allow setting the output column names and disabling the calculation of several metrics (PR #644). + +* `rna_multisample`, `prot_multisample` and `qc` workflows: allow setting the output column names and disabling the calculation of several metrics (PR #606). + +* `cluster/leiden`: Allow calculating multiple resolutions in parallel (PR #645). + +* `qc/calculate_qc_metrics`: allow setting the output column names and disabling the calculation of several metrics (PR #644). + +* `rna_multisample` workflow: added `--modality` argument (PR #607). + +* `multisample` workflow: in addition to using multimodal files as input, this workflow now also accepts a list of files. The list of files must be the unimodal equivalents of a split multimodal file. The modalities in the list must be unique and after processing the modalities will be merged into multimodal files (PR #606). + +* Added `filter/intersect_obs` component which removes observations that are not shared between modalities (PR #589). + +* Re-enable `convert/from_h5mu_to_seurat` component (PR #616). + +* Added the `gdo_singlesample` pipeline with basic count filtering (PR #672). + +* `process_samples` pipeline: the `--rna_layer`, `--prot_layer` and `gdo_layer` argument can not be used to specify an alternative layer to .X where the raw data are stored. To enable this feature, the following changes were required: + - Added `transform/move_layer` component. + - `filter/filter_with_scrublet`: added `--layer` argument. + - `transform/clr`: added `--input_layer` argument. + - `metadata/grep_annotation_column`: added `--input_layer` argument. + - `rna/rna_singlesample`, `rna/rna_multisample`, `prot/prot_singlesample` and `prot/prot_multisample`: add `--layer` argument. + - `process_batches`: Added `rna_layer` and `prot_layer` arguments. + +* Enable dataset functionality for nf-tower (PR #701) + +* Added `annotate/score_genes` and `annotate/score_genes_cell_cycle` to calculate scanpy gene scores (PR #703). + +## MINOR CHANGES + +* Refactored `rna_multisample` (PR #607), `cellranger_multi` (PR #609), `cellranger_mapping` (PR #610) and other (PR #606) pipelines to use `fromState` and `toState` functionality. + +* `metadata/add_id`: add more runtime logging (PR #663). + +* `cluster/leiden`: Bump python to 3.11 and leidenalg to 0.10.0 (PR #645). + +* `mapping/htseq_count_to_h5mu` and `multi_star`: update polars and gtfparse (PR #642). + +* Pin `from_h5mu_to_seurat` to use Seurat to version 4 (PR #630). + +* `velocity/scvelo`: bump scvelo to 0.3.1 and python to 3.10 (PR #640). + +* Updated the Viash YAML schemas to the latest version of Viash (PR #620). + +* `build_cellranger_reference` and `build_bdrhap_reference`: Bump go version to `1.21.4` when building seqkit for testing the component (PR #624 and PR #637). + +* `correction/cellbender_remove_background`: Remove `muon` as a test dependency (PR #636). + +* (Automatic testing) Update viashpy to 0.6.0 (PR #665). + +* `integrate/scarches`, `integrate/scvi`, `velocity/scvelo` and `integrate/totalvi`: pin jax, jaxlib to `<0.4.23` (PR #699). + +* `integrate/scvi`: Unpin `numba` and pin scvi-tools to `1.0.3` (PR #699). + +* `integrate/totalvi`: Enable GPU-accelerated computing, unpin `torchmetrics` and pin jax, jaxlib to `<0.4.23` (PR #699). + +## BUG FIXES + +* `transform/log1p`: fix `--input_layer` argument not functioning (PR #678). + +* `dataflow/concat` and `dataflow/concatenate_h5mu`: Fix an issue where using `--mode move` on samples with non-overlapping features would cause `var_names` to become unaligned to the data (PR #653). + +* `filter/filter_with_scrublet`: (Testing) Fix duplicate test function names (PR #641). + +* `dataflow/concatenate_h5mu` and `dataflow/concat`: Fix `TypeError` when using mode 'move' and a column with conflicting metadata does not exist across all samples (PR #631). + +* `dataflow/concatenate_h5mu` and `dataflow/concat`: Fix an issue where joining columns with different datatypes caused `TypeError` (PR #619). + +* `qc/calculate_qc_metrics`: Resolved an issue where statistics based on the input columns selected with `--var_qc_metrics` were incorrect when these input columns were encoded in `pd.BooleanDtype()` (PR #685). + +* `move_obsm_to_obs`: fix setting output columns when they already exist (PR #690). + +* `workflows/dimensionality_reduction` workflow: nearest neighbour calculations no longer recalcalates the PCA when `obm_input` `--obsm_pca` is not set to `X_pca`. + +* `feature_annotation/highly_variable_scanpy`: fix .X being used to remove observations with 0 counts when `--layer` has been specified. + +* `filter/filter_with_counts`: fix `--layer` argument not being used. + +* `transform/normalize_total`: fix incorrect layer being written to the output when the input layer if not `.X`. + +* `src/workflows/qc`: fix input layer not being taken into account when calculating the fraction of mitochondrial genes (always used .X). + +* `convert/from_cellranger_multi_to_h5mu`: fix metric values not repesented as percentages being devided by 100. (#704). + +# openpipelines 0.12.1 + +## BUG FIXES + +* `rna_singlesample`: Fix filtering parameters values `min_counts`, `max_counts`, `min_genes_per_cell`, `max_genes_per_cell` and `min_cells_per_gene` not being passed to the `filter_with_counts` component (PR #614). + +* `prot_singlesample`: Fix filtering parameters values `min_counts`, `max_counts`, `min_proteins_per_cell`, `max_proteins_per_cell` and `min_cells_per_protein` not being passed to the `filter_with_counts` component (PR #614). + +# openpipelines 0.12.0 + +## BREAKING CHANGES + +The detection of mitochondrial genes has been revisited in order to remove the interdependency with the count filtering and the QC metric calculation. +Implementing this changes involved breaking some existing functionality: + +* `filter/filter_with_counts`: removed `--var_gene_names`, `--mitochondrial_gene_regex`, `--var_name_mitochondrial_genes`, `--min_fraction_mito` and `--max_fraction_mito` (PR #585). + +* `workflows/prot_singlesample`: removed `--min_fraction_mito` and `--max_fraction_mito` because regex-based detection detection of mitochondrial genes is not possible (PR #585). + +* The fraction of counts that originated from mitochondrial genes used to be written to an .obs column with a name that was derived from `pct_` suffixed by the name of the mitochondrial gene column. The `--obs_name_mitochondrial_fraction` argument is introduced to change the destination column and the default prefix has changed from `pct_` to `fraction_` (PR #585). + +## NEW FUNCTIONALITY + +* `workflows/qc`: A pipeline to add basic qc statistics to a MuData object (PR #585). + +* `workflows/rna_singlesample`: added `--obs_name_mitochondrial_fraction` and make sure that the values from `--max_fraction_mito` and `--min_fraction_mito` are bound between 0 and 1 (PR #585). + +* Added `filter/delimit_fraction`: Turns an annotation column containing values between 0 and 1 into a boolean column based on thresholds (PR #585). + +* Added `metadata/grep_annotation_column`: Perform a regex lookup on a column from the annotation matrices .obs or .var (PR #585). + +* `workflows/full_pipelines`: added `--obs_name_mitochondrial_fraction` argument (PR #585). + +* `workflows/prot_multisample`: added `--var_qc_metrics` and `--top_n_vars` arguments (PR #585). + +* Added genetic demultiplexing methods `cellsnp`, `demuxlet`, `freebayes`, `freemuxlet`, `scsplit`, `sourorcell` and `vireo` (PR #343). + +## MINOR CHANGES + +* Several components: bump scanpy to 1.9.5 (PR #594). + +* Refactored `prot_multisample` and `prot_singlesample` pipelines to use `fromState` and `toState` functionality (PR #585). + +# openpipelines 0.11.0 + +## BREAKING CHANGES + +* Nextflow VDSL3: set `simplifyOutput` to `False` by default. This implies that components and workflows will output a hashmap with a sole "output" entry when there is only one output (PR #563). + +* `integrate/scvi`: rename `model_output` argument to `output_model` in order to align with the `scvi_leiden` workflow. This also fixes a bug with the workflow where the argument did not function (PR #562). + +## MINOR CHANGES + +* `dataflow/concat`: reduce memory consumption when using `--other_axis_mode move` by processing only one annotation matrix (`.var`, `.obs`) at a time (PR #569). + +* Update viashpy and pin it to `0.5.0` (PR #572 and PR #577). + +* `convert/from_h5ad_to_h5mu`, `convert/from_h5mu_to_h5ad`, `dimred/pca`, `dimred/umap/`, +`filter/filter_with_counts`, `filter/filter_with_hvg`, `filter/remove_modality`, `filter/subset_h5mu`, +`integrate/scanorama`, `transform/delete_layer` and `transform/log1p`: update python to `3.9` (PR #572). + +* `integrate/scarches`: update base image, `scvi-tools` and `pandas` to `nvcr.io/nvidia/pytorch:23.09-py3`, `~=1.0.3` and `~=2.1.0` respectively (PR #572). + +* `integrate/totalvi`: update python to 3.9 and scvi-tools to `~=1.0.3` (PR #572). + +* `correction/cellbender_remove_background`: change base image to `nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04` and downwgrade MuData to 0.2.1 because it is the oldest version that uses python 3.7 (PR #575). + +* Several integration workflows: prevent leiden from being executed when no resolutions are provided (PR #583). + +* `dataflow/concat`: bump pandas to ~=2.1.1 and reduce memory consumption by only reading one modality into memory at a time (PR #568). + +* `annotate/popv`: bump `jax` and `jaxlib` to `0.4.10`, scanpy to `1.9.4`, scvi to `1.0.3` and pin `ml-dtypes` to < 0.3.0 (PR #565). + +* `velocity/scvelo`: pin matplotlib to < 3.8.0 (PR #566). + +* `mapping/multi_star`: pin multiqc to 1.15.0 (PR #566). + +* `mapping/bd_rhapsody`: pin pandas version to <2 (PR #563). + +* `query/cellxgene_census`: replaced label `singlecpu` with label `midcpu`. + +* `query/cellxgene_census`: avoid creating MuData object in memory by writing the modality directly to disk (PR #558). + +* `integrate/scvi`: use `midcpu` label instead of `singlecpu` (PR #561). + +## BUG FIXES + +* `transform/clr`: raise an error when CLR fails to return the requested output (PR #579). + +* `correction/cellbender_remove_background`: fix missing helper functionality when using Fusion (PR #575). + +* `convert/from_bdrhap_to_h5mu`: Avoid `TypeError: Can't implicitly convert non-string objects to strings` by using categorical dtypes when a string column contains NA values (PR #563). + +* `qc/calculate_qc_metrics`: fix calculating mitochondrial gene related QC metrics when only or no mitochondrial genes were found (PR #564). + +# openpipelines 0.10.1 + +## MINOR CHANGES + +* `integration/scvi_leiden`: Expose hvg selection argument `--var_input` (#543, PR #547). + +## BUG FIXES + +* `integration/bbknn_leiden`: Set leiden clustering parameter to multiple (#542, PR #545). + +* `integration/scvi_leiden`: Fix component name in Viash config (PR #547). + +* `integration/harmony_leiden`: Pass `--uns_neighbors` argument `umap` (PR #548). + +* Add workaround for bug where resources aren't available when using Nextflow fusion by including `setup_logger`, `subset_vars` and `compress_h5mu` in the script itself (PR #549). + +# openpipelines 0.10.0 + + +## BREAKING CHANGES + +* `workflows/full_pipeline`: removed `--prot_min_fraction_mito` and `--prot_max_fraction_mito` (PR #451) + +* `workflows/rna_multisample` and `workflows/prot_multisample`: Removed concatenation from these pipelines. The input for these pipelines is now a single mudata file that contains data for multiple samples. If you wish to use this pipeline on multiple single-sample mudata files, you can use the `dataflow/concat` components on them first. This also implies that the ability to add ids to multiple single-sample mudata files prior to concatenation is no longer required, hence the removal of `--add_id_to_obs`, `--sample_id`, `--add_id_obs_output`, and `--add_id_make_observation_keys_unique` (PR #475). + +* The `scvi` pipeline was renamed to `scvi_leiden` because `leiden` clustering was added to the pipeline (PR #499). + +* Upgrade `correction/cellbender_remove_background` from CellBender v0.2 to CellBender v0.3.0 (PR #523). + Between these versions, several arguments related to the slots of the output file have been changed. + +## MAJOR CHANGES + +* Several components: update anndata to 0.9.3 and mudata to 0.2.3 (PR #423). + +* Base resources assigned for a process without any labels is now 1 CPU and 2GB (PR #518). + +* Updated to Viash 0.7.5 (PR #513). + +* Removed deprecated `variant: vdsl3` tags (PR #513). + +* Removed unused `version: dev` (PR #513). + +* `multiomics/integration/harmony_leiden`: Refactored data flow (PR #513). + +* `ingestion/bd_rhapsody`: Refactored data flow (PR #513). + +* `query/cellxgene_census`: increased returned metadata content, revised query option, added filtering strategy and refactored functionality (PR #520). + +* Refactor loggers using `setup_logger()` helper function (PR #534). + +* Refactor unittest tests to pytest tests (PR #534). + +## MINOR CHANGES + +* Add resource labels to several components (PR #518). + +* `full_pipeline`: default value for `--var_qc_metrics` is now the combined values specified for `--mitochondrial_gene_regex` and `--filter_with_hvg_var_output`. + +* `dataflow/concat`: reduce memory consumption by only reading one modality at the same time (PR #474). + +* Components that use CellRanger, BCL Convert or bcl2fastq: updated from Ubuntu 20.04 to Ubuntu 22.04 (PR #494). + +* Components that use CellRanger: updated Picard to 2.27.5 (PR #494). + +* `interprete/liana`: Update lianapy to 0.1.9 (PR #497). + +* `qc/multiqc`: add unittests (PR #502). + +* `reference/build_cellranger_reference`: add unit tests (PR #506). + +* `reference/build_bd_rhapsody_reference`: add unittests (PR #504). + +## NEW FUNCTIONALITY + +* Added `compression/compress_h5mu` component (PR #530). + +* Resource management: when a process exits with a status code between 137 and 140, retry the process with increased memory requirements. Memory scales by multiplying the base memory assigned to the process with the attempt number (PR #518 and PR #527). + +* `integrate/scvi`: Add `--n_hidden_nodes`, `--n_dimensions_latent_space`, `--n_hidden_layers`, `--dropout_rate`, `--dispersion`, `--gene_likelihood`, `--use_layer_normalization`, `--use_batch_normalization`, `--encode_covariates`, `--deeply_inject_covariates` and `--use_observed_lib_size` parameters. + +* `filter/filter_with_counts`: add `--var_name_mitochondrial_genes` argument to store a boolean array corresponding the detected mitochondrial genes. + +* `full_pipeline` and `rna_singlesample` pipelines: add `--var_name_mitochondrial_genes`, `--var_gene_names` and `--mitochondrial_gene_regex` arguments to specify mitochondrial gene detection behaviour. + +* `integrate/scvi`: Add `--obs_labels`, `--obs_size_factor`, `--obs_categorical_covariate` and `--obs_continuous_covariate` arguments (PR #496). + +* Added `var_qc_metrics_fill_na_value` argument to `calculate_qc_metrics` (PR #477). + +* Added `multiomics/multisample` pipeline to run multisample processing followed by the integration setup. It is considered an entrypoint into the full pipeline which skips the single-sample processing. The idea is to allow a a re-run of these steps after a sample has already been processed by the `full_pipeline`. Keep in mind that samples that are provided as input to this pipeline are processed separately and are not concatenated. Hence, the input should be a concatenated sample (PR #475). + +* Added `multiomics/integration/bbknn_leiden` workflow. (PR #456). + +* `workflows/prot_multisample` and `workflows/full_pipelines`: add basic QC statistics to prot modality (PR #485). + +* `mapping/cellranger_multi`: Add tests for the mapping of Crispr Guide Capture data (PR #494). + +* `convert/from_cellranger_multi_to_h5mu`: add `perturbation_efficiencies_by_feature` and `perturbation_efficiencies_by_feature` information to .uns slot of `gdo` modality (PR #494). + +* `convert/from_cellranger_multi_to_h5mu`: add `feature_reference` information to the MuData object. Information is split between the modalities. For example `CRISPR Guide Capture` information if added to the `.uns` slot of the `gdo` modality, while `Antibody Capture` information is added to the .uns slot of `prot` (PR #494). + +* Added `multiomics/integration/totalvi_leiden` pipeline (PR #500). + +* Added totalVI component (PR #386). + +* `workflows/full_pipeline`: Add `pca_overwrite` argument (PR #511). + +* Add `main_build_viash_hub` action to build, tag, and push components and docker images for viash-hub.com (PR #480). + +* `integration/bbknn_leiden`: Update state management to `fromState` / `toState` (PR #538). + +* `mapping/cellranger_multi`: Add optional helper input: allow for passing modality specific inputs, from which library type and library id are inferred (PR #693). + +## DOCUMENTATION + +* `images`: Added images for various concepts, such as a sample, a cell, RNA, ADT, ATAC, VDJ (PR #515). + +* `multiomics/rna_singlesample`: Add image for workflow (PR #515). + +* `multiomics/rna_multisample`: Add image for workflow (PR #515). + +* `multiomics/prot_singlesample`: Add image for workflow (PR #515). + +* `multiomics/prot_multisample`: Add image for workflow (PR #515). + +## BUG FIXES + +* Fix an issue with `workflows/multiomics/scanorama_leiden` where the `--output` argument doesn't work as expected (PR #509). + +* Fix an issue with `workflows/full_pipeline` not correctly caching previous runs (PR #460). + +* Fix incorrect namespaces of the integration pipelines (PR #464). + +* Fix an issue in several workflows where the `--output` argument would not work (PR #476). + +* `integration/harmony_leiden` and `integration/scanorama_leiden`: Fix an issue where the prefix of the columns that store the leiden clusters was hardcoded to `leiden`, instead of adapting to the value for `--obs_cluster` (PR #482). + +* `velocity/velocyto`: Resolve symbolic link before checking whether the transcriptome is a gzip (PR #484). + +* `workflows/integration/scanorama_leiden`: fix an issue where `--obsm_input`, --obs_batch`, `--batch_size`, `--sigma`, `--approx`, `--alpha` and `-knn` were not working beacuse they were not passed through to the scanorama component (PR #487). + +* `workflows/integration/scanorama_leiden`: fix leiden being calculated on the wrong embedding because the `--obsm_input` argument was not correctly set to the output embedding of scanorama (PR #487). + +* `mapping/cellranger_multi`: Fix and issue where modalities did not have the proper name (PR #494). + +* `metadata/add_uns_to_obs`: Fix `KeyError: 'ouput_compression'` error (PR #501). + +* `neighbors/bbknn`: Fix `--input` not being a required argument (PR #518). + +* Create `correction/cellbender_remove_background_v0.2` for legacy CellBender v0.2 format (PR #523). + +* `integrate/scvi`: Ensure output has the same dimensionality as the input (PR #524). + +* `mapping/bd_rhapsody`: Fix `--dryrun` argument not working (PR #534). + +* `qc/multiqc`: Fix component not working for multiple inputs (PR #537). Also converted Bash script to Python scripts. + +* `neighbors/bbknn`: Fix `--uns_output`, `--obsp_distances` and `--obsp_connectivities` not being processed correctly (PR #538). + +# openpipelines 0.9.0 + +## BREAKING CHANGES + +Running the integration in the `full_pipeline` deemed to be impractical because a plethora of integration methods exist, which in turn interact with other functionality (like clustering). This generates a large number of possible usecases which one pipeline cannot cover in an easy manner. Instead, each integration methods will be split into its separate pipeline, and the `full_pipeline` will prepare for integration by performing steps that are required by many integration methods. Therefore, the following changes were performed: + + * `workflows/full_pipeline`: `harmony` integration and `leiden` clustering are removed from the pipeline. + + * Added `initialize_integration` to run calculations that output information commonly required by the integration methods. This pipeline runs PCA, nearest neighbours and UMAP. This pipeline is run as a subpipeline at the end of `full_pipeline`. + + * Added `leiden_harmony` integration pipeline: run harmony integration followed by neighbour calculations and leiden clustering. Also runs umap on the result. + + * Removed the `integration` pipeline. + +The old behavior of the `full_pipeline` can be obtained by running `full_pipeline` followed by the `leiden_harmony` pipeline. + +* The `crispr` and `hashing` modalities have been renamed to `gdo` and `hto` respectively (PR #392). + +* Updated Viash to 0.7.4 (PR #390). + +* `cluster/leiden`: Output is now stored into `.obsm` instead of `.obs` (PR #431). + +## NEW FUNCTIONALITY + +* `cluster/leiden` and `integration/harmony_leiden`: allow running leiden multiple times with multiple resolutions (PR #431). + +* `workflows/full_pipeline`: PCA, nearest neighbours and UMAP are now calculated for the `prot` modality (PR #396). + +* `transform/clr`: added `output_layer` argument (PR #396). + +* `workflows/integration/scvi`: Run scvi integration followed by neighbour calculations and run umap on the result (PR #396). + +* `mapping/cellranger_multi` and `workflows/ingestion/cellranger_multi`: Added `--vdj_inner_enrichment_primers` argument (PR #417). + +* `metadata/move_obsm_to_obs`: Move a matrix from an `.obsm` slot into `.obs` (PR #431). + +* `integrate/scvi` validity checks for non-normalized input, obs and vars in order to proceed to training (PR #429). + +* `schemas`: Added schema files for authors (PR #436). + +* `schemas`: Added schema file for Viash configs (PR #436). + +* `schemas`: Refactor author import paths (PR #436). + +* `schemas`: Added schema file for file format specification files (PR #437). + +* `query/cellxgene_census`: Query Cellxgene census component and save the results to a MuData file. (PR #433). + +## MAJOR CHANGES + +* `report/mermaid`: Now used `mermaid-cli` to generate images instead of creating a request to `mermaid.ink`. New `--output_format`, `--width`, `--height` and `--background_color` arguments were added (PR #419). + +* All components that used `python` as base container: use `slim` version to reduce container image size (PR #427). + +## MINOR CHANGES + +* `integrate/scvi`: update scvi to 1.0.0 (PR #448) + +* `mapping/multi_star`: Added `--min_success_rate` which causes component to fail when the success rate of processed samples were successful (PR #408). + +* `correction/cellbender_remove_background` and `transform/clr`: update muon to 0.1.5 (PR #428) + +* `ingestion/cellranger_postprocessing`: split integration tests into several workflows (PR #425). + +* `schemas`: Add schema file for author yamls (PR #436). + +* `mapping/multi_star`, `mapping/star_build_reference` and `mapping/star_align`: update STAR from 2.7.10a to 2.7.10b (PR #441). + +## BUG FIXES + +* `annotate/popv`: Fix concat issue when the input data has multiple layers (#395, PR #397). + +* `annotate/popv`: Fix indexing issue when MuData object contain non overlapping modalities (PR #405). + +* `mapping/multi_star`: Fix issue where temp dir could not be created when group_id contains slashes (PR #406). + +* `mapping/multi_star_to_h5mu`: Use glob to look for count files recursively (PR #408). + +* `annotate/popv`: Pin `PopV`, `jax` and `jaxlib` versions (PR #415). + +* `integrate/scvi`: the max_epochs is no longer required since it has a default value (PR #396). + +* `workflows/full_pipeline`: fix `make_observation_keys_unique` parameter not being correctly passed to the `add_id` component, causing `ValueError: Observations are not unique across samples` during execution of the `concat` component (PR #422). + +* `annotate/popv`: now sets `aprox` to `False` to avoid using `annoy` in scanorama because it fails on processors that are missing the AVX-512 instruction sets, causing `Illegal instruction (core dumped)`. + +* `workflows/full_pipeline`: Avoid adding sample names to observation ids twice (PR #457). + +# openpipelines 0.8.0 + +## BREAKING CHANGES + +* `workflows/full_pipeline`: Renamed inconsistencies in argument naming (#372): + - `rna_min_vars_per_cell` was renamed to `rna_min_genes_per_cell` + - `rna_max_vars_per_cell` was renamed to `rna_max_genes_per_cell` + - `prot_min_vars_per_cell` was renamed to `prot_min_proteins_per_cell` + - `prot_max_vars_per_cell` was renamed to `prot_max_proteins_per_cell` + +* `velocity/scvelo`: bump anndata from <0.8 to 0.9. + +## NEW FUNCTIONALITY + +* Added an extra label `veryhighmem` mostly for `cellranger_multi` with a large number of samples. + +* Added `multiomics/prot_multisample` pipeline. + +* Added `clr` functionality to `prot_multisample` pipeline. + +* Added `interpret/lianapy`: Enables the use of any combination of ligand-receptor methods and resources, and their consensus. + +* `filter/filter_with_scrublet`: Add `--allow_automatic_threshold_detection_fail`: when scrublet fails to detect doublets, the component will now put `NA` in the output columns. + +* `workflows/full_pipeline`: Allow not setting the sample ID to the .obs column of the MuData object. + +* `workflows/rna_multisample`: Add the ID of the sample to the .obs column of the MuData object. + +* `correction/cellbender_remove_background`: add `obsm_latent_gene_encoding` parameter to store the latent gene representation. + +## BUG FIXES + +* `transform/clr`: fix anndata object instead of matrix being stored as a layer in output `MuData`, resulting in `NoneTypeError` object after reading the `.layers` back in. + +* `dataflow/concat` and `dataflow/merge`: fixed a bug where boolean values were cast to their string representation. + +* `workflows/full_pipeline`: fix running pipeline with `-stub`. + +* Fixed an issue where passing a remote file URI (for example `http://` or `s3://`) as `param_list` caused `No such file` errors. + +* `workflows/full_pipeline`: Fix incorrectly named filtering arguments (#372). + +* `integrate/scvi`: Fix bug when subsetting using the `var_input` argument (PR #385). +* +* `correction/cellbender_remove_background`: add `obsm_latent_gene_encoding` parameter to store the latent gene representation. + +## MINOR CHANGES + +* `integrate/scarches`, `integrate/scvi` and `correction/cellbender_remove_background`: Update base container to `nvcr.io/nvidia/pytorch:22.12-py3` + +* `integrate/scvi`: add `gpu` label for nextflow platform. + +* `integrate/scvi`: use cuda enabled `jax` install. + +* `convert/from_cellranger_multi_to_h5mu`, `dataflow/concat` and `dataflow/merge`: update pandas to 2.0.0 + +* `dataflow/concat` and `dataflow/merge`: Boolean and integer columns are now represented by the `BooleanArray` and `IntegerArray` dtypes in order to allow storing `NA` values. + +* `interpret/lianapy`: use the latest development release (commit 11156ddd0139a49dfebdd08ac230f0ebf008b7f8) of lianapy in order to fix compatibility with numpy 1.24.x. + +* `filter/filter_with_hvg`: Add error when specified input layer cannot be found in input data. + +* `workflows/multiomics/full_pipeline`: publish the output from sample merging to allow running different integrations. + +* CI: Remove various unused software libraries from runner image in order to avoid `no space left on device` (PR #425, PR #447). + +# openpipelines 0.7.1 + +## NEW FUNCTIONALITY + +* `integrate/scvi`: use `nvcr.io/nvidia/pytorch:22.09-py3` as base container to enable GPU acceleration. + +* `integrate/scvi`: add `--model_output` to save model. + +* `workflows/ingestion/cellranger_mapping`: Added `output_type` to output the filtered Cell Ranger data as h5mu, not the converted raw 10xh5 output. + +* Several components: added `--output_compression` component to set the compression of output .h5mu files. + +* `workflows/full_pipeline` and `workflows/integration`: Added `leiden_resolution` argument to control the coarseness of the clustering. + +* Added `--rna_theta` and `--rna_harmony_theta` to full and integration pipeline respectively in order to tune the diversity clustering penalty parameter for harmony integration. + +* `dimred/pca`: fix `variance` slot containing a second copy of the variance ratio matrix and not the variances. + +## BUG FIXES + +* `mapping/cellranger_multi`: Fix an issue where using a directory as value for `--input` would cause `AttributeError`. + +* `workflows/integration`: `init_pos` is no longer set to the integration layer (e.g. `X_pca_integrated`). + +## MINOR CHANGES + +* `integration` and `full` workflows: do not run harmony integration when `obs_covariates` is not provided. + +* Add `highmem` label to `dimred/pca` component. + +* Remove disabled `convert/from_csv_to_h5mu` component. + +* Update to Viash 0.7.1. + +* Several components: update to scanpy 1.9.2 + +* `process_10xh5/filter_10xh5`: speed up build by using `eddelbuettel/r2u:22.04` base container. + +## MAJOR CHANGES + +* `dataflow/concat`: Renamed `--compression` to `--output_compression`. + +# openpipelines 0.7.0 + +## MAJOR CHANGES + +* Removed `bin` folder. As of viash 0.6.4, a `_viash.yaml` file can be included in the root of a repository to set common viash options for the project. +These options were previously covered in the `bin/init` script, but this new feature of viash makes its use unnecessary. The `viash` and `nextlow` should now be installed in a directory that is included in your `$PATH`. + +## MINOR CHANGES + +* `filter/do_filter`: raise an error instead of printing a warning when providing a column for `var_filer` or `obs_filter` that doesn't exist. + +## BUG FIXES + +* `workflows/full_pipeline`: Fix setting .var output column for filter_with_hvg. + +* Fix running `mapping/cellranger_multi` without passing all references. + +* `filter/filter_with_scrublet`: now sets `use_approx_neighbors` to `False` to avoid using `annoy` because it fails on processors that are missing the AVX-512 instruction sets. + +* `workflows`: Updated `WorkflowHelper` to newer version that allows applying defaults when calling a subworkflow from another workflow. + +* Several components: pin matplotlib to <3.7 to fix scanpy compatibility (see https://github.com/scverse/scanpy/issues/2411). + +* `workflows`: fix a bug when running a subworkflow from a workflow would cause the parent config to be read instead of the subworklow config. + +* `correction/cellbender_remove_background`: Fix description of input for cellbender_remove_background. + +* `filter/do_filter`: resolved an issue where the .obs column instead of the .var column was being logged when filtering using the .var column. + +* `workflows/rna_singlesample` and `workflows/prot_singlesample`: Correctly set var and obs columns while filtering with counts. + +* `filter/do_filter`: removed the default input value for `var_filter` argument. + +* `workflows/full_pipeline` and `workflows/integration`: fix PCA not using highly variable genes filter. + +# openpipelines 0.6.2 + +## NEW FUNCTIONALITY + +* `workflows/full_pipeline`: added `filter_with_hvg_obs_batch_key` argument for batched detection of highly variable genes. + +* `workflows/rna_multisample`: added `filter_with_hvg_obs_batch_key`, `filter_with_hvg_flavor` and `filter_with_hvg_n_top_genes` arguments. + +* `qc/calculate_qc_metrics`: Add basic statistics: `pct_dropout`, `num_zero_obs`, `obs_mean` and `total_counts` are added to .var. `num_nonzero_vars`, `pct_{var_qc_metrics}`, `total_counts_{var_qc_metrics}`, `pct_of_counts_in_top_{top_n_vars}_vars` and `total_counts` are included in .obs + +* `workflows/multiomics/rna_multisample` and `workflows/multiomics/full_pipeline`: add `qc/calculate_qc_metrics` component to workflow. + +* `workflows/multiomics/prot_singlesample`: Processing unimodal single-sample CITE-seq data. + +* `workflows/multiomics/rna_singlesample` and `workflows/multiomics/full_pipeline`: Add filtering arguments to pipeline. + +## MINOR CHANGES + +* `convert/from_bdrhap_to_h5mu`: bump R version to 4.2. + +* `process_10xh5/filter_10xh5`: bump R version to 4.2. + +* `dataflow/concat`: include path of file in error message when reading a mudata file fails. + +* `mapping/cellranger_multi`: write cellranger console output to a `cellranger_multi.log` file. + +## BUG FIXES + +* `mapping/htseq_count_to_h5mu`: Fix a bug where reading in the gtf file caused `AttributeError`. + +* `dataflow/concat`: the `--input_id` is no longer required when `--mode` is not `move`. + +* `filter/filter_with_hvg`: does no longer try to use `--varm_name` to set non-existant metadata when running with `--flavor seurat_v3`, which was causing `KeyError`. + +* `filter/filter_with_hvg`: Enforce that `n_top_genes` is set when `flavor` is set to 'seurat_v3'. + +* `filter/filter_with_hvg`: Improve error message when trying to use 'cell_ranger' as `flavor` and passing unfiltered data. + +* `mapping/cellranger_multi` now applies `gex_chemistry`, `gex_secondary_analysis`, `gex_generate_bam`, `gex_include_introns` and `gex_expect_cells`. + +# openpipeline 0.6.1 + +## NEW FUNCTIONALITY + +* `mapping/multi_star`: A parallellized version of running STAR (and HTSeq). + +* `mapping/multi_star_to_h5mu`: Convert the output of `multi_star` to a h5mu file. + +## BUG FIXES + +* `filter/filter_with_counts`: Fix an issue where mitochrondrial genes were being detected in .var_names, which contain ENSAMBL IDs instead of gene symbols in the pipelines. Solution was to create a `--var_gene_names` argument which allows selecting a .var column to check using a regex (`--mitochondrial_gene_regex`). + +* `dataflow/concat`, `report/mermaid`, `transform/clr`: Don't forget to exit with code returned by pytest. +# openpipeline 0.6.0 + +## NEW FUNCTIONALITY + +* `workflows/full_pipeline`: add `filter_with_hvg_var_output` argument. + +* `dimred/pca`: Add `--overwrite` and `--var_input` arguments. + +* `tranform/clr`: Perform CLR normalization on CITE-seq data. + +* `workflows/ingestion/cellranger_multi`: Run Cell Ranger multi and convert the output to .h5mu. + +* `filter/remove_modality`: Remove a single modality from a MuData file. + +* `mapping/star_align`: Align `.fastq` files using STAR. + +* `mapping/star_align_v273a`: Align `.fastq` files using STAR v2.7.3a. + +* `mapping/star_build_reference`: Create a STAR reference index. + +* `mapping/cellranger_multi`: Align fastq files using Cell Ranger multi. + +* `mapping/samtools_sort`: Sort and (optionally) index alignments. + +* `mapping/htseq_count`: Quantify gene expression for subsequent testing for differential expression. + +* `mapping/htseq_count_to_h5mu`: Convert one or more HTSeq outputs to a MuData file. + +* Added from `convert/from_cellranger_multi_to_h5mu` component. + +## MAJOR CHANGES + +* `convert/from_velocyto_to_h5mu`: Moved to `velocity/velocyto_to_h5mu`. + It also now accepts an optional `--input_h5mu` argument, to allow directly reading + the RNA velocity data into a `.h5mu` file containing the other modalities. + +* `resources_test/cellranger_tiny_fastq`: Include RNA velocity computations as part of + the script. + +* `mapping/cellranger_mkfastq`: remove --memory and --cpu arguments as (resource management is automatically provided by viash). + +## MINOR CHANGES + +* Several components: use `gzip` compression for writing .h5mu files. + +* Default value for `obs_covariates` argument of full pipeline is now `sample_id`. + +* Set the `tag` directive of all Nextflow components to '$id'. + +## BUG FIXES + +* Keep data for modalities that are not specifically enabled when running full pipeline. + +* Fix many components thanks to Viash 0.6.4, which causes errors to be + thrown when input and output files are defined but not found. + + +# openpipeline 0.5.1 + +## BREAKING CHANGES + +* `reference/make_reference`: Input files changed from `type: string` to `type: file` to allow Nextflow to cache the input files fetched from URL. + +* several components (except `from_h5ad_to_h5mu`): the `--modality` arguments no longer accept multiple values. + +* Remove outdated `resources_test_scripts`. + +* `convert/from_h5mu_to_seurat`: Disabled because MuDataSeurat is currently broken, see [https://github.com/PMBio/MuDataSeurat/issues/9](PMBio/MuDataSeurat#9). + +* `integrate/harmony`: Disabled because it is currently not functioning and the alternative, harmonypy, is used in the workflows. + +* `dataflow/concat`: Renamed --sample_names to --input_id and moved the ability to add sample id and to join the sample ids with the observation names to `metadata/add_id` + +* Moved `dataflow/concat`, `dataflow/merge` and `dataflow/split_modalities` to a new namespace: `dataflow`. + +* Moved `workflows/conversion/conversion` to `workflows/ingestion/conversion` + +## NEW FUNCTIONALITY + +* `metadata/add_id`: Add an id to a column in .obs. Also allows joining the id to the .obs_names. + +* `workflows/ingestion/make_reference`: A generic component to build a transcriptomics reference into one of many formats. + +* `integrate/scvi`: Performs scvi integration. + +* `integrate/add_metadata`: Add a csv containing metadata to the .obs or .var field of a mudata file. + +* `DataflowHelper.nf`: Added `passthroughMap`. Usage: + + ```groovy + include { passthroughMap as pmap } from "./DataflowHelper.nf" + + workflow { + Channel.fromList([["id", [input: "foo"], "passthrough"]]) + | pmap{ id, data -> + [id, data + [arg: 10]] + } + } + ``` + Note that in the example above, using a regular `map` would result in an exception being thrown, + that is, "Invalid method invocation `call` with arguments". + + A synonymous of doing this with a regular `map()` would be: + ```groovy + workflow { + Channel.fromList([["id", [input: "foo"], "passthrough"]]) + | map{ tup -> + def (id, data) = tup + [id, data + [arg: 10]] + tup.drop(2) + } + } + ``` + +* `correction/cellbender_remove_background`: Eliminating technical artifacts from high-throughput single-cell RNA sequencing data. + +* `workflows/ingestion/cellranger_postprocessing`: Add post-processing of h5mu files created from Cell Ranger data. + +* `annotate/popv`: Performs popular major vote cell typing on single cell sequence data. + +## MAJOR CHANGES + +* `workflows/utils/DataflowHelper.nf`: Added helper functions `setWorkflowArguments()` and `getWorkflowArguments()` to split the data field of a channel event into a hashmap. Example usage: + ```groovy + | setWorkflowArguments( + pca: [ "input": "input", "obsm_output": "obsm_pca" ] + integration: [ "obs_covariates": "obs_covariates", "obsm_input": "obsm_pca" ] + ) + | getWorkflowArguments("pca") + | pca + | getWorkflowArguments("integration") + | integration + ``` + +* `mapping/cellranger_count`: Allow passing both directories as well as individual fastq.gz files as inputs. + +* `convert/from_10xh5_to_h5mu`: Allow reading in QC metrics, use gene ids as `.obs_names` instead of gene symbols. + +* `workflows/conversion`: Update pipeline to use the latest practices and to get it to a working state. + +## MINOR CHANGES + +* `dimred/umap`: Streamline UMAP parameters by adding `--obsm_output` parameter to allow choosing the output `.obsm` slot. + +* `workflows/multiomics/integration`: Added arguments for tuning the various output slots of the integration pipeline, namely `--obsm_pca`, `--obsm_integrated`, `--uns_neighbors`, `--obsp_neighbor_distances`, `--obsp_neighbor_connectivities`, `--obs_cluster`, `--obsm_umap`. + +* Switch to Viash 0.6.1. + +* `filter/subset_h5mu`: Add `--modality` argument, export to VDSL3, add unit test. + +* `dataflow/split_modalities`: Also output modality types in a separate csv. + +## BUG FIXES + +* `convert/from_bd_to_10x_molecular_barcode_tags`: Replaced UTF8 characters with ASCII. OpenJDK 17 or lower might throw the following exception when trying to read a UTF8 file: `java.nio.charset.MalformedInputException: Input length = 1`. + +* `dataflow/concat`: Overriding sample name in .obs no longer raises `AttributeError`. + +* `dataflow/concat`: Fix false positives when checking for conflicts in .obs and .var when using `--mode move`. + +# openpipeline 0.5.0 + +Major redesign of the integration and multiomic workflows. Current list of workflows: + +* `ingestion/bd_rhapsody`: A generic pipeline for running BD Rhapsody WTA or Targeted mapping, with support for AbSeq, VDJ and/or SMK. + +* `ingestion/cellranger_mapping`: A pipeline for running Cell Ranger mapping. + +* `ingestion/demux`: A generic pipeline for running bcl2fastq, bcl-convert or Cell Ranger mkfastq. + +* `multiomics/rna_singlesample`: Processing unimodal single-sample RNA transcriptomics data. + +* `multiomics/rna_multisample`: Processing unimodal multi-sample RNA transcriptomics data. + +* `multiomics/integration`: A pipeline for demultiplexing multimodal multi-sample RNA transcriptomics data. + +* `multiomics/full_pipeline`: A pipeline to analyse multiple multiomics samples. + +## BREAKING CHANGES + +* Many components: Renamed `.var["gene_ids"]` and `.var["feature_types"]` to `.var["gene_id"]` and `.var["feature_type"]`. + +## DEPRECATED + +* `convert/from_10xh5_to_h5ad` and `convert/from_bdrhap_to_h5ad`: Removed h5ad based components. + +* `mapping/bd_rhapsody_wta` and `workflows/ingestion/bd_rhapsody_wta`: Deprecated in favour for more generic `mapping/bd_rhapsody` and `workflows/ingestion/bd_rhapsody` pipelines. + +* `convert/from_csv_to_h5mu`: Disable until it is needed again. + +* `dataflow/concat`: Deprecated `"concat"` option for `--other_axis_mode`. + +## NEW COMPONENTS + +* `graph/bbknn`: Batch balanced KNN. + +* `transform/scaling`: Scale data to unit variance and zero mean. + +* `mapping/bd_rhapsody`: Added generic component for running the BD Rhapsody WTA or Targeted analysis, with support for AbSeq, VDJ and/or SMK. + +* `integrate/harmony` and `integrate/harmonypy`: Run a Harmony integration analysis (R-based and Python-based, respectively). + +* `integrate/scanorama`: Use Scanorama to integrate different experiments. + +* `reference/make_reference`: Download a transcriptomics reference and preprocess it (adding ERCC spikeins and filtering with a regex). + +* `reference/build_bdrhap_reference`: Compile a reference into a STAR index in the format expected by BD Rhapsody. + +## NEW WORKFLOWS + +* `workflows/ingestion/bd_rhapsody`: Added generic workflow for running the BD Rhapsody WTA or Targeted analysis, with support for AbSeq, VDJ and/or SMK. + +* `workflows/multiomics/full_pipeline`: Implement pipeline for processing multiple multiomics samples. + +## NEW FUNCTIONALITY + +* `convert/from_bdrhap_to_h5mu`: Added support for being able to deal with WTA, Targeted, SMK, AbSeq and VDJ data. + +* `dataflow/concat`: Added `"move"` option to `--other_axis_mode`, which allows merging `.obs` and `.var` by only keeping elements of the matrices which are the same in each of the samples, moving the conflicting values to `.varm` or `.obsm`. + +## MAJOR CHANGES + +* Multiple components: Update to anndata 0.8 with mudata 0.2.0. This means that the format of the `.h5mu` files have changed. + +* `multiomics/rna_singlesample`: Move transformation counts into layers instead of overwriting `.X`. + +* Updated to Viash 0.6.0. + +## MINOR CHANGES + +* `velocity/velocyto`: Allow configuring memory and parallellisation. + +* `cluster/leiden`: Add `--obsp_connectivities` parameter to allow choosing the output slot. + +* `workflows/multiomics/rna_singlesample`, `workflows/multiomics/rna_multisample` and `workflows/multiomics/integration`: Allow choosing the output paths. + +* `neighbors/bbknn` and `neighbors/find_neighbors`: Add parameters for choosing the input/output slots. + +* `dimred/pca` and `dimred/umap`: Add parameters for choosing the input/output slots. + +* `dataflow/concat`: Optimize concat performance by adding multiprocessing and refactoring functions. + +* `workflows/multimodal_integration`: Add `obs_covariates` argument to pipeline. + +## BUG FIXES + +* Several components: Revert using slim versions of containers because they do not provide the tools to run nextflow with trace capabilities. + +* `dataflow/concat`: Fix an issue where joining boolean values caused `TypeError`. + +* `workflows/multiomics/rna_multisample`, `workflows/multiomics/rna_singlesample` and `workflows/multiomics/integration`: Use nextflow trace reporting when running integration tests. + + +# openpipeline 0.4.1 + +## BUG FIXES + +* `workflows/ingestion/bd_rhapsody_wta`: use ':' as a seperator for multiple input files and fix integration tests. + +## MINOR CHANGES + +* Several components: pin mudata and scanpy dependencies so that anndata version <0.8.0 is used. + +# openpipeline 0.4.0 + +## NEW FUNCTIONALITY + +* `convert/from_bdrhap_to_h5mu`: Merge one or more BD rhapsody outputs into an h5mu file. + +* `dataflow/split_modalities`: Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. + +* `dataflow/concat`: Combine data from multiple samples together. + +## MINOR CHANGES + +* `mapping/bd_rhapsody_wta`: Update to BD Rhapsody 1.10.1. + +* `mapping/bd_rhapsody_wta`: Add parameters for overriding the minimum RAM & cores. Add `--dryrun` parameter. + +* Switch to Viash 0.5.14. + +* `convert/from_bdrhap_to_h5mu`: Update to BD Rhapsody 1.10.1. + +* `resources_test/bdrhap_5kjrt`: Add subsampled BD rhapsody datasets to test pipeline with. + +* `resources_test/bdrhap_ref_gencodev40_chr1`: Add subsampled reference to test BD rhapsody pipeline with. + +* `dataflow/merge`: Merge several unimodal .h5mu files into one multimodal .h5mu file. + +* Updated several python docker images to slim version. + +* `mapping/cellranger_count_split`: update container from ubuntu focal to ubuntu jammy + +* `download/sync_test_resources`: update AWS cli tools from 2.7.11 to 2.7.12 by updating docker image + +* `download/download_file`: now uses bash container instead of python. + +* `mapping/bd_rhapsody_wta`: Use squashed docker image in which log4j issues are resolved. + +## BUG FIXES + +* `workflows/utils/WorkflowHelper.nf`: Renamed `utils.nf` to `WorkflowHelper.nf`. + +* `workflows/utils/WorkflowHelper.nf`: Fix error message when required parameter is not specified. + +* `workflows/utils/WorkflowHelper.nf`: Added helper functions: + - `readConfig`: Read a Viash config from a yaml file. + - `viashChannel`: Create a channel from the Viash config and the params object. + - `helpMessage`: Print a help message and exit. + +* `mapping/bd_rhapsody_wta`: Update picard to 2.27.3. + +## DEPRECATED + +* `convert/from_bdrhap_to_h5ad`: Deprecated in favour for `convert/from_bdrhap_to_h5mu`. + +* `convert/from_10xh5_to_h5ad`: Deprecated in favour for `convert/from_10xh5_to_h5mu`. + +# openpipeline 0.3.1 + +## NEW FUNCTIONALITY + +* `bin/port_from_czbiohub_utilities.sh`: Added helper script to import components and pipelines from `czbiohub/utilities` + +Imported components from `czbiohub/utilities`: + +* `demux/cellranger_mkfastq`: Demultiplex raw sequencing data. + +* `mapping/cellranger_count`: Align fastq files using Cell Ranger count. + +* `mapping/cellranger_count_split`: Split 10x Cell Ranger output directory into separate output fields. + +Imported workflows from `czbiohub/utilities`: + +* `workflows/1_ingestion/cellranger`: Use Cell Ranger to preprocess 10x data. + +* `workflows/1_ingestion/cellranger_demux`: Use cellranger demux to demultiplex sequencing BCL output to FASTQ. + +* `workflows/1_ingestion/cellranger_mapping`: Use cellranger count to align 10x fastq files to a reference. + + +## MINOR CHANGES + +* Fix `interactive/run_cirrocumulus` script raising `NotImplementedError` caused by using `MutData.var_names_make_unique()` +on each modality instead of on the whole `MuData` object. + +* Fix `transform/normalize_total` and `interactive/run_cirrocumulus` component build missing a hdf5 dependency. + +* `interactive/run_cellxgene`: Updated container to ubuntu:focal because it contains python3.6 but cellxgene dropped python3.6 support. + +* `mapping/bd_rhapsody_wta`: Set `--parallel` to true by default. + +* `mapping/bd_rhapsody_wta`: Translate Bash script into Python. + +* `download/sync_test_resources`: Add `--dryrun`, `--quiet`, and `--delete` arguments. + +* `convert/from_h5mu_to_seurat`: Use `eddelbuettel/r2u:22.04` docker container in order to speed up builds by downloading precompiled R packages. + +* `mapping/cellranger_count`: Use 5Gb for testing (to adhere to github CI runner memory constraints). + +* `convert/from_bdrhap_to_h5ad`: change test data to output from `mapping/bd_rhapsody_wta` after reducing the BD Rhapsody test data size. + +* Various `config.vsh.yaml`s: Renamed `values:` to `choices:`. + +* `download/download_file` and `transfer/publish`: Switch base container from `bash:5.1` to `python:3.10`. + +* `mapping/bd_rhapsody_wta`: Make sure procps is installed. + +## BUG FIXES + +* `mapping/bd_rhapsody_wta`: Use a smaller test dataset to reduce test time and make sure that the Github Action runners do not run out of disk space. + +* `download/sync_test_resources`: Disable the use of the Amazon EC2 instance metadata service to make script work on Github Actions runners. + +* `convert/from_h5mu_to_seurat`: Fix unit test requiring Seurat by using native R functions to test the Seurat object instead. + +* `mapping/cellranger_count` and `bcl_demus/cellranger_mkfastq`: cellranger uses the `--parameter=value` formatting instead of `--parameter value` to set command line arguments. + +* `mapping/cellranger_count`: `--nosecondary` is no longer always applied. + +* `mapping/bd_rhapsody_wta`: Added workaround for bug in Viash 0.5.12 where triple single quotes are incorrectly escaped (viash-io/viash#139). + +## DEPRECATED + +* `bcl_demux/cellranger_mkfastq`: Duplicate of `demux/cellranger_mkfastq`. + +# openpipeline 0.3.0 + +* Add `tx_processing` pipeline with following components: + - `filter_with_counts` + - `filter_with_scrublet` + - `filter_with_hvg` + - `do_filter` + - `normalize_total` + - `regress_out` + - `log1p` + - `pca` + - `find_neighbors` + - `leiden` + - `umap` + +# openpipeline 0.2.0 + +## NEW FUNCTIONALITY + +* Added `from_10x_to_h5ad` and `download_10x_dataset` components. + +## MINOR CHANGES +* Workflow `bd_rhapsody_wta`: Minor change to workflow to allow for easy processing of multiple samples with a tsv. + +* Component `bd_rhapsody_wta`: Added more parameters, `--parallel` and `--timestamps`. + +* Added `pbmc_1k_protein_v3` as a test resource. + +* Translate `bd_rhapsody_extracth5ad` from R into Python script. + +* `bd_rhapsody_wta`: Remove temporary directory after execution. + +* `files/make_params`: Implement unit tests (PR #505). + +# openpipeline 0.1.0 + +* Initial release containing only a `bd_rhapsody_wta` pipeline and corresponding components. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..35b51670 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,132 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[INSERT CONTACT METHOD]. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f5b784fa --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 OpenPipelines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..2635e493 --- /dev/null +++ b/README.md @@ -0,0 +1,258 @@ + + +# OpenPipeline + +Extensible single cell analysis pipelines for reproducible and +large-scale single cell processing using Viash and Nextflow. + +[![ViashHub](https://img.shields.io/badge/ViashHub-openpipeline-7a4baa.svg)](https://www.viash-hub.com/packages/openpipeline) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2Fopenpipeline-blue.svg)](https://github.com/openpipelines-bio/openpipeline) +[![GitHub +License](https://img.shields.io/github/license/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/blob/main/LICENSE) +[![GitHub +Issues](https://img.shields.io/github/issues/openpipelines-bio/openpipeline.svg)](https://github.com/openpipelines-bio/openpipeline/issues) +[![Viash +version](https://img.shields.io/badge/Viash-v0.9.3-blue.svg)](https://viash.io) + +## Documentation + +Please find more in-depth documentation on [the +website](https://openpipelines.bio/). + +## Functionality Overview + +Openpipelines execute a list of predefined tasks. These descrete steps +are also provided as standalone components that can be executed +individually, with a standardized interface. This is especially useful +when a particular step wraps a tool that you do not necessarily always +need to execute in a workflow context. + +In terms of workflows, the following functionality is provided: + +- Demultiplexing: conversion of raw sequencing data to FASTQ objects. +- [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion): + Read mapping and generating a count matrix. +- [Single sample + processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample): + cell filtering and doublet detection. +- [Multisample + processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing): + Count transformation, normalization, QC metric calulations. +- [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration): + Clustering, integration and batch correction using single and + multimodal methods. +- Downstream analysis workflows + +``` mermaid lang="mermaid" +flowchart LR + demultiplexing["Step 1: Demultiplexing"] + ingestion["Step 2: Ingestion"] + process_samples["Step 3: Process Samples"] + integration["Step 4: Integration"] + downstream["Step 5: Downstream"] + demultiplexing-->ingestion-->process_samples-->integration-->downstream +``` + +## Guided execution using Viash Hub (CLI and Seqera cloud) + +Openpipelines is now available on [Viash +Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub +provides a list of components and workflows, together with a graphical +interface that guides you through the steps of running a workflow or +standalone component. Intstructions are provided for using a local viash +or nextflow executable (requires using a linux based OS), but connecting +to a Seqera cloud instance is also supported. + +## Execution using the nextflow executable + +Executing a workflow is a bit more involved and requires familiarity +with the command line interface (CLI). + +### Setup + +In order to use the workflows in this package on your local computer, +you’ll need to do the following: + +- Install [nextflow](https://www.nextflow.io/docs/latest/install.html) +- Install a nextflow compatible executor. This workflow provides a + profile for [docker](https://docs.docker.com/get-started/). + +### Location of the workflow scripts + +Nextflow workflow scripts, schema’s and configuration files can be found +in the `target/nextflow` folder. On the `main` branch however, only the +source code that needs to be build into the functionning workflows and +components can be found. Instead, please refer to the `main_build` +branch or any of the tags to find the `target` folders. Components and +workflows are organized into namespaces, which can be nested. Workflows +are located at `target/nextflow/workflows`, while components that +execute individual workflow steps are + +A reference of workflows and modules is also provided in the +[documentation](https://openpipelines.bio/components/). + +### Retrieving a list of a workflow parameters + +A list of workflows arguments can be consulted in multiple ways: + +- On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest) +- In the [reference + documentation](https://openpipelines.bio/components/) +- The config YAML file lists the argument for each workflow and + component +- In the `target/nextflow` folder, a nextflow schema JSON file + (`nextflow_schema.json`) is provided next to each workflow `.nf` file. +- Using nextflow on the CLI: + +``` bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/demux/main.nf \ + --help +``` + +### Resource usage tuning + +Nextflow’s labels can be used to specify the amount of resources a +process can use. This workflow uses the following labels for CPU, memory +and disk: + +- `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem` +- `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu` +- `lowdisk`, `middisk`, `highdisk`, `veryhighdisk` + +The defaults for these labels can be found at +`src/workflows/utils/labels.config`. Nextflow checks that the specified +resources for a process do not exceed what is available on the machine +and will not start if it does. Create your own config file to tune the +labels to your needs, for example: + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 16 } + + withLabel: verylowmem { memory = 4.GB } + withLabel: lowmem { memory = 8.GB } + withLabel: midmem { memory = 16.GB } + withLabel: highmem { memory = 32.GB } + +When starting nextflow using the CLI, you can use `-c` to provide the +file to nextflow and overwrite the defaults. + +### Demultiplexing example + +Here, generating FASTQ files from raw sequencing data is demonstrated, +based on data generated using 10X genomic’s protocols. However, BD +genomics data is also supported by Openpipeline. If you wish to try it +out yourself, test data is available at +`s3://openpipelines-data/cellranger_tiny_bcl/bcl`. + +``` bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/demux/main.nf \ + -c "" \ + -profile docker \ + --publish_dir "" \ + --id "cellranger_tiny_bcl" \ + --input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \ + --sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \ + --demultiplexer "mkfastq" +``` + +### Mapping and read counting + +FASTQ files can be mapped to a reference genome and the resulting mapped +reads can be counted in order to generate a count matrix. Both +`BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate +using Cell Ranger multi on test data available at +`s3://openpipelines-data/10x_5k_anticmv`. + +In order to facilitate passing multiple argument values, the parameters +can be specified using a YAML file. + +``` yaml +input: + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz" + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz" + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz" +gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz" +vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv" +library_id: + - "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset" + - "5k_human_antiCMV_T_TBNK_connect_AB_subset" + - "5k_human_antiCMV_T_TBNK_connect_VDJ_subset" +library_type: + - "Gene Expression" + - "Antibody Capture" + - "VDJ" +``` + +You can pass this file to nextflow using `-params-file` + +``` bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \ + -c "" \ + -profile docker \ + -params-file "" \ + --publish_dir "" +``` + +### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration) + +Once you have an MuData object for each of your samples, you can process +it into a multisample file that is ready for integration and other +downstream analyses. This can be done using the `process_samples` +workflow. Here is an example, but please keep in mind that the exact +parameters that need to be provided differ depending on you data. A lot +of functionality for this pipeline can be customized, including the name +of the output slots where data is being stored. + +``` yaml +param_list: + - id: "sample_1" + input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + rna_min_counts: 2 + - id: "sample_2" + input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + rna_min_counts: 1 +rna_max_counts: 1000000 +rna_min_genes_per_cell: 1 +rna_max_genes_per_cell: 1000000 +rna_min_cells_per_gene: 1 +rna_min_fraction_mito: 0.0 +rna_max_fraction_mito: 1.0 +``` + +In order to provide multiple samples to the pipeline, `param_list` is +used. Using `param_list` it is possible to specify arguments per sample. +However, it is still possible to define arguments for all samples +together by listing those outside the `param_list` block. + +``` bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ + -c "" \ + -profile docker \ + -params-file "" + --publish_dir "" +``` + +## Executing standalone components using the Viash executable + +Another option to execute individual modules on the CLI is to use +`viash run`. All you need to do is download viash, clone the +Openpipeline repository and point viash to a config file. However, keep +in mind that using `viash run` for workflows is currently not supported. +Please see `viash run --help` for more information on how to use the +command, but here is an example: + +``` bash +viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help +``` diff --git a/README.qmd b/README.qmd new file mode 100644 index 00000000..28825d08 --- /dev/null +++ b/README.qmd @@ -0,0 +1,205 @@ +--- +format: gfm +--- +```{r setup, include=FALSE} +project <- yaml::read_yaml("_viash.yaml") +license <- paste0(project$links$repository, "/blob/main/LICENSE") +``` +# OpenPipeline + +Extensible single cell analysis pipelines for reproducible and large-scale single cell processing using Viash and Nextflow. + +[![ViashHub](https://img.shields.io/badge/ViashHub-`r project$name`-7a4baa.svg)](https://www.viash-hub.com/packages/`r project$name`) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2F`r project$name`-blue.svg)](`r project$links$repository`) +[![GitHub License](https://img.shields.io/github/license/`r project$organization`/`r project$name`.svg)](`r license`) +[![GitHub Issues](https://img.shields.io/github/issues/`r project$organization`/`r project$name`.svg)](`r project$links$issue_tracker`) +[![Viash version](https://img.shields.io/badge/Viash-v`r gsub("-", "--", project$viash_version)`-blue.svg)](https://viash.io) + +## Documentation + +Please find more in-depth documentation on [the website](https://openpipelines.bio/). + +## Functionality Overview + +Openpipelines execute a list of predefined tasks. These descrete steps are also provided as standalone components that can be executed individually, with a standardized interface. This is especially useful when a particular step wraps a tool that you do not necessarily always need to execute in a workflow context. + +In terms of workflows, the following functionality is provided: + +* Demultiplexing: conversion of raw sequencing data to FASTQ objects. +* [Ingestion](https://openpipelines.bio/fundamentals/architecture.html#sec-ingestion): Read mapping and generating a count matrix. +* [Single sample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-single-sample): cell filtering and doublet detection. +* [Multisample processing](https://openpipelines.bio/fundamentals/architecture.html#sec-multisample-processing): Count transformation, normalization, QC metric calulations. +* [Integration](https://openpipelines.bio/fundamentals/architecture.html#sec-intergration): Clustering, integration and batch correction using single and multimodal methods. +* Downstream analysis workflows + + +```{mermaid lang='mermaid'} +flowchart LR + demultiplexing["Step 1: Demultiplexing"] + ingestion["Step 2: Ingestion"] + process_samples["Step 3: Process Samples"] + integration["Step 4: Integration"] + downstream["Step 5: Downstream"] + demultiplexing-->ingestion-->process_samples-->integration-->downstream +``` + +## Guided execution using Viash Hub (CLI and Seqera cloud) + +Openpipelines is now available on [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest). Viash Hub provides a list of components and workflows, together with a graphical interface that guides you through the steps of running a workflow or standalone component. Intstructions are provided for using a local viash or nextflow executable (requires using a linux based OS), but connecting to a Seqera cloud instance is also supported. + +## Execution using the nextflow executable + +Executing a workflow is a bit more involved and requires familiarity with the command line interface (CLI). + +### Setup + +In order to use the workflows in this package on your local computer, you'll need to do the following: + +* Install [nextflow](https://www.nextflow.io/docs/latest/install.html) +* Install a nextflow compatible executor. This workflow provides a profile for [docker](https://docs.docker.com/get-started/). + +### Location of the workflow scripts + +Nextflow workflow scripts, schema's and configuration files can be found in the `target/nextflow` folder. +On the `main` branch however, only the source code that needs to be build into the functionning workflows and components can be found. +Instead, please refer to the `main_build` branch or any of the tags to find the `target` folders. +Components and workflows are organized into namespaces, which can be nested. Workflows are located at `target/nextflow/workflows`, +while components that execute individual workflow steps are + +A reference of workflows and modules is also provided in the [documentation](https://openpipelines.bio/components/). + +### Retrieving a list of a workflow parameters + +A list of workflows arguments can be consulted in multiple ways: + +* On [Viash Hub](https://www.viash-hub.com/packages/openpipeline/latest) +* In the [reference documentation](https://openpipelines.bio/components/) +* The config YAML file lists the argument for each workflow and component +* In the `target/nextflow` folder, a nextflow schema JSON file (`nextflow_schema.json`) is provided next to each workflow `.nf` file. +* Using nextflow on the CLI: + +```bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/demux/main.nf \ + --help +``` + +### Resource usage tuning + +Nextflow's labels can be used to specify the amount of resources a process can use. This workflow uses the following labels for CPU, memory and disk: + +* `lowmem`, `lowmem`, `midmem`, `highmem`, `veryhighmem` +* `lowcpu`, `lowcpu`, `midcpu`, `highcpu`, `veryhighcpu` +* `lowdisk`, `middisk`, `highdisk`, `veryhighdisk` + +The defaults for these labels can be found at `src/workflows/utils/labels.config`. Nextflow checks that the specified resources for a process do not exceed what is available on the machine and will not start if it does. Create your own config file to tune the labels to your needs, for example: + +``` +// Resource labels +withLabel: verylowcpu { cpus = 2 } +withLabel: lowcpu { cpus = 8 } +withLabel: midcpu { cpus = 16 } +withLabel: highcpu { cpus = 16 } + +withLabel: verylowmem { memory = 4.GB } +withLabel: lowmem { memory = 8.GB } +withLabel: midmem { memory = 16.GB } +withLabel: highmem { memory = 32.GB } +``` + +When starting nextflow using the CLI, you can use `-c` to provide the file to nextflow and overwrite the defaults. + +### Demultiplexing example + +Here, generating FASTQ files from raw sequencing data is demonstrated, based on data generated using 10X genomic's protocols. However, BD genomics data is also supported by Openpipeline. If you wish to try it out yourself, test data is available at `s3://openpipelines-data/cellranger_tiny_bcl/bcl`. + +```bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/demux/main.nf \ + -c "" \ + -profile docker \ + --publish_dir "" \ + --id "cellranger_tiny_bcl" \ + --input "s3://openpipelines-data/cellranger_tiny_bcl/bcl" \ + --sample_sheet "s3://openpipelines-data/cellranger_tiny_bcl/bcl/sample_sheet.csv" \ + --demultiplexer "mkfastq" +``` + +### Mapping and read counting + +FASTQ files can be mapped to a reference genome and the resulting mapped reads can be counted in order to generate a count matrix. Both `BD Rhapsody` and `Cell Ranger` are supported. Here, we demonstrate using Cell Ranger multi on test data available at `s3://openpipelines-data/10x_5k_anticmv`. + + +In order to facilitate passing multiple argument values, the parameters can be specified using a YAML file. +```yaml +input: + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_*.fastq.gz" + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_*.fastq.gz" + - "s3://openpipelines-data/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_*.fastq.gz" +gex_reference: "s3://openpipelines-data/reference_gencodev41_chr1/reference_cellranger.tar.gz" +vdj_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +feature_reference: "s3://openpipelines-data/10x_5k_anticmv/raw/feature_reference.csv" +library_id: + - "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset" + - "5k_human_antiCMV_T_TBNK_connect_AB_subset" + - "5k_human_antiCMV_T_TBNK_connect_VDJ_subset" +library_type: + - "Gene Expression" + - "Antibody Capture" + - "VDJ" +``` +You can pass this file to nextflow using `-params-file` + +```bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/ingestion/cellranger_multi/main.nf \ + -c "" \ + -profile docker \ + -params-file "" \ + --publish_dir "" +``` + +### Filtering, normalization, clustering, dimensionality reduction and QC calculations (w/o integration) + +Once you have an MuData object for each of your samples, you can process it into a multisample file that is ready for integration and other downstream analyses. This can be done using the `process_samples` workflow. Here is an example, but please keep in mind that the exact parameters that need to be provided differ depending on you data. A lot of functionality for this pipeline can be customized, including the name of the output slots where data is being stored. + +```yaml +param_list: + - id: "sample_1" + input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + rna_min_counts: 2 + - id: "sample_2" + input: "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + rna_min_counts: 1 +rna_max_counts: 1000000 +rna_min_genes_per_cell: 1 +rna_max_genes_per_cell: 1000000 +rna_min_cells_per_gene: 1 +rna_min_fraction_mito: 0.0 +rna_max_fraction_mito: 1.0 +``` + +In order to provide multiple samples to the pipeline, `param_list` is used. Using `param_list` it is possible to specify arguments per sample. However, it is still possible to define arguments for all samples together by listing those outside the `param_list` block. + +```bash +nextflow run openpipelines-bio/openpipeline \ + -r 2.1.1 \ + -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ + -c "" \ + -profile docker \ + -params-file "" + --publish_dir "" +``` + + +## Executing standalone components using the Viash executable + +Another option to execute individual modules on the CLI is to use `viash run`. All you need to do is download viash, clone the Openpipeline repository and point viash to a config file. However, keep in mind that using `viash run` for workflows is currently not supported. Please see `viash run --help` for more information on how to use the command, but here is an example: + + +```bash +viash run --engine docker src/mapping/cellranger_multi/config.vsh.yaml --help +``` diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 00000000..e7617710 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,45 @@ +viash_version: 0.9.4 + +source: src +target: target + +# Note: this causes the docker images to be renamed +name: openpipeline +organization: openpipelines-bio +summary: | + Best-practice workflows for single-cell multi-omics analyses. + +description: | + OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/). + + In terms of workflows, the following has been made available, but keep in mind that + individual tools and functionality can be executed as standalone components as well. + + * Demultiplexing: conversion of raw sequencing data to FASTQ objects. + * Ingestion: Read mapping and generating a count matrix. + * Single sample processing: cell filtering and doublet detection. + * Multisample processing: Count transformation, normalization, QC metric calulations. + * Integration: Clustering, integration and batch correction using single and multimodal methods. + * Downstream analysis workflows +license: MIT +keywords: [single-cell, multimodal] + +links: + repository: https://github.com/openpipelines-bio/openpipeline + docker_registry: ghcr.io + homepage: https://openpipelines.bio + documentation: https://openpipelines.bio/fundamentals + issue_tracker: https://github.com/openpipelines-bio/openpipeline/issues + +info: + test_resources: + - type: s3 + path: s3://openpipelines-data + dest: resources_test + nextflow_labels_ci: + - path: src/workflows/utils/labels_ci.config + description: Adds the correct memory and CPU labels when running on the Viash Hub CI. + +config_mods: | + .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'} + .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")' diff --git a/images/concepts/fig.svg b/images/concepts/fig.svg new file mode 100644 index 00000000..2a72f8e7 --- /dev/null +++ b/images/concepts/fig.svg @@ -0,0 +1,5340 @@ + + + +Title ofthe pipelineSome text +some moreSome text +some moreSingle-sampleRNA processing+ cell QC stats- bad cells×Multi-sampleRNA processingsample 1sample 2sample 3+ normalized layer+ feature QC statsSingle-sampleADT processing×+ cell QC stats- bad cellsMulti-sampleADT processingsample 1sample 2sample 3+ normalized layers+ feature QC +stats diff --git a/images/concepts/fig_cell.svg b/images/concepts/fig_cell.svg new file mode 100644 index 00000000..a3c178c9 --- /dev/null +++ b/images/concepts/fig_cell.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/concepts/fig_modality_adt.svg b/images/concepts/fig_modality_adt.svg new file mode 100644 index 00000000..90ee9bdf --- /dev/null +++ b/images/concepts/fig_modality_adt.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/concepts/fig_modality_atac.svg b/images/concepts/fig_modality_atac.svg new file mode 100644 index 00000000..817d95da --- /dev/null +++ b/images/concepts/fig_modality_atac.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/concepts/fig_modality_rna.svg b/images/concepts/fig_modality_rna.svg new file mode 100644 index 00000000..0ad5d5ea --- /dev/null +++ b/images/concepts/fig_modality_rna.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/concepts/fig_modality_vdj.svg b/images/concepts/fig_modality_vdj.svg new file mode 100644 index 00000000..9f59284b --- /dev/null +++ b/images/concepts/fig_modality_vdj.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/concepts/fig_workflow_multiomics_adt_multisample.svg b/images/concepts/fig_workflow_multiomics_adt_multisample.svg new file mode 100644 index 00000000..1df7dd7a --- /dev/null +++ b/images/concepts/fig_workflow_multiomics_adt_multisample.svg @@ -0,0 +1,2 @@ +sample 1sample 2sample 3+ normalized layers+ feature QC +stats \ No newline at end of file diff --git a/images/concepts/fig_workflow_multiomics_adt_singlesample.svg b/images/concepts/fig_workflow_multiomics_adt_singlesample.svg new file mode 100644 index 00000000..c52fea66 --- /dev/null +++ b/images/concepts/fig_workflow_multiomics_adt_singlesample.svg @@ -0,0 +1 @@ +×+ cell QC stats- bad cells \ No newline at end of file diff --git a/images/concepts/fig_workflow_multiomics_rna_multisample.svg b/images/concepts/fig_workflow_multiomics_rna_multisample.svg new file mode 100644 index 00000000..84f42f08 --- /dev/null +++ b/images/concepts/fig_workflow_multiomics_rna_multisample.svg @@ -0,0 +1 @@ +sample 1sample 2sample 3+ normalized layer+ feature QC stats \ No newline at end of file diff --git a/images/concepts/fig_workflow_multiomics_rna_singlesample.svg b/images/concepts/fig_workflow_multiomics_rna_singlesample.svg new file mode 100644 index 00000000..9ad3da2b --- /dev/null +++ b/images/concepts/fig_workflow_multiomics_rna_singlesample.svg @@ -0,0 +1 @@ ++ cell QC stats- bad cells× \ No newline at end of file diff --git a/images/concepts/generate_subimages.sh b/images/concepts/generate_subimages.sh new file mode 100755 index 00000000..53ffbb34 --- /dev/null +++ b/images/concepts/generate_subimages.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# so let's do it separately +rm images/concepts/fig_*.svg + +for id in cell modality_rna modality_adt modality_vdj modality_atac workflow_multiomics_rna_singlesample workflow_multiomics_rna_multisample workflow_multiomics_adt_singlesample workflow_multiomics_adt_multisample; do + inkscape --export-type="svg" --export-id="$id" --export-id-only images/concepts/fig.svg + svgo images/concepts/fig_${id}.svg +done \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..28839dad --- /dev/null +++ b/main.nf @@ -0,0 +1,5 @@ +nextflow.enable.dsl=2 + +workflow { + print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.") +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..89509ce3 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,22 @@ +// template nextflow.config for nested workflows + +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +// TODO 1: unquote and adapt `rootDir` according to relative path within project +// params { +// rootDir = "$projectDir/../.." +// } +// +// workflowDir = "${params.rootDir}/workflows" +// targetDir = "${params.rootDir}/target/nextflow" + +// TODO 2: insert custom imports here + +// TODO 3: unquote +// docker { +// runOptions = "-v \$(realpath ${params.rootDir}):\$(realpath ${params.rootDir})" +// } + + diff --git a/resources_test_scripts/10x_20k_fixed.sh b/resources_test_scripts/10x_20k_fixed.sh new file mode 100755 index 00000000..0de8a4d9 --- /dev/null +++ b/resources_test_scripts/10x_20k_fixed.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=10x_5k_fixed +OUT="resources_test/$ID" + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1/" +genome_tar="$reference_dir/reference_cellranger.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# create tempdir +MY_TEMP="${VIASH_TEMP:-/tmp}" +TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} + +# dataset page: +# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + + # download fastqs and untar + wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit head -n 200000 "$input" | gzip > "$output" + fi +} + +orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex" + +seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz" + +# download feature reference +feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv" +if [[ ! -f "$feature_ref" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref" +fi + +# download probe set +probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" +if [[ ! -f "$probe_set" ]]; then + wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set" +fi + +sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set" + +probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv" +if [[ ! -f "$probe_set_corrected" ]]; then + reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" + gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf" + cat "$probe_set" | while read line || [[ -n $line ]]; + do + echo "Line: $line" + old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' ) + echo "Old ID: $old_id" + if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then + echo "Just writing line" + printf "%s\n" "$line" >> "$probe_set_corrected" + else + gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;) + if [ ! -z "$gtf_lookup" ]; then + echo "Found hit" + new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//") + echo "New ID: $new_id" + new_line=${line/"$old_id"/"$new_id"} + printf "%s\n" "$new_line" >> "$probe_set_corrected" + else + echo "Did not find hit" + fi + fi + done +fi + +# # Input FASTA: +# # >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF +# # Output FASTA: +# # >chr1 1 +# input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz" +# fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa" +# if [[ ! -f "$input_fastq" ]]; then +# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq" +# fi +# zcat "$input_fastq" \ +# | sed -E 's/^>(\S+).*/>\1 \1/' \ +# | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \ +# | sed -E 's/^>MT />chrM /' \ +# > "$fasta_modified" + +# pigz --fast "$fasta_modified" +# fasta_modified="$fasta_modified.gz" +# # Input GTF: +# # ... gene_id "ENSG00000223972.5"; ... +# # Output GTF: +# # ... gene_id "ENSG00000223972"; gene_version "5"; ... +# input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz" +# gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf" +# if [[ ! -f "$input_gtf" ]]; then +# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf" +# fi + +# REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)" +# zcat "$input_gtf" \ +# | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \ +# | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \ +# | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \ +# > "$gtf_modified" +# pigz --fast "$gtf_modified" +# gtf_modified="$gtf_modified.gz" + +final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz" +if [ ! -f "$final_genome" ]; then + NXF_VER=21.10.6 nextflow \ + run . \ + -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \ + -profile docker \ + -resume \ + --id "GRCh38" \ + --genome_fasta "$fasta_modified" \ + --transcriptome_gtf "$gtf_modified" \ + --target "cellranger" \ + --output_fasta "reference.fa.gz" \ + --output_gtf "reference.gtf.gz" \ + --output_cellranger "GRCh38.cellranger.genome.fa.gz" \ + --publish_dir "$HOME/.cache/openpipeline/" +fi + + +# Run mapping pipeline +cat > /tmp/params.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - ${orig_sample_id}_subset + library_type: + - "Gene Expression" + library_lanes: + - "any" + +probe_set: "$probe_set_corrected" +gex_reference: "$genome_tar" +feature_reference: "$feature_ref" +publish_dir: "$OUT/processed" +probe_barcode_ids: + - BC001 + - BC002 + - BC003 + - BC004 +sample_ids: + - Liver_BC1 + - Ovarian_BC2 + - Colorectal_BC3 + - Pancreas_BC4 +gex_generate_bam: false +sample_force_cells: + - 5000 + - -1 + - -1 + - -1 +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels_ci.config diff --git a/resources_test_scripts/10x_4plex_dtc.sh b/resources_test_scripts/10x_4plex_dtc.sh new file mode 100755 index 00000000..3f04276b --- /dev/null +++ b/resources_test_scripts/10x_4plex_dtc.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=10x_4plex_dtc +OUT="resources_test/$ID" + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1/" +genome_tar="$reference_dir/reference_cellranger.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# create tempdir +MY_TEMP="${VIASH_TEMP:-/tmp}" +TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} + +# dataset page: +# https://www.10xgenomics.com/datasets/40k-mixture-of-dissociated-tumor-cells-from-3-donors-stained-with-totalseqc-antibodies + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/4plex_DTC_kidney_lung_breast_TotalSeqC_fastqs" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + + # download fastqs and untar + wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit head -n 100000 "$input" | gzip > "$output" + fi +} + +orig_sample_id="4plex_DTC_kidney_lung_breast_TotalSeqC" + +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L002_R2_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L003_R2_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex_S1_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S1_L004_R2_001.fastq.gz" + + +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L002_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L002_R2_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L003_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L003_R2_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_ab_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ab_subset_S2_L004_R2_001.fastq.gz" + +# download feature reference +feature_ref="$raw_dir/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" +if [[ ! -f "$feature_ref" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex/4plex_DTC_kidney_lung_breast_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref" +fi + +# download probe set +probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv" +if [[ ! -f "$probe_set" ]]; then + wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv" -O "$probe_set" +fi + +sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set" + +probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A-corrected.csv" +if [[ ! -f "$probe_set_corrected" ]]; then + reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" + gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf" + cat "$probe_set" | while read line || [[ -n $line ]]; + do + echo "Line: $line" + old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' ) + echo "Old ID: $old_id" + if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then + echo "Just writing line" + printf "%s\n" "$line" >> "$probe_set_corrected" + else + gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;) + if [ ! -z "$gtf_lookup" ]; then + echo "Found hit" + new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//") + echo "New ID: $new_id" + new_line=${line/"$old_id"/"$new_id"} + printf "%s\n" "$new_line" >> "$probe_set_corrected" + else + echo "Did not find hit" + fi + fi + done +fi + +# Run mapping pipeline +cat > /tmp/params.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - ${orig_sample_id}_gex_subset + - ${orig_sample_id}_ab_subset + library_type: + - "Gene Expression" + - "Antibody Capture" + library_lanes: + - "any" + - "any" + +probe_set: "$probe_set_corrected" +gex_reference: "$genome_tar" +feature_reference: "$feature_ref" +publish_dir: "$OUT/processed" +probe_barcode_ids: + - BC001+AB001 + - BC002+AB002 + - BC003+AB003 + - BC004+AB004 +sample_ids: + - Kidney_Cancer1_BC1_AB1 + - Kidney_Cancer2_BC2_AB2 + - Breast_Cancer_BC3_AB3 + - Lung_Cancer_BC4_AB4 +gex_generate_bam: false +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels_ci.config diff --git a/resources_test_scripts/10x_5k_anticmv.sh b/resources_test_scripts/10x_5k_anticmv.sh new file mode 100755 index 00000000..cfd9e373 --- /dev/null +++ b/resources_test_scripts/10x_5k_anticmv.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +set -eo pipefail + + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=10x_5k_anticmv +OUT=resources_test/$ID + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# dataset page: +# https://www.10xgenomics.com/resources/datasets/integrated-gex-totalseqc-and-tcr-analysis-of-connect-generated-library-from-5k-cmv-t-cells-2-standard + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1/" +genome_tar="$reference_dir/reference_cellranger.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/5k_human_antiCMV_T_TBNK_connect_Multiplex" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + + # download fastqs and untar + wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_fastqs.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit head -n 200000 "$input" | gzip > "$output" + fi +} + +orig_sample_id="5k_human_antiCMV_T_TBNK_connect" + +seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/gex_1/${orig_sample_id}_GEX_1_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_GEX_1_subset_S1_L001_R2_001.fastq.gz" + +seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R1_001.fastq.gz" +seqkit_head "$tar_dir/ab/${orig_sample_id}_AB_S2_L004_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_AB_subset_S2_L004_R2_001.fastq.gz" + +seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/vdj/${orig_sample_id}_VDJ_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_VDJ_subset_S1_L001_R2_001.fastq.gz" + +# download immune panel fasta if needed +feature_reference="$raw_dir/feature_reference.csv" +if [[ ! -f "$feature_reference" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-vdj/6.1.2/5k_human_antiCMV_T_TBNK_connect_Multiplex/5k_human_antiCMV_T_TBNK_connect_Multiplex_count_feature_reference.csv" -O "$feature_reference" +fi + +# download vdj reference if needed +vdj_ref="$raw_dir/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +if [[ ! -f "$vdj_ref" ]]; then + wget "https://cf.10xgenomics.com/supp/cell-vdj/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" -O "$vdj_ref" +fi + + +# Run mapping pipeline +cat > /tmp/params.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - "${orig_sample_id}_GEX_1_subset" + - "${orig_sample_id}_AB_subset" + - "${orig_sample_id}_VDJ_subset" + library_type: + - "Gene Expression" + - "Antibody Capture" + - "VDJ" + +gex_reference: "$genome_tar" +vdj_reference: "$vdj_ref" +feature_reference: "$feature_reference" +publish_dir: "$OUT/processed" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config \ + -c src/workflows/utils/errorstrat_ignore.config + +# Convert to h5mu +cat > /tmp/params.yaml << HERE +id: "$orig_sample_id" +input: "$OUT/processed/10x_5k_anticmv.cellranger_multi.output" +publish_dir: "$OUT/" +output: "*.h5mu" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config + +mv "$OUT/0.h5mu" "$OUT/${orig_sample_id}.h5mu" + + +# run qc workflow +cat > /tmp/params.yaml << HERE +id: "$ID" +input: "$OUT/$orig_sample_id.h5mu" +var_name_mitochondrial_genes: mitochondrial +var_name_ribosomal_genes: ribosomal +publish_dir: "$OUT/" +output: "${orig_sample_id}_qc.h5mu" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/workflows/qc/qc/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config + + +# Run full pipeline +cat > /tmp/params.yaml << HERE +id: "$ID" +input: "$OUT/${orig_sample_id}_qc.h5mu" +publish_dir: "$OUT/" +output: "${orig_sample_id}_mms.h5mu" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config + +# create fastqc directory +fastqc_dir="$OUT/fastqc" +mkdir -p "$fastqc_dir" + +./target/executable/qc/fastqc/fastqc \ + --input "$raw_dir" \ + --mode "dir" \ + --output "$fastqc_dir" + + +# Create a test dataset for the Custom modality +# by just labeling the AB as custom +feat_ref_name=$(basename $feature_reference) +sed -e 's/Antibody Capture/Custom/g' "$feature_reference" > "/tmp/custom_${feat_ref_name}" + +cat > /tmp/params_custom.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - "${orig_sample_id}_GEX_1_subset" + - "${orig_sample_id}_AB_subset" + - "${orig_sample_id}_VDJ_subset" + library_type: + - "Gene Expression" + - "Custom" + - "VDJ" + +gex_reference: "$genome_tar" +feature_reference: "/tmp/custom_${feat_ref_name}" +vdj_reference: "$vdj_ref" +publish_dir: "$OUT/processed_with_custom" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params_custom.yaml \ + -c src/workflows/utils/labels.config \ + -c src/workflows/utils/errorstrat_ignore.config \ No newline at end of file diff --git a/resources_test_scripts/10x_5k_beam.sh b/resources_test_scripts/10x_5k_beam.sh new file mode 100755 index 00000000..01943971 --- /dev/null +++ b/resources_test_scripts/10x_5k_beam.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=10x_5k_beam +OUT="resources_test/$ID" + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1/" +genome_tar="$reference_dir/reference_cellranger.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# dataset page: +# https://www.10xgenomics.com/datasets/5k-human-a0201-b0702-pbmcs-beam-t-2-standard + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/5k_human_A0201_B0702_PBMCs_BEAM_T" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + + # download fastqs and untar + wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_fastqs.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit head -n 200000 "$input" | gzip > "$output" + fi +} + +orig_sample_id="beamt_human_A0201_B0702_pbmc" + +seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/gex/${orig_sample_id}_gex_S3_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S3_L001_R2_001.fastq.gz" + +seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/vdj/${orig_sample_id}_vdj_S2_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_vdj_subset_S2_L001_R2_001.fastq.gz" + +seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/antigen_capture/${orig_sample_id}_ag_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_ag_subset_S1_L001_R2_001.fastq.gz" + +# download feature reference +feature_ref="$raw_dir/beamt_human_A0201_B0702_pbmc_feature_reference.csv" +if [[ ! -f "$feature_ref" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_count_feature_reference.csv" -O "$feature_ref" +fi + +# download vdj reference if needed +vdj_ref="$raw_dir/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" +if [[ ! -f "$vdj_ref" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-vdj/7.1.0/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -O "$vdj_ref" +fi + +# Run mapping pipeline +# TODO: Also include conversion to h5mu +cat > /tmp/params.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - "${orig_sample_id}_gex_subset" + - "${orig_sample_id}_vdj_subset" + - "${orig_sample_id}_ag_subset" + library_type: + - "Gene Expression" + - "VDJ-T" + - "Antigen Capture" + +gex_reference: "$genome_tar" +feature_reference: "$feature_ref" +vdj_reference: "$vdj_ref" +control_id: + - negative_control_A0201 + - negative_control_B0702 +mhc_allele: + - "HLA-A*02:01" + - "HLA-B*07:02" +publish_dir: "$OUT/processed" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels_ci.config + +# Create h5mu +cat > /tmp/params.yaml << HERE +id: "$ID" +input: "$OUT/processed/$ID.cellranger_multi.output" +publish_dir: "$OUT/" +output: "$orig_sample_id.h5mu" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels_ci.config diff --git a/resources_test_scripts/10x_5k_lung_crispr.sh b/resources_test_scripts/10x_5k_lung_crispr.sh new file mode 100755 index 00000000..9a57828c --- /dev/null +++ b/resources_test_scripts/10x_5k_lung_crispr.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=10x_5k_lung_crispr +OUT="resources_test/$ID" + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1/" +genome_tar="$reference_dir/reference_cellranger.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# dataset page: +# https://www.10xgenomics.com/resources/datasets/5-k-a-549-lung-carcinoma-cells-no-treatment-transduced-with-a-crispr-pool-3-1-standard-6-0-0 + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + + # download fastqs and untar + wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_fastqs.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit head -n 200000 "$input" | gzip > "$output" + fi +} + +orig_sample_id="SC3_v3_NextGem_DI_CRISPR_A549_5K" + +seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_gex/${orig_sample_id}_gex_S5_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_gex_subset_S5_L001_R2_001.fastq.gz" + +seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R1_001.fastq.gz" +seqkit_head "$tar_dir/${orig_sample_id}_crispr/${orig_sample_id}_crispr_S4_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_crispr_subset_S4_L001_R2_001.fastq.gz" + + +# download crispr feature reference +crispr_ref="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" +if [[ ! -f "$crisp_ref" ]]; then + wget "https://cf.10xgenomics.com/samples/cell-exp/6.0.0/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference.csv" -O "$crispr_ref" +fi + +crispr_ref_adjusted="$raw_dir/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv" +reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" +cat "$crispr_ref" | while read line || [[ -n $line ]]; +do + echo "Line: $line" + old_id=$( printf "%s\n" "$line" | awk -F',' '{print $7}' ) + echo "Old ID: $old_id" + if [ "$old_id" = "Non-Targeting" ] || [ "$old_id" = "target_gene_id" ] ; then + echo "Just writing line" + printf "%s\n" "$line" >> "$crispr_ref_adjusted" + else + gtf_lookup=$(zgrep "$old_id" "$reference_gtf" || test $? = 1;) + if [ ! -z "$gtf_lookup" ]; then + echo "Found hit" + new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//") + echo "New ID: $new_id" + new_line=${line/"$old_id"/"$new_id"} + printf "%s\n" "$new_line" >> "$crispr_ref_adjusted" + else + echo "Did not find hit" + fi + fi +done + + +# Run mapping pipeline +# TODO: Also include conversion to h5mu +cat > /tmp/params.yaml << HERE +param_list: +- id: "$ID" + input: "$raw_dir" + library_id: + - "${orig_sample_id}_gex_subset" + - "${orig_sample_id}_crispr_subset" + library_type: + - "Gene Expression" + - "CRISPR Guide Capture" + +gex_reference: "$genome_tar" +feature_reference: "$crispr_ref_adjusted" +publish_dir: "$OUT/processed" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/mapping/cellranger_multi/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config + +# Create h5mu +cat > /tmp/params.yaml << HERE +id: "$ID" +input: "$OUT/processed/10x_5k_lung_crispr.cellranger_multi.output" +publish_dir: "$OUT/" +output: "$orig_sample_id.h5mu" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf \ + -resume \ + -profile docker,mount_temp \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config diff --git a/resources_test_scripts/annotation_test_data.sh b/resources_test_scripts/annotation_test_data.sh new file mode 100755 index 00000000..b9db9855 --- /dev/null +++ b/resources_test_scripts/annotation_test_data.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +ID=annotation_test_data +OUT=resources_test/$ID/ + +# ideally, this would be a versioned pipeline run +[ -d "$OUT" ] || mkdir -p "$OUT" + +# Download Tabula Sapiens Blood reference h5ad from https://doi.org/10.5281/zenodo.7587774 +wget "https://zenodo.org/record/7587774/files/TS_Blood_filtered.h5ad?download=1" -O "${OUT}/tmp_TS_Blood_filtered.h5ad" + +# Download Tabula Sapiens Blood pretrained model from https://doi.org/10.5281/zenodo.7580707 +wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?download=1" -O "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz" + +# Download PopV specific CL ontology files - needed for OnClass +# OUT_ONTOLOGY="${OUT}/ontology" +# [ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY" +# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.obo \ +# -O "${OUT_ONTOLOGY}/cl.obo" +# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology \ +# -O "${OUT_ONTOLOGY}/cl.ontology" +# wget https://raw.githubusercontent.com/czbiohub/PopV/main/ontology/cl.ontology.nlp.emb \ +# -O "${OUT_ONTOLOGY}/cl.ontology.nlp.emb" + + +# Process Tabula Sapiens Blood reference h5ad +# (Select one individual and 100 cells per cell type) +# normalize and log1p transform data +# Add treatment and disease columns +python <=n].groupby('cell_ontology_class').head(n).index] + +# Normalize and log1p transform data +data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X) +sc.pp.normalize_total(data_for_scanpy, target_sum=10000) +sc.pp.log1p( + data_for_scanpy, + base=None, + layer=None, + copy=False, +) +sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X + +# Add treatment and disease columns +n_cells = sub_ref_adata_final.n_obs +treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5]) +disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5]) +sub_ref_adata_final.obs["treatment"] = treatment +sub_ref_adata_final.obs["disease"] = disease + +# Write out data +sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip') +HEREDOC + + +echo "> Converting to h5mu" +viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \ + --input "${OUT}/TS_Blood_filtered.h5ad" \ + --output "${OUT}/TS_Blood_filtered.h5mu" \ + --modality "rna" + +rm "${OUT}/tmp_TS_Blood_filtered.h5ad" + +echo "> Downloading pretrained CellTypist model and sample test data" +wget https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl \ + -O "${OUT}/celltypist_model_Immune_All_Low.pkl" +wget https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_2000_cells.h5ad \ + -O "${OUT}/demo_2000_cells.h5ad" +viash run src/convert/from_h5ad_to_h5mu/config.vsh.yaml --engine docker -- \ + --input "${OUT}/demo_2000_cells.h5ad" \ + --output "${OUT}/demo_2000_cells.h5mu" \ + --modality "rna" + + +echo "> Fetching OnClass data and models" +OUT_ONTOLOGY="${OUT}/ontology" +[ -d "$OUT_ONTOLOGY" ] || mkdir -p "$OUT_ONTOLOGY" +wget https://figshare.com/ndownloader/files/28394466 -O "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" +tar -xzvf "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" -C "${OUT_ONTOLOGY}" --strip-components=2 +rm "${OUT_ONTOLOGY}/allen.ontology" +rm "${OUT_ONTOLOGY}/OnClass_data_public_minimal.tar.gz" + +wget https://figshare.com/ndownloader/files/28394541 -O "${OUT}/OnClass_models.tar.gz" +tar -xzvf "${OUT}/OnClass_models.tar.gz" -C "${OUT}" --strip-components=1 +rm "${OUT}/OnClass_models.tar.gz" +rm "${OUT}/tmp_pretrained_models_Blood_ts.tar.gz" + +find "${OUT}/Pretrained_model" ! -name "example_file_model*" -type f -exec rm -f {} + +mv "${OUT}/Pretrained_model" "${OUT}/onclass_model" + +echo "> Creating SCVI model" +viash run src/integrate/scvi/config.vsh.yaml --engine docker -- \ + --input "${OUT}/TS_Blood_filtered.h5mu" \ + --obs_batch "donor_id" \ + --var_gene_names "ensemblid" \ + --output "${OUT}/scvi_output.h5mu" \ + --output_model "${OUT}/scvi_model" \ + --max_epochs 5 \ + --n_obs_min_count 10 \ + --n_var_min_count 10 + +echo "> Creating SCANVI model" +viash run src/integrate/scanvi/config.vsh.yaml --engine docker -- \ + --input "${OUT}/TS_Blood_filtered.h5mu" \ + --var_gene_names "ensemblid" \ + --obs_labels "cell_ontology_class" \ + --scvi_model "${OUT}/scvi_model" \ + --output "${OUT}/scanvi_output.h5mu" \ + --output_model "${OUT}/scanvi_model" \ + --max_epochs 5 + +rm "${OUT}/scanvi_output.h5mu" +rm "${OUT}/scvi_output.h5mu" +rm -r "${OUT}/Pretrained_model/" + +echo "> Creating Pseudobulk Data for DGEA" +viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \ + --input "${OUT}/TS_Blood_filtered.h5mu" \ + --obs_grouping "cell_type" \ + --obs_sample_conditions "donor_id" \ + --obs_sample_conditions "treatment" \ + --obs_sample_conditions "disease" \ + --min_num_cells_per_sample 5 \ + --output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu" \ No newline at end of file diff --git a/resources_test_scripts/aws_sync.sh b/resources_test_scripts/aws_sync.sh new file mode 100644 index 00000000..9a36d786 --- /dev/null +++ b/resources_test_scripts/aws_sync.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -eo pipefail + +aws s3 sync --profile di "resources_test" "s3://openpipelines-data" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun + +id=cellranger_tiny_fastq +aws s3 sync --profile di "resources_test/$id" "s3://openpipelines-data/$id" --exclude "temp_*" --exclude "tmp_*" --delete --dryrun \ No newline at end of file diff --git a/resources_test_scripts/bdrhap_5kjrt.sh b/resources_test_scripts/bdrhap_5kjrt.sh new file mode 100755 index 00000000..5bd61f7f --- /dev/null +++ b/resources_test_scripts/bdrhap_5kjrt.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=bdrhap_5kjrt +OUT=resources_test/$ID +n_threads=30 + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# check whether reference is available +reference_dir="resources_test/reference_gencodev41_chr1" +genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz" +if [[ ! -f "$genome_tar" ]]; then + echo "$genome_tar does not exist. Please create the reference genome first" + exit 1 +fi + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/12WTA-ABC-SMK-EB-5kJRT" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +genome_dir="$raw_dir/temp_reference_gencodev41_chr1" +if [[ ! -d "$genome_dir" ]]; then + echo "> Untarring genome" + mkdir -p "$genome_dir" + tar -xvf "$genome_tar" -C "$genome_dir" +fi + +# process WTA fastq files +# map to chr1, subsample chr1 reads +mapping_dir="$raw_dir/temp_mapping_chr_1" +if [[ ! -f "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" ]]; then + echo "> Processing 12WTA_S1_L432_R[12]_001.fastq.gz" + mkdir -p "$mapping_dir" + # MUST USE A STAR THAT IS COMPATIBLE WITH BD RHAPSODY + # For the cwl pipeline 1.9.1, 2.5.2b should work. + echo "star" + docker run --rm -i \ + -v "`pwd`/$OUT:`pwd`/$OUT" \ + -v "$tar_dir:$tar_dir" \ + -w `pwd` bdgenomics/rhapsody:1.10.1 \ + STAR \ + --runThreadN "$n_threads" \ + --genomeDir "$genome_dir" \ + --readFilesIn "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" \ + --runRNGseed 100 \ + --outFileNamePrefix "$mapping_dir/" \ + --readFilesCommand "gzip -d -k -c" \ + --clip3pAdapterSeq "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" \ + --outFilterMatchNmin "25" \ + --quantTranscriptomeBan "Singleend" # Prohibit mapping of one side of the read + # chown to current user before removing mapping dir + docker run --rm -i -v "`pwd`/$OUT:`pwd`/$OUT" -w `pwd` bdgenomics/rhapsody:1.10.1 \ + chown "$(id -u):$(id -g)" --silent --recursive "$mapping_dir/" + + echo "samtools" + samtools view -F 260 "$mapping_dir/Aligned.out.sam" > "$mapping_dir/primary_aligned_reads.sam" + echo "cut" + cut -f 1 "$mapping_dir/primary_aligned_reads.sam" | sort | uniq > "$mapping_dir/mapped_reads.txt" + head -500000 "$mapping_dir/mapped_reads.txt" > "$mapping_dir/mapped_reads_subset.txt" + echo "seqkit" + seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R1_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" + seqkit grep --threads "$n_threads" -f "$mapping_dir/mapped_reads_subset.txt" "$tar_dir/12WTA_S1_L432_R2_001.fastq.gz" > "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" + + # rm -r "$mapping_dir" + # rm -r "$genome_dir" +fi + +# subsample other files +smk_r1_file="$raw_dir/12SMK_S1_L432_R1_001_subset.fastq.gz" +if [[ ! -f "$smk_r1_file" ]]; then + echo "> Processing `basename $smk_r1_file`" + seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R1_001.fastq.gz" | gzip > "$smk_r1_file" +fi +smk_r2_file="$raw_dir/12SMK_S1_L432_R2_001_subset.fastq.gz" +if [[ ! -f "$smk_r2_file" ]]; then + echo "> Processing `basename $smk_r2_file`" + seqkit head -n 500000 "$tar_dir/12SMK_S1_L432_R2_001.fastq.gz" | gzip > "$smk_r2_file" +fi +abc_r1_file="$raw_dir/12ABC_S1_L432_R1_001_subset.fastq.gz" +if [[ ! -f "$abc_r1_file" ]]; then + echo "> Processing `basename $abc_r1_file`" + seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R1_001.fastq.gz" | gzip > "$abc_r1_file" +fi +abc_r2_file="$raw_dir/12ABC_S1_L432_R2_001_subset.fastq.gz" +if [[ ! -f "$abc_r2_file" ]]; then + echo "> Processing `basename $abc_r2_file`" + seqkit head -n 500000 "$tar_dir/12ABC_S1_L432_R2_001.fastq.gz" | gzip > "$abc_r2_file" +fi +wta_r1_file="$raw_dir/12WTA_S1_L432_R1_001_subset.fastq.gz" +if [[ ! -f "$wta_r1_file" ]]; then + echo "> Processing `basename $wta_r1_file`" + gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R1_001_chr1.fastq" > "$wta_r1_file" +fi +wta_r2_file="$raw_dir/12WTA_S1_L432_R2_001_subset.fastq.gz" +if [[ ! -f "$wta_r2_file" ]]; then + echo "> Processing `basename $wta_r2_file`" + gzip -9 -k -c "$mapping_dir/12WTA_S1_L432_R2_001_chr1.fastq" > "$wta_r2_file" +fi +# copy immune panel fasta +fasta_file="$raw_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" +if [[ ! -f "$fasta_file" ]]; then + cp "$tar_dir/BDAbSeq_ImmuneDiscoveryPanel.fasta" "$fasta_file" +fi + +genome_tar="$reference_dir/reference_bd_rhapsody.tar.gz" + +nextflow run . \ + -main-script target/nextflow/workflows/ingestion/bd_rhapsody/main.nf \ + -resume \ + -profile docker,mount_temp \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/errorstrat_ignore.config \ + --reads "$wta_r1_file;$wta_r2_file;$abc_r1_file;$abc_r2_file;$smk_r1_file;$smk_r2_file" \ + --reference_archive "$genome_tar" \ + --abseq_reference "$fasta_file" \ + --sample_tags_version "hs" \ + --tag_names "1-Jurkat;2-Ramos;3-THP1" \ + --output_raw "output_raw" \ + --output "output.h5mu" \ + --output_state state.yaml \ + --cell_calling_data "mRNA" \ + --exact_cell_count 4000 \ + --generate_bam true \ + --publish_dir "$OUT/processed" diff --git a/resources_test_scripts/bdrhap_vdj.sh b/resources_test_scripts/bdrhap_vdj.sh new file mode 100755 index 00000000..bdefde5e --- /dev/null +++ b/resources_test_scripts/bdrhap_vdj.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +set -eo pipefail + +# TODO: we should turn this into viash components + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=bdrhap_vdj +OUT=resources_test/$ID +n_threads=30 + +# create raw directory +raw_dir="$OUT/raw" +mkdir -p "$raw_dir" + +# Check whether seqkit is available +if ! command -v seqkit &> /dev/null; then + echo "This script requires seqkit. Please make sure the binary is added to your PATH." + exit 1 +fi + +# download and untar source fastq files +tar_dir="$HOME/.cache/openpipeline/VDJDemo" +if [[ ! -d "$tar_dir" ]]; then + mkdir -p "$tar_dir" + wget "http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/VDJDemo/VDJDemo.tar" -O "$tar_dir.tar" + tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1 + rm "$tar_dir.tar" +fi + +# subset fastq files +for sample_id in RhapVDJDemo-BCR_S1_L001_R1_001 RhapVDJDemo-BCR_S1_L001_R2_001 RhapVDJDemo-mRNA_S5_L001_R1_001 RhapVDJDemo-mRNA_S5_L001_R2_001 RhapVDJDemo-TCR_S3_L001_R1_001 RhapVDJDemo-TCR_S3_L001_R2_001; do + subset_file="$raw_dir/${sample_id}_subset.fastq.gz" + if [[ ! -f "$subset_file" ]]; then + echo "> Processing $sample_id" + seqkit head -n 300000 "$tar_dir/$sample_id.fastq.gz" | gzip > "$subset_file" + fi + unset subset_file +done + +# copy immune panel fasta +fasta_file="$raw_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" +if [[ ! -f "$fasta_file" ]]; then + cp "$tar_dir/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" "$fasta_file" +fi + +# create params file +cat > /tmp/params.yaml << HERE +param_list: +- id: "targeted_vdj" + input: "$raw_dir/RhapVDJDemo-*_S*_L001_R[12]_001_subset.fastq.gz" +mode: targeted +reference: "$fasta_file" +publish_dir: "$OUT/processed" +putative_cell_call: "mRNA" +vdj_version: human +HERE + +# run bd rhapsody pipeline +nextflow \ + run . \ + -main-script src/workflows/ingestion/bd_rhapsody/main.nf \ + -resume \ + -profile docker,mount_temp \ + -with-trace work/trace.txt \ + -params-file /tmp/params.yaml \ + -c src/workflows/utils/labels.config \ + -c src/workflows/utils/errorstrat_ignore.config \ No newline at end of file diff --git a/resources_test_scripts/cellranger_atac_tiny_bcl.sh b/resources_test_scripts/cellranger_atac_tiny_bcl.sh new file mode 100755 index 00000000..8432c27d --- /dev/null +++ b/resources_test_scripts/cellranger_atac_tiny_bcl.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# settings +ID=cellranger_atac_tiny_bcl +OUT="resources_test/$ID/" +DIR="$OUT" +REFERENCE_DIR=resources_test/reference_gencodev41_chr1 + +# create tempdir +MY_TEMP="${VIASH_TEMP:-/tmp}" +TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +viash ns build -q "download_file|cellranger_atac_mkfastq|build_cellranger_arc_reference|cellranger_atac_count" -p docker --setup cb + + +# download bcl data +if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then + mkdir -p "$OUT/bcl" + + # download tar gz + target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-1.0.0.tar.gz \ + --output "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" + + # untar + + tar -xf "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" \ + --strip-components=1 \ + -C "$OUT/bcl" + + # remove tar + rm "${OUT}/bcl/cellranger-atac-tiny-bcl-1.0.0.tar.gz" + + # Download the layout file. It contains info about the samples (1 in this case) and lanes + target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-simple-1.0.0.csv \ + --output "${OUT}/bcl/layout.csv" + + # download sample sheet + target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/supp/cell-atac/cellranger-atac-tiny-bcl-samplesheet-1.0.0.csv \ + --output "${OUT}/bcl/sample_sheet.csv" +fi + +if [ ! -d "${OUT}/fastqs" ]; then + mkdir -p "$OUT/fastqs" + + target/docker/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq \ + --input "${OUT}/bcl" \ + --csv "${OUT}/bcl/layout.csv" \ + --output "${OUT}/fastqs" +fi + +# Create count matrices +if [ ! -d "${OUT}/counts" ]; then + mkdir -p "$OUT/counts" + + target/docker/mapping/cellranger_atac_count/cellranger_atac_count \ + --input "${OUT}/fastqs/HJN3KBCX2/test_sample/" \ + --reference "${REFERENCE_DIR}/reference_cellranger.tar.gz" \ + --output "${OUT}/counts" +fi diff --git a/resources_test_scripts/cellranger_tiny_bcl.sh b/resources_test_scripts/cellranger_tiny_bcl.sh new file mode 100755 index 00000000..1faba2c9 --- /dev/null +++ b/resources_test_scripts/cellranger_tiny_bcl.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# settings +ID=cellranger_tiny_bcl +OUT="resources_test/$ID/" +DIR="$OUT" + +# create tempdir +MY_TEMP="${VIASH_TEMP:-/tmp}" +TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# download bcl data +if [ ! -f "${OUT}/bcl/sample_sheet.csv" ]; then + mkdir -p "$OUT/bcl" + + # download tar gz + target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz \ + --output "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" + + # untar + tar -xf "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" \ + --strip-components=1 \ + -C "$OUT/bcl" + + # remove tar + rm "${OUT}/bcl/cellranger-tiny-bcl-1.2.0.tar.gz" + + # download sample sheet + target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv \ + --output "${OUT}/bcl/sample_sheet.csv" +fi + +if [ ! -d "${OUT}/fastqs" ]; then + mkdir -p "$OUT/fastqs" + + target/docker/demux/cellranger_mkfastq/cellranger_mkfastq \ + --input "${OUT}/bcl" \ + --sample_sheet "${OUT}/bcl/sample_sheet.csv" \ + --output "${OUT}/fastqs" +fi + +# bcl-convert requires a v2 sample sheet +# bcl-convert is a bit more strict concerning filter files being present or not. +# We make a copy and make the necessary adaptations. + +# We are using the tiny bcl dataset provided by Illumina: +# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq +# Unfortunately, +# 1. the sample sheet delivered with it does not work with bcl-convert (v1 of the format) +# 2. 2 filter files are missing from the run directory that bcl-convert requires to run +# +# We worked around this by +# 1. Manually editing a sample sheet file suited for bcl-convert (format v2) +# 2. Adding a filter file +# +# The filter file is a binary file, we just created an empty file use that. +# bcl-convert might complain about it, but at least something is written out. +# An alternative is to use a filter file from a different project. This also generates +# a warning, but the fastq ouput files contain reads. The drawback is that those filter files +# are generally above 100MB in size. +# +# TODO: Check if a (binary) filter file can be generated that is small but works. + +if [ ! -f "${OUT}/bcl2/sample_sheet.csv" ]; then + mkdir "${OUT}/bcl2/" + cp -r ${OUT}/bcl/* "${OUT}/bcl2/" + cat > "${OUT}/bcl2/sample_sheet.csv" << HERE +[Header],,,,,,,,, +FileFormatVersion,2,,,,,, +RunName,hiseq_test,,,,,, +InstrumentPlatform,NextSeq,,,,,, +IndexOrientation,Forward,,,,,, +,,,,,,,,, +[Reads],,,,,,,,, +Read1Cycles,26,,,,,,,,, +Read2Cycles,98,,,,,, +,,,,,,,,, +[Sequencing_Settings],,,,,,, +,,,,,,, +[BCLConvert_Settings],,,,,,, +SoftwareVersion,3.8.4,,,,,, +NoLaneSplitting,true,,,,,, +FastqCompressionFormat,gzip,,,,,, +,,,,,,,,, +[BCLConvert_Data],,,,,,, +Sample_ID,index,,,,,, +s1,GGTTTACT,,,,,, +,,,,,,, +[Cloud_Settings],,,,,,, +GeneratedVersion,1.3.0.202111171923,,,,,, +,,,,,,, +[Cloud_Data],,,,,,, +Sample_ID,ProjectName,LibraryName,LibraryPrepKitName,IndexAdapterKitName,I7_Index_ID,Sample_Name,Description,Instrument,Type +s1,p1,s1_SI-P03-C9,,,IDT01,SI-P03-C9,s1,NextSeq,HighOutput_75cycles +HERE + + touch "${OUT}/bcl2/Data/Intensities/BaseCalls/L001/s_1_1101.filter" +fi diff --git a/resources_test_scripts/cellranger_tiny_fastq.sh b/resources_test_scripts/cellranger_tiny_fastq.sh new file mode 100755 index 00000000..07b12732 --- /dev/null +++ b/resources_test_scripts/cellranger_tiny_fastq.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# settings +ID=cellranger_tiny_fastq +OUT="resources_test/$ID/" +DIR="$OUT" + +# download cellranger tar gz +cellranger_tar_gz="${OUT}/temp_cellranger-6.1.2.tar.gz" +if [ ! -f "$cellranger_tar_gz" ]; then + echo "Download Cell Ranger 6.1.2 manually first!" + exit 1 +fi + +# untar fastqs +cellranger_tiny_fastq="${OUT}/cellranger_tiny_fastq" +if [ ! -f "${cellranger_tiny_fastq}/tinygex_S1_L001_R1_001.fastq.gz" ]; then + mkdir -p "$cellranger_tiny_fastq" + + tar -xzf "$cellranger_tar_gz" \ + -C "$cellranger_tiny_fastq" \ + "cellranger-6.1.2/external/cellranger_tiny_fastq" \ + --strip-components=3 +fi + +# untar ref +cellranger_tiny_ref="${OUT}/cellranger_tiny_ref" +if [ ! -f "${cellranger_tiny_ref}/reference.json" ]; then + mkdir -p "$cellranger_tiny_ref" + + tar -xzf "$cellranger_tar_gz" \ + -C "$cellranger_tiny_ref" \ + "cellranger-6.1.2/external/cellranger_tiny_ref" \ + --strip-components=3 +fi + +# Create ref with more recent STAR version +recent_ref_dir="${OUT}/cellranger_tiny_ref_v2_7_10_a" +if [ ! -f "${recent_ref_dir}/Genome" ]; then + mkdir -p "${recent_ref_dir}" + + target/docker/mapping/star_build_reference/star_build_reference \ + --genome_fasta "$cellranger_tiny_ref/fasta/genome.fa" \ + --output "$recent_ref_dir" \ + --genomeSAindexNbases 7 \ + --transcriptome_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz" +fi + +# run cellranger count +bam_dir="${OUT}/bam" +if [ ! -f "$bam_dir/possorted_genome_bam.bam" ]; then + mkdir -p "$bam_dir" + + viash run src/mapping/cellranger_count/config.vsh.yaml -- \ + --input "$cellranger_tiny_fastq" \ + --reference "$cellranger_tiny_ref" \ + --output "$bam_dir" +fi + +# convert to h5mu +raw_h5mu="${OUT}/raw_dataset.h5mu" +if [ ! -f "$step1_h5mu" ]; then + viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \ + --input "${bam_dir}/raw_feature_bc_matrix.h5" \ + --output "$raw_h5mu" +fi + +# run velocyto +velo_gtf="$cellranger_tiny_ref/genes/genes.gtf.gz" +velo_bam="$bam_dir/possorted_genome_bam.bam" +velo_loom="${OUT}/velocyto.loom" +if [ ! -f "$velo_loom" ]; then + viash run src/velocity/velocyto/config.vsh.yaml -- \ + --input "$velo_bam" \ + --output "$velo_loom" \ + --transcriptome "$velo_gtf" +fi + +# combine raw counts with velocyto data +dataset_h5mu="${OUT}/dataset.h5mu" +if [ ! -f "$dataset_h5mu" ]; then + viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \ + --input_loom "$velo_loom" \ + --input_h5mu "$raw_h5mu" \ + --output "$dataset_h5mu" +fi + +# run htseq +htseq_counts="${OUT}/htseq_counts.tsv" +if [ ! -f "$htseq_counts" ]; then + viash run src/mapping/htseq_count/config.vsh.yaml -- \ + --input "$velo_bam" \ + --reference "$velo_gtf" \ + --output "$htseq_counts" +fi + +multi_star="${OUT}/multi_star" +if [ ! -d "$multi_star" ]; then + viash run src/mapping/multi_star/config.vsh.yaml -- \ + --input_id "tinygex" \ + --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz" \ + --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz" \ + --input_id "tinygex" \ + --input_r1 "$cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz" \ + --input_r2 "$cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz" \ + --reference_index "$recent_ref_dir" \ + --reference_gtf "$cellranger_tiny_ref/genes/genes.gtf.gz" \ + --output "$multi_star" \ + ---cpus 30 +fi \ No newline at end of file diff --git a/resources_test_scripts/concat_test_data.sh b/resources_test_scripts/concat_test_data.sh new file mode 100755 index 00000000..4d63b17a --- /dev/null +++ b/resources_test_scripts/concat_test_data.sh @@ -0,0 +1,72 @@ +#!/bin/bash + + + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# The output folder +OUT="resources_test/concat_test_data/" + +# create it if it doesn't exist already +[ -d "$OUT" ] || mkdir -p "$OUT" + +echo "> Downloading files" +target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/e18_mouse_brain_fresh_5k/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5 \ + --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" + +target/docker/download/download_file/download_file \ + --input https://cf.10xgenomics.com/samples/cell-arc/1.0.0/human_brain_3k/human_brain_3k_filtered_feature_bc_matrix.h5 \ + --output "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5" + +echo "> Converting to h5mu" +viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \ + --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \ + --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" + +viash run src/convert/from_10xh5_to_h5mu/config.vsh.yaml -- \ + --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5" \ + --output "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu" + +echo "> Subsetting datasets" +viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \ + --input "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5mu" \ + --output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \ + --number_of_observations 2000 + +viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \ + --input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \ + --output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \ + --number_of_observations 2000 + +echo "Making observation ids unique (required for concat component to function)" +viash run src/metadata/add_id/config.vsh.yaml -- \ +--input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \ +--output "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \ +--input_id "human" \ +--make_observation_keys_unique + +viash run src/metadata/add_id/config.vsh.yaml -- \ +--input "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \ +--output "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \ +--input_id "mouse" \ +--make_observation_keys_unique + +echo "Removing temp files" +rm "${OUT}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5mu" \ + "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix.h5" \ + "$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu" \ + "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu" \ + "${OUT}/human_brain_3k_filtered_feature_bc_matrix.h5mu" \ + "$OUT/human_brain_3k_filtered_feature_bc_matrix.h5" + + +echo "> Running concat component" +viash run src/dataflow/concat/config.vsh.yaml -- \ + --input "$OUT/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu,$OUT/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" \ + --input_id "human,mouse" \ + --output "$OUT/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu" diff --git a/resources_test_scripts/demuxafy_test_data.sh b/resources_test_scripts/demuxafy_test_data.sh new file mode 100755 index 00000000..515eeb7c --- /dev/null +++ b/resources_test_scripts/demuxafy_test_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -eo pipefail + + +# settings +ID=demuxafy_test_data +OUT=resources_test/$ID +DIR="$OUT" + +mkdir -p "$OUT" +cd "$OUT" +# download demuxafy test dataset +wget https://www.dropbox.com/s/m8u61jn4i1mcktp/TestData4PipelineSmall.tar.gz +tar -xf TestData4PipelineSmall.tar.gz +# bam and vcf file +cp TestData4PipelineSmall/test_dataset.vcf . +# extract chr from vcf file +grep -w '^#\|^#CHROM\|^[1-2]' test_dataset.vcf > test_dataset_chr1_2.vcf +grep -w '^#\|^#CHROM\|^[3-4]' test_dataset.vcf > test_dataset_chr3_4.vcf + +# barcode list +cp TestData4PipelineSmall/test_dataset/outs/filtered_gene_bc_matrices/Homo_sapiens_GRCh38p10/barcodes.tsv . + +# subsetted bam and bai for souporcell +wget https://www.dropbox.com/s/7ew5lt0msf4z5gj/chr_1_pooled.sorted.bam +wget https://www.dropbox.com/s/tpplbj9sab9b2p4/chr_1_pooled.sorted.bam.bai + +# variants from mixed sample +wget https://www.dropbox.com/s/btir7ge4kzc7tu1/mixed_variant.vcf + +# dsc_pileup output +wget https://www.dropbox.com/s/17hj9i0yavtezx1/dsc_pileup.zip +unzip dsc_pileup.zip + +# subsetted human genome reference +wget https://www.dropbox.com/s/ynlce3g7nwxthwg/genome_chr1.fa + +# remove unnecessary files +rm -rf TestData4PipelineSmall +rm TestData4PipelineSmall.tar.gz +rm dsc_pileup.zip \ No newline at end of file diff --git a/resources_test_scripts/hlca_reference_model.sh b/resources_test_scripts/hlca_reference_model.sh new file mode 100755 index 00000000..ac8955f0 --- /dev/null +++ b/resources_test_scripts/hlca_reference_model.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +ID=HLCA_reference_model +OUT=resources_test/$ID/$ID +DIR=$(dirname "$OUT") + +# ideally, this would be a versioned pipeline run +[ -d "$DIR" ] || mkdir -p "$DIR" + +# download and unarchive pre-trained scANVI model +wget https://zenodo.org/record/6337966/files/HLCA_reference_model.zip \ + -O "${OUT}.zip" + +# # Test query data +# # Source publication: Delorey, Toni M., et al. “COVID-19 tissue atlases reveal SARS-CoV-2 pathology and cellular targets.” Nature 595.7865 (2021): 107-113. +# wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5230nnn/GSM5230027/suppl/GSM5230027_04-P103142-S149-R01_raw_feature_bc_matrix.h5.gz \ +# -O "${OUT}_query_test.h5.gz" +# gzip -d "${OUT}_query_test.h5.gz" + +# # Prepare test data as in scvi-tools tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/query_hlca_knn.html +# python < "$motifs_in" +fi + +# Change motif headers so the human-readable motif name precedes the motif +# identifier. So ">MA0004.1 Arnt" -> ">Arnt_MA0004.1". +motifs_modified="${OUT}/$(basename "$motifs_in").modified" +awk '{ + if ( substr($1, 1, 1) == ">" ) { + print ">" $2 "_" substr($1,2) + } else { + print + } +}' "$motifs_in" > "$motifs_modified" + + +cat > /tmp/params.yaml << HERE +param_list: + - id: "$ID" + genome_fasta: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" + transcriptome_gtf: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" + target: ["bd_rhapsody", "cellranger_arc"] + output_fasta: "reference.fa.gz" + output_gtf: "reference.gtf.gz" + non_nuclear_contigs: null + output_cellranger_arc: "reference_cellranger.tar.gz" + output_bd_rhapsody: "reference_bd_rhapsody.tar.gz" + bdrhap_extra_star_params: "--genomeSAindexNbases 12 --genomeSAsparseD 2" + motifs_file: "$motifs_modified" + subset_regex: "chr1" +HERE + +nextflow \ + run . \ + -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \ + -profile docker \ + -c ./src/workflows/utils/labels_ci.config \ + -params-file /tmp/params.yaml \ + --publish_dir $OUT \ + -resume diff --git a/resources_test_scripts/remote_param_list.sh b/resources_test_scripts/remote_param_list.sh new file mode 100755 index 00000000..dde10f5e --- /dev/null +++ b/resources_test_scripts/remote_param_list.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +mkdir -p "resources_test/remote_param_list/" +OUT=resources_test/remote_param_list/test_param_list.yaml +OUT_CSV=resources_test/remote_param_list/test_param_list.csv +OUT_JSON=resources_test/remote_param_list/test_param_list.json + +cat > $OUT << HERE +- id: "mouse" + input: s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + publish_dir: "foo_remote/" + rna_min_counts: 2 + prot_min_counts: 3 +- id: "human" + input: s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + publish_dir: "foo_remote/" + rna_min_counts: 2 + prot_min_counts: 3 +HERE + +cat > $OUT_CSV << EOF +"id","input","publish_dir","rna_min_counts","prot_min_counts" +"mouse","s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3" +"human","s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu","foo_remote/","2","3" +EOF + +cat > $OUT_JSON << HERE +[ + { + "id": "mouse", + "input": "s3://openpipelines-data/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + "publish_dir": "foo_remote/", + "rna_min_counts": 2, + "prot_min_counts": 3 + }, + { + "id": "human", + "input": "s3://openpipelines-data/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + "publish_dir": "foo_remote/", + "rna_min_counts": 2, + "prot_min_counts": 3 + } +] +HERE \ No newline at end of file diff --git a/resources_test_scripts/rna_velocity.sh b/resources_test_scripts/rna_velocity.sh new file mode 100755 index 00000000..9998bc4f --- /dev/null +++ b/resources_test_scripts/rna_velocity.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=rna_velocity +OUT=resources_test/$ID + + +# create raw directory +velocyto_dir="$OUT/velocyto" +mkdir -p "$velocyto_dir" + +######################################################## +# Create a compatible BAM file from BD Rhapsody Output # +######################################################## + +bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/output_raw/Combined_sample_Bioproduct.bam" + +if [[ ! -f "$bd_rhap_wta_bam" ]]; then + echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first." + exit 1 +fi + +echo "> Converting BD Rhapsody barcode tags." +viash run src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml -- \ + -i "$bd_rhap_wta_bam" \ + -o "$velocyto_dir/compatible_bd_input.bam" \ + --bam \ + -t 4 + +echo "> Creating barcodes file." +samtools view -@4 "$velocyto_dir/compatible_bd_input.bam" | \ + grep -oP "(?<=CB:Z:)\S+" | sort | uniq | head > "$velocyto_dir/barcodes.txt" + +########################################################### +# Process Tiny Fast Fastq dataset from 10X to create # +# input data for convert/from_velocyto_to_h5mu compontent # +########################################################### + +mkdir "$OUT/velocyto_processed" + +gtf="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +bam="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" + +echo "> Processing 10x dataset" +viash run src/velocity/velocyto/config.vsh.yaml -- \ + -i "$bam" \ + -o "$OUT/velocyto_processed/cellranger_tiny.loom" \ + --transcriptome "$gtf" + +echo "> Converting loom file to MuData object" +viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \ + --input_loom "$OUT/velocyto_processed/cellranger_tiny.loom" \ + --input_h5mu "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu" \ + --modality velocyto \ + --output_compression "gzip" \ + --output "$OUT/velocyto_processed/velocyto.h5mu" \ No newline at end of file diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh new file mode 100755 index 00000000..b8bf34f4 --- /dev/null +++ b/resources_test_scripts/scgpt.sh @@ -0,0 +1,135 @@ +set -eo pipefail + +# ensure that the command below is run from the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +# settings +ID=scgpt +OUT=resources_test/$ID + +# create foundational model directory +foundation_model_dir="$OUT/source" +mkdir -p "$foundation_model_dir" +export foundation_model_dir + +# create finetuned model directory +finetuned_model_dir="$OUT/finetuned_model" +mkdir -p "$finetuned_model_dir" +export finetuned_model_dir + +# install gdown if necessary +# Check whether gdown is available +if ! command -v gdown &> /dev/null; then + echo "This script requires gdown. Please make sure the binary is added to your PATH." + exit 1 +fi + +# install torch if necessary +# Check whether torch is available +if ! python -c "import torch"; then + echo "This script requires torch. Please make sure it is available in your python environment." + exit 1 +fi + +echo "> Downloading scGPT foundation model (full_human)" +# download foundational model files (full_human) +# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y +gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json" +gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json" +gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt" + +echo "> Converting to finetuned model format" +python < Downloading test resources" +# download test data +# https://drive.google.com/file/d/1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL/view?usp=drive_link +gdown '1z_0vWYMhRuRiD1EyhuFtY9ReIR0msWaL' -O "${test_resources_dir}/Kim2020_Lung.h5ad" + +echo "> Converting to h5mu" +python < Subsetting datasets" +viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung.h5mu" \ + --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ + --number_of_observations 4000 + +rm "${test_resources_dir}/Kim2020_Lung.h5ad" +rm "${test_resources_dir}/Kim2020_Lung.h5mu" + +echo "> Preprocessing datasets" +nextflow \ + run . \ + -main-script target/nextflow/workflows/multiomics/process_samples/main.nf \ + -profile docker \ + -c src/workflows/utils/labels_ci.config \ + --input "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ + --output "Kim2020_Lung_subset_preprocessed.h5mu" \ + --publish_dir "${test_resources_dir}" + +echo "> Filtering highly variable features" +viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \ + --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ + --layer "log_normalized" \ + --var_name_filter "scgpt_filter_with_hvg" \ + --n_top_features 1200 \ + --flavor "cell_ranger" + +echo "> Running scGPT cross check genes" +viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ + --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ + --vocab_file "${foundation_model_dir}/vocab.json" \ + --var_input "scgpt_filter_with_hvg" \ + --output_var_filter "scgpt_cross_checked_genes" + +echo "> Running scGPT binning" +viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ + --input_layer "log_normalized" \ + --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ + --output_obsm_binned_counts "binned_counts" \ + --var_input "scgpt_cross_checked_genes" + +echo "> Running scGPT tokenizing" +viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ + --input_obsm_binned_counts "binned_counts" \ + --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ + --model_vocab "${foundation_model_dir}/vocab.json" \ + --var_input "scgpt_cross_checked_genes" \ + + +echo "> Removing unnecessary files in test resources dir" +find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete + +echo "> scGPT test resources are ready!" diff --git a/resources_test_scripts/vireo_test_data.sh b/resources_test_scripts/vireo_test_data.sh new file mode 100755 index 00000000..ee412878 --- /dev/null +++ b/resources_test_scripts/vireo_test_data.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -eo pipefail + + +# settings +ID=vireo_test_data +OUT=resources_test/$ID +DIR="$OUT" + +mkdir -p "$OUT" +cd "$OUT" +# download vireo tutorial dataset +wget https://github.com/single-cell-genetics/vireo/raw/master/data/cells.cellSNP.vcf.gz diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 00000000..0b94ce24 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,43 @@ +# Exclude a variety of commonly ignored directories. +exclude = [ + ".git", + ".pyenv", + ".pytest_cache", + ".ruff_cache", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "build", + "dist", + "node_modules", + "site-packages", +] + +builtins = ["meta"] + + + + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +[lint.flake8-pytest-style] +fixture-parentheses = false +mark-parentheses = false + +[lint] +ignore = [ + # module level import not at top of file + "E402" +] \ No newline at end of file diff --git a/src/annotate/celltypist/config.vsh.yaml b/src/annotate/celltypist/config.vsh.yaml new file mode 100644 index 00000000..ccb8d9fc --- /dev/null +++ b/src/annotate/celltypist/config.vsh.yaml @@ -0,0 +1,161 @@ +name: celltypist +namespace: annotate +scope: "public" +description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: false + - name: "--reference_layer" + type: string + description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_obs_target" + type: string + description: The name of the adata obs column in the reference data containing cell type annotations. + default: "cell_ontology_class" + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing highly variable genes. By default, do not subset genes. + + - name: Model arguments + description: Model arguments. + arguments: + - name: "--model" + type: file + description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided." + required: false + example: pretrained_model.pkl + - name: "--feature_selection" + type: boolean + description: "Whether to perform feature selection." + default: false + - name: "--majority_voting" + type: boolean + description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering." + default: false + - name: "--C" + type: double + description: "Inverse of regularization strength in logistic regression." + default: 1.0 + - name: "--max_iter" + type: integer + description: "Maximum number of iterations before reaching the minimum of the cost function." + default: 1000 + - name: "--use_SGD" + type: boolean_true + description: "Whether to use the stochastic gradient descent algorithm." + - name: "--min_prop" + type: double + description: | + "For the dominant cell type within a subcluster, the minimum proportion of cells required to + support naming of the subcluster by this cell type. Ignored if majority_voting is set to False. + Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'." + default: 0 + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obs_predictions" + type: string + default: celltypist_pred + required: false + description: | + In which `.obs` slots to store the predicted information. + - name: "--output_obs_probability" + type: string + default: celltypist_probability + required: false + description: | + In which `.obs` slots to store the probability of the predictions. + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/cross_check_genes.py + - path: /src/utils/subset_vars.py + - path: /src/utils/set_var_index.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - libhdf5-dev + - procps + - type: python + __merge__: [ /src/base/requirements/scanpy.yaml, .] + - type: python + packages: + - celltypist==1.6.3 + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, highmem, highdisk] diff --git a/src/annotate/celltypist/script.py b/src/annotate/celltypist/script.py new file mode 100644 index 00000000..e3efb749 --- /dev/null +++ b/src/annotate/celltypist/script.py @@ -0,0 +1,147 @@ +import sys +import celltypist +import mudata as mu +import anndata as ad +import pandas as pd +import numpy as np + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "output.h5mu", + "modality": "rna", + # "reference": None, + "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "model": None, + # "model": "resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl", + "input_layer": "log_normalized", + "reference_layer": "log_normalized", + "input_reference_gene_overlap": 100, + "reference_obs_target": "cell_ontology_class", + "reference_var_input": None, + "check_expression": False, + "feature_selection": True, + "majority_voting": True, + "output_compression": "gzip", + "input_var_gene_names": None, + "reference_var_gene_names": "ensemblid", + "C": 1.0, + "max_iter": 1000, + "use_SGD": False, + "min_prop": 0, + "output_obs_predictions": "celltypist_pred", + "output_obs_probability": "celltypist_probability", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def check_celltypist_format(indata): + if np.abs(np.expm1(indata[0]).sum() - 10000) > 1: + return False + return True + + +def main(par): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Provide correct format of query data for celltypist annotation + ## Sanitize gene names and set as index + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + ## Fetch lognormalized counts + lognorm_counts = ( + input_modality.layers[par["input_layer"]].copy() + if par["input_layer"] + else input_modality.X.copy() + ) + ## Create AnnData object + input_modality = ad.AnnData( + X=lognorm_counts, var=pd.DataFrame(index=input_modality.var.index) + ) + + if par["model"]: + logger.info("Loading CellTypist model") + model = celltypist.models.Model.load(par["model"]) + cross_check_genes( + input_modality.var.index, + model.features, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + elif par["reference"]: + reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]] + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + # CellTypist requires query gene names to be in index + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # Ensure enough overlap between genes in query and reference + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + + labels = reference_modality.obs[par["reference_obs_target"]] + + logger.info("Training CellTypist model on reference") + model = celltypist.train( + reference_matrix, + labels=labels, + genes=reference_modality.var.index, + C=par["C"], + max_iter=par["max_iter"], + use_SGD=par["use_SGD"], + feature_selection=par["feature_selection"], + check_expression=True, + ) + + logger.info("Predicting CellTypist annotations") + predictions = celltypist.annotate( + input_modality, model, majority_voting=par["majority_voting"] + ) + + input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[ + "predicted_labels" + ].values + input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max( + axis=1 + ).values + + # copy observations back to input data (with full set of features) + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par) diff --git a/src/annotate/celltypist/test.py b/src/annotate/celltypist/test.py new file mode 100644 index 00000000..60704b3c --- /dev/null +++ b/src/annotate/celltypist/test.py @@ -0,0 +1,207 @@ +import sys +import os +import pytest +import subprocess +import re +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu" +model_file = ( + f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl" +) +celltypist_input_file = ( + f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu" +) + + +def test_simple_execution(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_layer", + "log_normalized", + "--reference", + reference_file, + "--reference_layer", + "log_normalized", + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert {"celltypist_pred", "celltypist_probability"}.issubset( + output_mudata.mod["rna"].obs.keys() + ), "Required keys not found in .obs" + + predictions = output_mudata.mod["rna"].obs["celltypist_pred"] + assert predictions.dtype == "category", ( + "Calculated predictions should be category dtype" + ) + assert not all(predictions.isna()), "Not all predictions should be NA" + + probabilities = output_mudata.mod["rna"].obs["celltypist_probability"] + assert probabilities.dtype == "float", ( + "Calculated probabilities should be float dtype" + ) + assert all(0 <= value <= 1 for value in probabilities), ( + ".obs at celltypist_probability has values outside the range [0, 1]" + ) + + +def test_set_params(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_layer", + "log_normalized", + "--reference", + reference_file, + "--reference_layer", + "log_normalized", + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--feature_selection", + "True", + "--majority_voting", + "True", + "--C", + "0.5", + "--max_iter", + "100", + "--use_SGD", + "--min_prop", + "0.1", + "--output", + output_file, + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert {"celltypist_pred", "celltypist_probability"}.issubset( + output_mudata.mod["rna"].obs.keys() + ), "Required keys not found in .obs" + + obs_values = output_mudata.mod["rna"].obs["celltypist_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + ".obs at celltypist_probability has values outside the range [0, 1]" + ) + + +def test_with_model(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + celltypist_input_file, + "--model", + model_file, + "--reference_obs_targets", + "cell_type", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + output_mudata = mu.read_h5mu(output_file) + + assert {"celltypist_pred", "celltypist_probability"}.issubset( + output_mudata.mod["rna"].obs.keys() + ), "Required keys not found in .obs" + + predictions = output_mudata.mod["rna"].obs["celltypist_pred"] + assert predictions.dtype == "category", ( + "Calculated predictions should be category dtype" + ) + assert not all(predictions.isna()), "Not all predictions should be NA" + + probabilities = output_mudata.mod["rna"].obs["celltypist_probability"] + assert probabilities.dtype == "float", ( + "Calculated probabilities should be float dtype" + ) + assert all(0 <= value <= 1 for value in probabilities), ( + ".obs at celltypist_probability has values outside the range [0, 1]" + ) + + +def test_fail_invalid_input_expression(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + # fails because input data are not lognormalized + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) + assert re.search( + r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell", + err.value.stdout.decode("utf-8"), + ) + + # fails because reference data are not lognormalized + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--layer", + "log_normalized", + "--reference", + reference_file, + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) + assert re.search( + r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/onclass/config.vsh.yaml b/src/annotate/onclass/config.vsh.yaml new file mode 100644 index 00000000..ff61b11c --- /dev/null +++ b/src/annotate/onclass/config.vsh.yaml @@ -0,0 +1,166 @@ +name: onclass +namespace: annotate +scope: "public" +description: | + OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. + These similarities enable OnClass to annotate cell types that are never seen in the training data. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: The layer in the input data to be used for cell type annotation if .X is not to be used. + required: false + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + + - name: Ontology + description: Ontology input files + arguments: + - name: "--cl_nlp_emb_file" + type: file + description: The .nlp.emb file with the cell type embeddings. + required: true + - name: "--cl_ontology_file" + type: file + description: The .ontology file with the cell type ontology. + required: true + - name: "--cl_obo_file" + type: file + description: The .obo file with the cell type ontology. + required: true + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: false + - name: "--reference_layer" + type: string + description: The layer in the reference data to be used for cell type annotation if .X is not to be used. + required: false + - name: "--reference_obs_target" + type: string + description: The name of the adata obs column in the reference data containing cell type annotations. + example: "cell_ontology_class" + required: true + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing highly variable genes. By default, do not subset genes. + - name: "--unknown_celltype" + type: string + default: "Unknown" + required: false + description: | + Label for unknown cell types. + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obs_predictions" + type: string + default: onclass_pred + required: false + description: | + In which `.obs` slots to store the predicted information. + - name: "--output_obs_probability" + type: string + default: onclass_prob + required: false + description: | + In which `.obs` slots to store the probability of the predictions. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Model arguments + description: Model arguments + arguments: + - name: "--model" + type: string + description: | + "Pretrained model path without a file extension. If not provided, the model will be trained + on the reference data and --reference should be provided. The path namespace should contain: + - a .npz or .pkl file + - a .data file + - a .meta file + - a .index file + e.g. /path/to/model/pretrained_model_target1 as saved by OnClass." + required: false + direction: input + - name: "--max_iter" + type: integer + default: 30 + required: false + description: Maximum number of iterations for training the model. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/cross_check_genes.py + - path: /src/utils/subset_vars.py + - path: /src/utils/set_var_index.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.11 + setup: + - type: python + packages: + - OnClass~=1.3 + - tensorflow + - obonet + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, highmem, highdisk] diff --git a/src/annotate/onclass/script.py b/src/annotate/onclass/script.py new file mode 100644 index 00000000..0306c988 --- /dev/null +++ b/src/annotate/onclass/script.py @@ -0,0 +1,214 @@ +import sys +import mudata as mu +import numpy as np +from OnClass.OnClassModel import OnClassModel +import obonet +from typing import Dict, List, Tuple + + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "output.h5mu", + "modality": "rna", + "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "model": None, + # "reference": None, + # "model": "resources_test/annotation_test_data/onclass_model/example_file_model", + "reference_obs_target": "cell_ontology_class", + "input_layer": None, + "reference_layer": None, + "max_iter": 100, + "output_obs_predictions": None, + "output_obs_probability": None, + "cl_nlp_emb_file": "resources_test/annotation_test_data/ontology/cl.ontology.nlp.emb", + "cl_ontology_file": "resources_test/annotation_test_data/ontology/cl.ontology", + "cl_obo_file": "resources_test/annotation_test_data/ontology/cl.obo", + "output_compression": "gzip", + "input_var_gene_names": "gene_symbol", + "input_reference_gene_overlap": 100, + "reference_var_input": None, + "reference_var_gene_names": None, + "unkown_celltype": "Unknown", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def map_celltype_to_ontology_id( + cl_obo_file: str, +) -> Tuple[Dict[str, str], Dict[str, str]]: + """ + Map cell type names to ontology IDs and vice versa. + + Parameters + ---------- + cl_obo_file : str + Path to the cell ontology file. + + Returns + ------- + Tuple[Dict[str, str], Dict[str, str]] + A tuple of two dictionaries. The first dictionary maps cell ontology IDs to cell type names. + The second dictionary maps cell type names to cell ontology IDs. + """ + graph = obonet.read_obo(cl_obo_file) + cl_id_to_name = {id_: data.get("name") for id_, data in graph.nodes(data=True)} + cl_id_to_name = {k: v for k, v in cl_id_to_name.items() if v is not None} + name_to_cl_id = {v: k for k, v in cl_id_to_name.items()} + return cl_id_to_name, name_to_cl_id + + +def cell_type_prediction( + model: OnClassModel, + input_matrix: np.array, + input_features: List[str], + id_to_name: dict, +) -> Tuple[List[str], List[float]]: + """ + Predict cell types for input data and save results to Anndata obj. + + Parameters + ---------- + model : OnClassModel + The OnClass model. + input_matrix : np.array + The input data matrix. + input_modality : ad.AnnData + The input data Anndata object. + id_to_name : dict + Dictionary mapping cell ontology IDs to cell type names. + + Returns + ------- + predictions: List[str] + The predicted cell types. + probabilities: List[float] + Probabilities of the predicted cell types. + """ + corr_test_feature = model.ProcessTestFeature( + test_feature=input_matrix, + test_genes=input_features, + log_transform=False, + ) + onclass_pred = model.Predict( + corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0 + ) + pred_label = [model.i2co[ind] for ind in onclass_pred[2]] + pred_cell_type_label = [id_to_name[id] for id in pred_label] + prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1) + + return pred_cell_type_label, prob_cell_type_label + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + # Onclass needs dense matrix format + input_matrix = input_matrix.toarray() + + id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"]) + + if par["model"]: + logger.info("Predicting cell types using pre-trained model") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + model.BuildModel(use_pretrain=par["model"], ngene=None) + cross_check_genes( + model.genes, input_modality.var.index, par["input_reference_gene_overlap"] + ) + + elif par["reference"]: + logger.info("Reading reference data") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + # Onclass needs dense matrix format + reference_matrix = reference_matrix.toarray() + + logger.info("Training a model from reference...") + + labels = reference_modality.obs[par["reference_obs_target"]].tolist() + labels_cl = [ + name_to_id[label] if label in name_to_id else par["unknown_celltype"] + for label in labels + ] + + _ = model.EmbedCellTypes(labels_cl) + corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature( + train_feature=reference_matrix, + train_label=labels_cl, + train_genes=reference_modality.var.index, + test_feature=input_matrix, + test_genes=input_modality.var.index, + log_transform=False, + ) + model.BuildModel(ngene=len(corr_train_genes)) + model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"]) + + logger.info("Predicting cell types") + predictions, probabilities = cell_type_prediction( + model, input_matrix, input_modality.var.index, id_to_name + ) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() diff --git a/src/annotate/onclass/test.py b/src/annotate/onclass/test.py new file mode 100644 index 00000000..9212928d --- /dev/null +++ b/src/annotate/onclass/test.py @@ -0,0 +1,179 @@ +import sys +import os +import pytest +import subprocess +import re +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu" +cl_nlp_emb_file = ( + f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb" +) +cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology" +cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo" +model_file = ( + f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model" +) + + +def test_simple_execution(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--cl_nlp_emb_file", + cl_nlp_emb_file, + "--cl_ontology_file", + cl_ontology_file, + "--cl_obo_file", + cl_obo_file, + "--max_iter", + "10", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"] + + obs_values = output_mudata.mod["rna"].obs["onclass_prob"] + assert all(0 <= value <= 1 for value in obs_values), ( + ".obs at cell_ontology_class_prob has values outside the range [0, 1]" + ) + + +def test_custom_obs(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--output_obs_predictions", + "dummy_pred_1", + "--output_obs_probability", + "dummy_prob_1", + "--cl_nlp_emb_file", + cl_nlp_emb_file, + "--cl_ontology_file", + cl_ontology_file, + "--cl_obo_file", + cl_obo_file, + "--max_iter", + "10", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert set(output_mudata.mod["rna"].obs.keys()) == {"dummy_pred_1", "dummy_prob_1"} + + obs_values = output_mudata.mod["rna"].obs["dummy_prob_1"] + assert all(0 <= value <= 1 for value in obs_values), ( + ".obs at dummy_prob_1 has values outside the range [0, 1]" + ) + + +def test_no_model_no_reference_error(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--output", + output_file, + "--cl_nlp_emb_file", + cl_nlp_emb_file, + "--cl_ontology_file", + cl_ontology_file, + "--cl_obo_file", + cl_obo_file, + "--reference_obs_target", + "cell_ontology_class", + ] + ) + assert re.search( + r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", + err.value.stdout.decode("utf-8"), + ) + + +def test_pretrained_model(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--cl_nlp_emb_file", + cl_nlp_emb_file, + "--cl_ontology_file", + cl_ontology_file, + "--cl_obo_file", + cl_obo_file, + "--reference_obs_target", + "cell_ontology_class", + "--model", + model_file, + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"] + + obs_values = output_mudata.mod["rna"].obs["onclass_prob"] + assert all(0 <= value <= 1 for value in obs_values), ( + ".obs at cell_ontology_class_prob has values outside the range [0, 1]" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/popv/config.vsh.yaml b/src/annotate/popv/config.vsh.yaml new file mode 100644 index 00000000..d5346165 --- /dev/null +++ b/src/annotate/popv/config.vsh.yaml @@ -0,0 +1,164 @@ +name: popv +namespace: "annotate" +scope: "public" +description: "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV." +authors: + - __merge__: /src/authors/matthias_beyens.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] +argument_groups: + - name: Inputs + description: Arguments related to the input (aka query) dataset. + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: Input h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`. + required: false + - name: "--input_obs_batch" + type: string + description: Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown. + required: false + - name: "--input_var_subset" + type: string + description: Subset the input object with this column. + required: false + - name: "--input_obs_label" + type: string + description: Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `"unknown_celltype_label"`. + required: false + - name: "--unknown_celltype_label" + type: string + description: If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model. + default: "unknown" + required: false + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "User-provided reference tissue. The data that will be used as reference to call cell types." + example: TS_Bladder_filtered.h5ad + direction: input + required: true + - name: "--reference_layer" + type: string + description: Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`. + required: false + - name: "--reference_obs_label" + type: string + description: Key in obs field of reference AnnData with cell-type information. + default: "cell_ontology_class" + required: false + - name: "--reference_obs_batch" + type: string + description: Key in obs field of input adata for batch information. + default: "donor_assay" + required: false + # - name: "--reference_models" + # type: file + # description: Pretrained models. Can be a directory or a tar gz. + # required: false + # example: pretrained_models_Bladder_ts.tar.gz + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + __merge__: [., /src/base/h5_compression_argument.yaml] + + # - name: "--output_models" + # type: file + # direction: output + # description: If `prediction_mode == "retrain"`, saves models to a directory and compresses the results into a tar gz. + # example: "output.tar.gz" + # required: false + - name: Arguments + description: Other arguments. + arguments: + - name: "--methods" + type: string + description: "Methods to call cell types. By default, runs to knn_on_scvi and scanvi." + example: ["knn_on_scvi", "scanvi"] + choices: [celltypist, knn_on_bbknn, knn_on_scanorama, knn_on_scvi, onclass, rf, scanvi, svm] + required: true + multiple: true + # - name: "--prediction_mode" + # type: string + # description: | + # Execution mode of cell-type annotation. + # "retrain": Train all prediction models and saves them to disk. Argument `output_models` must be defined. + # "inference": Classify all cells based on pretrained models. Argument `reference_models` must be defined. + # "fast": Fast inference using only query cells and single epoch in scArches. + # - name: "--plots" + # type: boolean + # description: "Creation of agreement and frequency plots between selected cell type algorithmn(s) and final PopV ensemble called cell type." + # default: false + # required: false +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + #image: nvcr.io/nvidia/pytorch:22.12-py3 + image: python:3.11-slim + + setup: + - type: docker + env: + # Build extensions without AVX512/AVX2 support + - CFLAGS="-mno-avx512f -mno-avx2" + - CPPFLAGS="-mno-avx512f -mno-avx2" + - type: apt + packages: + - procps + - git + - build-essential + - type: python + packages: + - popv~=0.4.2 + # Previously, scvi-tools < 1.2.2 pinned numpy to <2. We want to keep it pinned here + - numpy<2 + - setuptools + # These need to be updated AFTER popv is installed. + # See https://github.com/YosefLab/PopV/issues/30 + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] + # download ontology required by popv + - type: docker + run: | + cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + # TODO: should add new label highmem-single-gpu and lowmem-single-gpu + label: [highmem, highcpu, highdisk] \ No newline at end of file diff --git a/src/annotate/popv/script.py b/src/annotate/popv/script.py new file mode 100644 index 00000000..9f47fa8d --- /dev/null +++ b/src/annotate/popv/script.py @@ -0,0 +1,227 @@ +import sys +import re +import tempfile +import typing +import numpy as np +import mudata as mu +import anndata as ad +import popv + +# todo: is this still needed? +from torch.cuda import is_available as cuda_is_available + +try: + from torch.backends.mps import is_available as mps_is_available +except ModuleNotFoundError: + # Older pytorch versions + # MacOS GPUs + def mps_is_available(): + return False + + +# where to find the obo files +cl_obo_folder = "/opt/PopV/resources/ontology/" + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + # "input": "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset.h5mu", + "modality": "rna", + "reference": "resources_test/annotation_test_data/tmp_TS_Blood_filtered.h5ad", + "input_obs_batch": None, + "input_layer": None, + "input_obs_label": None, + "input_var_subset": None, + "unknown_celltype_label": "unknown", + "reference_layer": None, + "reference_obs_label": "cell_ontology_class", + "reference_obs_batch": "donor_assay", + "output": "output.h5mu", + "output_compression": "gzip", + "methods": [ + # "celltypist", + # "knn_on_bbknn", + # "knn_on_scanorama", + # "knn_on_scvi", + "rf", + # "scanvi", + "svm", + ], +} +meta = {} +# for debugging the obo folder can be somewhere local +cl_obo_folder = "popv_cl_ontology/" +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +use_gpu = cuda_is_available() +logger.info("GPU enabled? %s", use_gpu) + + +# Helper functions +def get_X( + adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str] +): + """Fetch the counts data from X or a layer. Subset columns by var_index if so desired.""" + if var_index: + adata = adata[:, var_index] + if layer: + return adata.layers[layer] + else: + return adata.X + + +def get_obs(adata: ad.AnnData, obs_par_names): + """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch.""" + obs_columns = [par[x] for x in obs_par_names if par[x]] + return adata.obs[obs_columns] + + +def get_var(adata: ad.AnnData, var_index: list[str]): + """Fetch the var dataframe. Subset rows by var_index if so desired.""" + return adata.var.loc[var_index] + + +def main(par, meta): + assert len(par["methods"]) >= 1, ( + "Please, specify at least one method for cell typing." + ) + logger.info("Cell typing methods: {}".format(par["methods"])) + + ### PREPROCESSING REFERENCE ### + logger.info("### PREPROCESSING REFERENCE ###") + + # take a look at reference data + logger.info("Reading reference data '%s'", par["reference"]) + reference = ad.read_h5ad(par["reference"]) + + logger.info("Setting reference var index to Ensembl IDs") + reference.var["gene_symbol"] = list(reference.var.index) + reference.var.index = [ + re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"] + ] + + logger.info("Detect number of samples per label") + min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size()) + n_samples_per_label = np.max((min_celltype_size, 100)) + + ### PREPROCESSING INPUT ### + logger.info("### PREPROCESSING INPUT ###") + logger.info("Reading '%s'", par["input"]) + input = mu.read_h5mu(par["input"]) + input_modality = input.mod[par["modality"]] + + # subset with var column + if par["input_var_subset"]: + logger.info("Subset input with .var['%s']", par["input_var_subset"]) + assert par["input_var_subset"] in input_modality.var, ( + f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var" + ) + input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]] + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + logger.info("Detecting common vars based on ensembl ids") + common_ens_ids = list( + set(reference.var.index).intersection(set(input_modality.var.index)) + ) + + logger.info(" reference n_vars: %i", reference.n_vars) + logger.info(" input n_vars: %i", input_modality.n_vars) + logger.info(" intersect n_vars: %i", len(common_ens_ids)) + assert len(common_ens_ids) >= 100, "The intersection of genes is too small." + + # subset input objects to make sure popv is using the data we expect + input_modality = ad.AnnData( + X=get_X(input_modality, par["input_layer"], common_ens_ids), + obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]), + var=get_var(input_modality, common_ens_ids), + ) + reference = ad.AnnData( + X=get_X(reference, par["reference_layer"], common_ens_ids), + obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]), + var=get_var(reference, common_ens_ids), + ) + + # remove layers that + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir: + logger.info("Run PopV processing") + pq = popv.preprocessing.Process_Query( + # input + query_adata=input_modality, + query_labels_key=par["input_obs_label"], + query_batch_key=par["input_obs_batch"], + query_layers_key=None, # this is taken care of by subset + # reference + ref_adata=reference, + ref_labels_key=par["reference_obs_label"], + ref_batch_key=par["reference_obs_batch"], + # options + unknown_celltype_label=par["unknown_celltype_label"], + n_samples_per_label=n_samples_per_label, + # pretrained model + # Might need to be parameterized at some point + prediction_mode="retrain", + pretrained_scvi_path=None, + # outputs + # Might need to be parameterized at some point + save_path_trained_models=temp_dir, + # hardcoded values + cl_obo_folder=cl_obo_folder, + accelerator="cuda" if use_gpu else "cpu", + ) + method_kwargs = {} + if "scanorama" in par["methods"]: + method_kwargs["scanorama"] = {"approx": False} + logger.info("Annotate data") + popv.annotation.annotate_data( + adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs + ) + + popv_input = pq.adata[input_modality.obs_names] + + # select columns starting with "popv_" + popv_obs_cols = popv_input.obs.columns[ + popv_input.obs.columns.str.startswith("popv_") + ] + + # create new data frame with selected columns + df_popv = popv_input.obs[popv_obs_cols] + + # remove prefix from column names + df_popv.columns = df_popv.columns.str.replace("popv_", "") + + # store output in mudata .obsm + input.mod[par["modality"]].obsm["popv_output"] = df_popv + + # copy important output in mudata .obs + for col in ["popv_prediction"]: + if col in popv_input.obs.columns: + input.mod[par["modality"]].obs[col] = popv_input.obs[col] + + # code to explore how the output differs from the original + # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]: + # old_keys = set(getattr(pq_adata_orig, attr).keys()) + # new_keys = set(getattr(pq.adata, attr).keys()) + # diff_keys = list(new_keys.difference(old_keys)) + # diff_keys.sort() + # print(f"{attr}:", flush=True) + # for key in diff_keys: + # print(f" {key}", flush=True) + + # write output + logger.info("Writing %s", par["output"]) + input.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/annotate/popv/test.py b/src/annotate/popv/test.py new file mode 100644 index 00000000..caca6577 --- /dev/null +++ b/src/annotate/popv/test.py @@ -0,0 +1,95 @@ +import sys +import os +import pytest +import mudata as mu + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" + + +def test_simple_execution(run_component): + output_file = "output.h5mu" + + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm", + ] + ) + + # check whether file exists + assert os.path.exists(output_file), "Output file does not exist" + + # read output mudata + output = mu.read_h5mu(output_file) + + # check output + expected_rna_obs_cols = ["popv_prediction"] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns .mod['rna'].obs['{col}']" + ) + + print(f"output: {output}", flush=True) + + +def test_popv_with_other_layer(run_component, tmp_path): + input_h5mu = mu.read(input_file) + input_h5mu.mod["rna"].layers["test"] = input_h5mu.mod["rna"].X.copy() + input_h5mu.write_h5mu(tmp_path / "input.h5mu") + run_component( + [ + "--input", + tmp_path / "input.h5mu", + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm;knn_on_scanorama;knn_on_scvi", + ] + ) + + +def test_popv_with_non_overlapping_cells(run_component, tmp_path): + input_h5mu = mu.read(input_file) + + # copy previous modalities + rna_ad = input_h5mu.mod["rna"].copy() + prot_ad = input_h5mu.mod["prot"].copy() + + # change obs_names such that the cells do not overlap + rna_ad.obs_names = [f"rna_{x}" for x in rna_ad.obs_names] + prot_ad.obs_names = [f"prot_{x}" for x in prot_ad.obs_names] + + # write new h5mu to file + new_h5mu = mu.MuData({"rna": rna_ad, "prot": prot_ad}) + new_h5mu.write_h5mu(tmp_path / "input.h5mu") + + # run component + run_component( + [ + "--input", + tmp_path / "input.h5mu", + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm;knn_on_scanorama", + ] + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/random_forest_annotation/config.vsh.yaml b/src/annotate/random_forest_annotation/config.vsh.yaml new file mode 100644 index 00000000..a8def936 --- /dev/null +++ b/src/annotate/random_forest_annotation/config.vsh.yaml @@ -0,0 +1,166 @@ +name: random_forest_annotation +namespace: annotate +scope: "public" +description: Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: The layer in the input data to be used for cell type annotation if .X is not to be used. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: false + - name: "--reference_layer" + type: string + description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_obs_target" + type: string + description: Key in obs field of reference modality with cell-type information. + required: true + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing highly variable genes. By default, do not subset genes. + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obs_predictions" + type: string + default: random_forest_pred + required: false + description: | + In which `.obs` slots to store the predicted information. + - name: "--output_obs_probability" + type: string + default: random_forest_probability + required: false + description: | + In which `.obs` slots to store the probability of the predictions. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Model arguments + description: Model arguments. + arguments: + - name: "--model" + type: file + description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided." + required: false + example: pretrained_model.pkl + - name: "--n_estimators" + type: integer + required: false + default: 100 + description: Number of trees in the random forest. + - name: "--max_depth" + type: integer + required: false + description: | + Maximum depth of the trees in the random forest. + If not provided, the nodes are expanded until all leaves only contain a single sample. + - name: "--criterion" + type: string + required: false + choices: ["gini", "entropy", "log_loss"] + default: "gini" + description: The function to measure the quality of a split. + - name: "--class_weight" + type: string + required: false + default: "balanced_subsample" + choices: ["balanced", "balanced_subsample", "uniform"] + description: | + Weights associated with classes. + The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. + The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown. + The `uniform` mode gives all classes a weight of one. + - name: "--max_features" + type: string + default: "200" + description: | + The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`. + If integer: consider max_features features at each split. + If `sqrt`: max_features is the squareroot of all input features. + If `log2`: max_features is the log2 of all input features. + If `all`: max features equals all input features. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/cross_check_genes.py + - path: /src/utils/subset_vars.py + - path: /src/utils/set_var_index.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - libhdf5-dev + - procps + - type: python + packages: + - scikit-learn==1.4.2 + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, highmem, highdisk] diff --git a/src/annotate/random_forest_annotation/script.py b/src/annotate/random_forest_annotation/script.py new file mode 100644 index 00000000..df401315 --- /dev/null +++ b/src/annotate/random_forest_annotation/script.py @@ -0,0 +1,158 @@ +import sys +import mudata as mu +import numpy as np +from sklearn.ensemble import RandomForestClassifier +import pickle + + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "output.h5mu", + "input_var_gene_names": None, + "input_reference_gene_overlap": 100, + "modality": "rna", + "reference": None, + # "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + # "model": None, + "model": "resources_test/annotation_test_data/rf_model.pkl", + "reference_obs_target": "cell_ontology_class", + "reference_var_gene_names": "ensemblid", + "reference_var_input": None, + "input_layer": None, + "reference_layer": None, + "n_estimators": 100, + "criterion": "gini", + "max_depth": None, + "class_weight": None, + "max_features": 200, + "output_compression": "gzip", + "output_obs_predictions": "random_forest_pred", + "output_obs_probability": "random_forest_probability", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + # Handle max_features parameter + max_features_conversion = { + "all": None, + "sqrt": "sqrt", + "log2": "log2", + } + try: + max_features = max_features_conversion.get( + par["max_features"], int(par["max_features"]) + ) + except ValueError: + raise ValueError( + f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'" + ) + + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = RandomForestClassifier( + n_estimators=par["n_estimators"], + criterion=par["criterion"], + max_depth=par["max_depth"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + max_features=max_features, + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + + logger.info("Writing output data") + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() diff --git a/src/annotate/random_forest_annotation/test.py b/src/annotate/random_forest_annotation/test.py new file mode 100644 index 00000000..a5af9484 --- /dev/null +++ b/src/annotate/random_forest_annotation/test.py @@ -0,0 +1,240 @@ +import sys +import os +import pytest +import subprocess +import re +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from sklearn.ensemble import RandomForestClassifier +import pickle + +## VIASH START +meta = {"resources_dir": "resources_test"} +sys.path.append("src/utils") +## VIASH END + +input_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) +reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" + +sys.path.append(meta["resources_dir"]) +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index + + +@pytest.fixture +def dummy_model(tmp_path): + reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy() + reference_modality = set_var_index(reference_modality, "ensemblid") + + input_modality = mu.read_h5mu(input_file).mod["rna"].copy() + input_modality = set_var_index(input_modality, None) + + common_genes = cross_check_genes( + input_modality.var.index, reference_modality.var.index + ) + reference_modality = reference_modality[:, common_genes] + + labels = reference_modality.obs["cell_ontology_class"].to_numpy() + model = RandomForestClassifier() + model.fit(reference_modality.X, labels) + model._feature_names_in = reference_modality.var.index + + model_path = tmp_path / "model.pkl" + with open(model_path, "wb") as f: + pickle.dump(model, f) + + return model_path + + +def test_simple_execution(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "random_forest_pred", + "random_forest_probability", + ] + + obs_values = output_mudata.mod["rna"].obs["random_forest_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_custom_out_obs_model_params(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_obs_predictions", + "dummy_pred", + "--output_obs_probability", + "dummy_probability", + "--n_estimators", + "10", + "--criterion", + "entropy", + "--max_depth", + "5", + "--class_weight", + "balanced", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "dummy_pred", + "dummy_probability", + ] + + obs_values = output_mudata.mod["rna"].obs["dummy_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_with_model(run_component, random_h5mu_path, dummy_model): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--model", + dummy_model, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class", + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "random_forest_pred", + "random_forest_probability", + ] + + obs_values = output_mudata.mod["rna"].obs["random_forest_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_no_model_no_reference_error(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class--reference_var_gene_names", + "ensemblid", + ] + ) + assert re.search( + r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", + err.value.stdout.decode("utf-8"), + ) + + +def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model): + output_file = random_h5mu_path() + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--model", + dummy_model, + ] + ) + assert re.search( + r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", + err.value.stdout.decode("utf-8"), + ) + + +def test_invalid_max_features(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--max_features", + "invalid_value", + ] + ) + assert re.search( + r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/scanvi/config.vsh.yaml b/src/annotate/scanvi/config.vsh.yaml new file mode 100644 index 00000000..7c1f26a5 --- /dev/null +++ b/src/annotate/scanvi/config.vsh.yaml @@ -0,0 +1,163 @@ +name: scanvi +namespace: "annotate" +scope: "public" +description: | + scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells. + This component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction. + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer ] + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file. Note that this needs to be the exact same dataset as the --scvi_model was trained on. + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used" + - name: "--var_input" + type: string + required: false + description: ".var column containing highly variable genes that were used to train the scVi model. By default, do not subset genes." + - name: "--var_gene_names" + type: string + required: false + description: ".var column containing gene names. By default, use the index." + - name: "--obs_labels" + type: string + required: true + description: ".obs field containing the labels" + - name: "--unlabeled_category" + type: string + default: "Unknown" + description: | + Value in the --obs_labels field that indicates unlabeled observations + + - name: scVI Model + arguments: + - name: "--scvi_model" + type: file + description: "Pretrained SCVI reference model to initialize the SCANVI model with." + example: scvi_model.pt + direction: input + required: true + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--output_model" + type: file + description: Folder where the state of the trained model will be saved to. + required: false + direction: output + - name: "--obsm_output" + type: string + default: "X_scanvi_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--obs_output_predictions" + type: string + default: scanvi_pred + description: "In which .obs slot to store the predicted labels." + - name: "--obs_output_probabilities" + type: string + default: scanvi_proba + description: "In which. obs slot to store the probabilities of the predicted labels." + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: "scANVI training arguments" + arguments: + - name: "--early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, + i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + +resources: + - type: python_script + path: script.py + - path: /src/utils/subset_vars.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/set_var_index.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/scvi_model/ + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: nvcr.io/nvidia/pytorch:25.05-py3 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + - type: python + packages: + - jax[cuda] + - scvi-tools~=1.3.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable + # docker_run_args: ["--gpus all"] +- type: nextflow + directives: + label: [midcpu, midmem, gpu, highdisk] diff --git a/src/annotate/scanvi/script.py b/src/annotate/scanvi/script.py new file mode 100644 index 00000000..6a89962c --- /dev/null +++ b/src/annotate/scanvi/script.py @@ -0,0 +1,110 @@ +import mudata +import scvi +import numpy as np + +### VIASH START +par = { + "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "modality": "rna", + "input_layer": None, + "var_input": None, + "output": "foo.h5mu", + "obsm_output": "X_scvi_integrated", + "early_stopping": True, + "early_stopping_monitor": "elbo_validation", + "early_stopping_patience": 45, + "early_stopping_min_delta": 0, + "reduce_lr_on_plateau": True, + "lr_factor": 0.6, + "lr_patience": 30, + "max_epochs": 500, + "n_obs_min_count": 10, + "n_var_min_count": 10, + "output_model": "test/", + "output_compression": "gzip", +} + +meta = {"resources_dir": "src/utils"} +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input data...") + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}") + scvi_model = scvi.model.SCVI.load( + par["scvi_model"], + adata_subset, + accelerator="auto", + device="auto", + ) + + logger.info("Instantiating scANVI model from scVI model...") + vae_uns = scvi.model.SCANVI.from_scvi_model( + scvi_model, + unlabeled_category=par["unlabeled_category"], + labels_key=par["obs_labels"], + adata=adata_subset, + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + logger.info("Training scANVI model...") + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + logger.info("Performing scANVI integration...") + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + logger.info("Performing scANVI prediction...") + adata.obs[par["obs_output_predictions"]] = vae_uns.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_uns.predict(soft=True), axis=1 + ) + + logger.info("Writing output data...") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() diff --git a/src/annotate/scanvi/test.py b/src/annotate/scanvi/test.py new file mode 100644 index 00000000..13d26147 --- /dev/null +++ b/src/annotate/scanvi/test.py @@ -0,0 +1,127 @@ +import pytest +import re +import subprocess +from pathlib import Path + +import mudata +from anndata.tests.helpers import assert_equal + +## VIASH START +meta = { + "executable": "./target/executable/integrate/scanvi/scanvi", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +input_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" +input_file_2 = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" +scvi_model = f"{meta['resources_dir']}/scvi_model" + + +def test_scanvi(run_component): + args = [ + "--input", + input_file, + "--modality", + "rna", + "--obs_labels", + "cell_ontology_class", + "--scvi_model", + scvi_model, + "--output", + "output.h5mu", + "--output_model", + "scanvi_model", + "--max_epochs", + "5", + "--output_compression", + "gzip", + ] + + run_component(args) + + input_rna = mudata.read_h5ad(input_file.strip(), mod="rna") + + # check files + assert Path("output.h5mu").is_file(), "Output file does not exist" + assert Path("scanvi_model").is_dir() + assert Path("scanvi_model/model.pt").is_file() + + # check output h5mu + output_data = mudata.read_h5mu("output.h5mu") + output_rna = output_data.mod["rna"] + assert output_rna.n_obs == input_rna.n_obs, ( + f"Number of observations changed\noutput_data: {output_data}" + ) + assert output_rna.n_vars == input_rna.n_vars, ( + f"Number of variables changed\noutput_data: {output_data}" + ) + + assert "X_scanvi_integrated" in output_rna.obsm, ( + f".obsm['X_scanvi_integrated'] not added\noutput_data: {output_data}" + ) + + assert "scanvi_pred" in output_rna.obs, ( + f".obs['scanvi_pred'] not added\noutput_data: {output_data}" + ) + + assert "scanvi_proba" in output_rna.obs, ( + f".obs['scanvi_proba'] not added\noutput_data: {output_data}" + ) + + predictions = output_data.mod["rna"].obs["scanvi_pred"] + probabilities = output_data.mod["rna"].obs["scanvi_proba"] + + assert predictions.dtype == "category", ( + "Calculated predictions should be category dtype" + ) + assert not all(predictions.isna()), "Not all predictions should be NA" + assert probabilities.dtype == "float32", ( + "Calculated probabilities should be float32 dtype" + ) + assert all(0 <= value <= 1 for value in probabilities), ( + ".obs at celltypist_probability has values outside the range [0, 1]" + ) + + # assert that nothing else has changed + del output_rna.obsm["X_scanvi_integrated"] + del output_rna.obs["scanvi_pred"] + del output_rna.obs["scanvi_proba"] + + assert_equal(input_rna, output_rna) + + +def test_raises_with_noncompatible_input_file(run_component): + args = [ + "--input", + input_file_2, + "--modality", + "rna", + "--obs_labels", + "cell_ontology_class", + "--scvi_model", + scvi_model, + "--output", + "output.h5mu", + "--output_model", + "scanvi_model", + "--max_epochs", + "5", + "--output_compression", + "gzip", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: Number of vars in `adata_target` not the same as source.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/singler/config.vsh.yaml b/src/annotate/singler/config.vsh.yaml new file mode 100644 index 00000000..c3a4ea8e --- /dev/null +++ b/src/annotate/singler/config.vsh.yaml @@ -0,0 +1,189 @@ +name: singler +namespace: annotate +scope: "public" +description: | + SingleR performs reference-based cell type annotation for single-cell RNA-seq data + by computing Spearman correlations between test cells and reference samples with known labels, + using marker genes to assign the most similar cell type label to each new cell. + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the adata .var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_obs_clusters" + type: string + required: false + description: | + The name of the adata .obs column containing cluster identities of the observations. + If provided, annoation is performed on the aggregated cluster profiles, + otherwise it defaults to annotation per observation. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: true + - name: "--reference_layer" + type: string + description: The layer in the reference data containing lognormalized couns to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_obs_target" + type: string + required: true + description: The name of the adata obs column in the reference data containing cell type annotations. + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing a boolean mask corresponding to genes to be used for marker selection. By default, do not subset genes. + + - name: Arguments + description: Arguments related to the training of and classification with the SingleR model + arguments: + - name: "--de_n_genes" + type: integer + required: False + min: 1 + description: | + The number of differentially expressed genes across labels to be calculated from the reference. + Defaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels when if `--de_method` is set to `classic`, + otherwise, defaults to 10. + - name: "--de_method" + type: string + default: "classic" + choices: ["classic", "t", "wilcox"] + description: Method to detect differentially expressed genes between pairs of labels. + - name: "--quantile" + type: double + min: 0 + max: 1 + default: 0.8 + description: The quantile of the correlation distribution to use to compute the score per label. + - name: "--fine_tune" + type: boolean + default: true + description: | + Whether finetuning should be performed to improve the resolution. + If set to True, an additional finetuning step is performed after initial classification, + new marker genes are calculated based on all cells with a score higher then the maximum score minus `--fine_tuning_thershold`, + and the calculation of the scores is repeated. + - name: "--fine_tuning_threshold" + type: double + default: 0.05 + description: | + The maximum difference from the maximum correlation to use in fine-tuning + - name: "--prune" + type: boolean + default: true + description: + Whether label pruning should be performed. + If set to True, an additional output .obs field `--output_obs_pruned_predictions` will be added to the `--output`, + containing labels where 'low-quality' labels are replaced with NA's. + Labels are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`) fall more than 3 median absolute deviations below the median for that label type. + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obs_predictions" + type: string + default: singler_pred + description: | + In which `.obs` slot to store the predicted labels. If `--fine_tune False`, this is based only on the maximum entry in `--output_obsm_scores`. + - name: "--output_obs_probability" + type: string + default: singler_probability + description: | + In which `.obs` slots to store the probability of the predicted labels. + - name: "--output_obs_delta_next" + type: string + default: singler_delta_next + description: | + In which `.obs` slot to store the delta between the best and next-best score. If `--fine_tune True`, this is reported for scores after fine-tuning. + - name: "--output_obs_pruned_predictions" + type: string + default: singler_pruned_labels + description: | + In which `.obs` slot to store the pruned labels, where low-quality labels are replaced with NA's. Only added if `--prune True`. + - name: "--output_obsm_scores" + type: string + default: singler_scores + description: | + In which `.obsm` slot to store the matrix of prediction correlations at the specified quantile for each label (column) in each cell (row). + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu + +engines: + - type: docker + image: rocker/r2u:22.04 + setup: + - type: docker + env: + - RETICULATE_PYTHON=/usr/bin/python + - type: apt + packages: [ libhdf5-dev, python3, python3-pip, python3-dev, python-is-python3 ] + - type: r + cran: [ anndata, reticulate, SingleR ] + bioc: [ scrapper, bit64 ] + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml ] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu] diff --git a/src/annotate/singler/script.R b/src/annotate/singler/script.R new file mode 100644 index 00000000..a46b59d2 --- /dev/null +++ b/src/annotate/singler/script.R @@ -0,0 +1,192 @@ +library(SingleR) +library(Matrix) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +### VIASH START +par <- list( + input = "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + modality = "rna", + input_layer = NULL, + input_var_gene_names = "gene_symbol", + input_reference_gene_overlap = 100, + input_obs_clusters = NULL, + reference = "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + reference_layer = NULL, + reference_var_input = NULL, + reference_var_gene_names = NULL, + reference_obs_target = "cell_ontology_class", + output = "singler_output.h5mu", + output_compression = "gzip", + output_obs_predictions = "singler_labels", + output_obs_probability = "singlr_proba", + output_obsm_scores = "single_r_scores", + output_obs_delta_next = "singler_delta_next", + output_obs_pruned_predictions = "singler_pruned_labels", + quantile = 0.8, + fine_tune = TRUE, + fine_tuning_threshold = 0.05, + prune = TRUE, + de_n_genes = NULL, + de_method = "classic" +) +meta <- list( + cpus = 4 +) + +### VIASH END + +get_layer <- function(adata, layer, var_gene_names) { + # find data + data <- + if (is.null(layer)) { + adata$X + } else { + adata$layers[[layer]] + } + + # check if data is available + if (is.null(data)) { + if (is.null(layer)) { + stop("No layer specified and no .X slot available in the AnnData object.") + } else { + stop( + "Requested layer '", + layer, + "' is not available in the AnnData object. Available layers: ", + paste(names(adata$layers), collapse = ", ") + ) + } + } + + # Set matrix dimnames + input_gene_names <- sanitize_gene_names(adata, var_gene_names) + dimnames(data) <- list(adata$obs_names, input_gene_names) + + # return output + data +} + +sanitize_gene_names <- function(adata, gene_symbol = NULL) { + if (is.null(gene_symbol)) { + gene_names <- adata$var_names + } else { + gene_names <- adata$var[[gene_symbol]] + } + # Remove version numbers (dot followed by digits at end of string) + sanitized <- gsub("\\.[0-9]+$", "", gene_names) + sanitized +} + +subset_vars <- function(adata, subset_col) { + # Check if column exists + if (!subset_col %in% colnames(adata$var)) { + stop( + "Requested to use .var column '", + subset_col, + "' as a selection of genes, but the column is not available." + ) + } + + # Get the column + subset_values <- adata$var[[subset_col]] + + # Check for NA values + if (any(is.na(subset_values))) { + stop( + "The .var column `", + subset_col, + "` contains NA values. Can not subset data." + ) + } + + # Check if it's logical/boolean + if (!is.logical(subset_values)) { + stop( + "Expected dtype of .var column '", + subset_col, + "' to be `logical`, but found ", + class(subset_values)[1], + ". Can not subset data." + ) + } + + # Subset and return copy + adata_subset <- adata[, subset_values]$copy() + adata_subset +} + +# Read input data +cat("Reading input file\n") +input_mdata <- mudata$read_h5mu(par$input) +input_adata <- input_mdata$mod[[par$modality]] +input_matrix <- Matrix::t( + get_layer(input_adata, par$input_layer, par$input_var_gene_names) +) + +# Read reference +cat("Reading reference file\n") +ref_adata <- mudata$read_h5ad(par$reference, mod = "rna") +if (!is.null(par$reference_var_input)) { + ref_adata <- subset_vars(ref_adata, par$reference_var_input) +} +ref_matrix <- Matrix::t( + get_layer(ref_adata, par$reference_layer, par$reference_var_gene_names) +) + +# Check overlap genes +cat("Checking overlap of genes between input and reference file\n") +common_ens_ids <- intersect(rownames(ref_matrix), rownames(input_matrix)) +if (length(common_ens_ids) < par$input_reference_gene_overlap) { + stop( + "The intersection of genes between the query and reference is too small.\n", + paste0("Expected overlap: ", par$input_reference_gene_overlap), + paste0("Detected overlap: ", length(common_ens_ids)) + ) +} + +# Calculate CPU cores +n_workers <- meta$cpus %||% max(1, parallel::detectCores() - 1) + +# Assign target labels +if (!par$reference_obs_target %in% colnames(ref_adata$obs)) { + stop( + "Requested to use .obs column '", + par$reference_obs_target, + "' as target labels, but the column is not available." + ) +} else { + target_labels <- ref_adata$obs[[par$reference_obs_target]] +} + +cat("Performing SingleR cell type prediction\n") +predictions <- SingleR( + test = input_matrix, + ref = ref_matrix, + labels = target_labels, + clusters = par$input_obs_clusters, + genes = "de", + de.method = par$de_method, + de.n = par$de_n_genes, + quantile = par$quantile, + fine.tune = par$fine_tune, + tune.thresh = par$fine_tuning_threshold, + prune = par$prune, + num.threads = n_workers +) + +cat("Writing output data\n") +# Writing output slots +input_adata$obs[[par$output_obs_predictions]] <- + predictions$labels +input_adata$obs[[par$output_obs_probability]] <- + apply(predictions$scores, 1, max) +input_adata$obs[[par$output_obs_delta_next]] <- + predictions$delta.next +input_adata$obs[[par$output_obs_pruned_predictions]] <- + predictions$pruned.labels +input_adata$obsm[[par$output_obsm_scores]] <- + predictions$scores +# Writing output H5MU +input_mdata$write(par$output, compression = par$output_compression) diff --git a/src/annotate/singler/test.py b/src/annotate/singler/test.py new file mode 100644 index 00000000..bbc5a65d --- /dev/null +++ b/src/annotate/singler/test.py @@ -0,0 +1,123 @@ +import sys +import os +import pytest +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +input_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) +reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" + + +def test_simple_execution(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "singler_pred", + "singler_probability", + "singler_delta_next", + "singler_pruned_labels", + ] + assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"] + + predictions = output_mudata.mod["rna"].obs["singler_pred"] + assert predictions.dtype == "category", ( + "Calculated predictions should be category dtype" + ) + assert predictions.notna().all(), "None of the predictions should be NA" + + pruned_predictions = output_mudata.mod["rna"].obs["singler_pruned_labels"] + assert pruned_predictions.dtype == "category", ( + "Pruned predictions should be category dtype" + ) + assert pruned_predictions.isna().any(), ( + "Some of the pruned predictions should be NA" + ) + assert not all(pruned_predictions.isna()), "Not all pruned predictions should be NA" + + delta_next = output_mudata.mod["rna"].obs["singler_delta_next"] + assert delta_next.dtype == "float", "Calculated delta next should be float dtype" + + scores = output_mudata.mod["rna"].obsm["singler_scores"] + assert scores.dtype == "float", "Calculated scores should be float dtype" + assert all(-1 <= value <= 1 for value in scores.flatten()), ( + ".obsm at singler_scores has values outside the range [-1, 1]" + ) + + +def test_params(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--input_var_gene_names", + "gene_symbol", + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--input_reference_gene_overlap", + "1000", + "--reference_var_input", + "highly_variable", + "de_n_genes", + "600", + "--de_method", + "wilcox", + "--quantile", + "0.75", + "--fine_tuning_threshold", + "0.1", + "--prune", + "False", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "singler_pred", + "singler_probability", + "singler_delta_next", + ] + assert list(output_mudata.mod["rna"].obsm.keys()) == ["singler_scores"] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/svm_annotation/config.vsh.yaml b/src/annotate/svm_annotation/config.vsh.yaml new file mode 100644 index 00000000..6d365fb4 --- /dev/null +++ b/src/annotate/svm_annotation/config.vsh.yaml @@ -0,0 +1,155 @@ +name: svm_annotation +namespace: annotate +scope: "public" +description: Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + description: The layer in the input data to be used for cell type annotation if .X is not to be used. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: false + - name: "--reference_layer" + type: string + description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_obs_target" + type: string + description: + required: true + description: | + Key in .obs attribute of reference modality with cell-type information. + + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_var_input" + type: string + required: false + description: | + .var column containing highly variable genes. By default, do not subset genes. + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obs_prediction" + type: string + default: svm_pred + required: false + description: | + In which `.obs` slots to store the predicted information. + - name: "--output_obs_probability" + type: string + default: svm_probability + required: false + description: | + In which `.obs` slots to store the probability of the predictions. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Model arguments + description: Model arguments. + arguments: + - name: "--model" + type: file + description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided." + required: false + example: pretrained_model.pkl + - name: "--feature_selection" + type: boolean + description: "Whether to perform feature selection." + default: true + - name: "--max_iter" + type: integer + description: "Maximum number of iterations for the SVM." + min: 1 + default: 5000 + - name: "--c_reg" + type: double + description: "Regularization parameter for the SVM." + min: 0.0 + default: 1.0 + - name: "--class_weight" + type: string + description: | + "Class weights for the SVM. The `uniform` mode gives all classes a weight of one. + The `balanced` mode (default) uses the values of y to automatically adjust weights inversely + proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))" + choices: ["balanced", "uniform"] + default: "balanced" + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/cross_check_genes.py + - path: /src/utils/subset_vars.py + - path: /src/utils/set_var_index.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - libhdf5-dev + - procps + - type: python + packages: + - scikit-learn==1.5.2 + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, highmem, highdisk] diff --git a/src/annotate/svm_annotation/script.py b/src/annotate/svm_annotation/script.py new file mode 100644 index 00000000..ab4f8f69 --- /dev/null +++ b/src/annotate/svm_annotation/script.py @@ -0,0 +1,140 @@ +import sys +import mudata as mu +import numpy as np +from sklearn.calibration import CalibratedClassifierCV +from sklearn import svm +import pickle + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "output.h5mu", + "modality": "rna", + "reference": None, + # "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + # "model": None, + "model": "resources_test/annotation_test_data/svm_model.pkl", + "reference_obs_target": "cell_ontology_class", + "reference_var_input": None, + "input_layer": None, + "reference_layer": None, + "max_iter": 5000, + "c_reg": 1, + "class_weight": "balanced", + "output_compression": "gzip", + "input_var_gene_names": None, + "reference_var_gene_names": "ensemblid", + "output_obs_prediction": "svm_pred", + "output_obs_probability": "svm_probability", + "input_reference_gene_overlap": 100, +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = CalibratedClassifierCV( + svm.LinearSVC( + C=par["c_reg"], + max_iter=par["max_iter"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + dual="auto", + ) + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_prediction"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() diff --git a/src/annotate/svm_annotation/test.py b/src/annotate/svm_annotation/test.py new file mode 100644 index 00000000..9b48ae68 --- /dev/null +++ b/src/annotate/svm_annotation/test.py @@ -0,0 +1,184 @@ +import sys +import os +import pytest +import subprocess +import re +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from sklearn import svm +from sklearn.calibration import CalibratedClassifierCV +import pickle + +## VIASH START +meta = {"resources_dir": "resources_test"} +sys.path.append("src/utils") +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu" + +sys.path.append(meta["resources_dir"]) +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index + + +@pytest.fixture +def dummy_model(tmp_path): + reference_modality = mu.read_h5mu(reference_file).mod["rna"].copy() + reference_modality = set_var_index(reference_modality, "ensemblid") + + input_modality = mu.read_h5mu(input_file).mod["rna"].copy() + input_modality = set_var_index(input_modality, None) + + common_genes = cross_check_genes( + input_modality.var.index, reference_modality.var.index + ) + reference_modality = reference_modality[:, common_genes] + + labels = reference_modality.obs["cell_ontology_class"].to_numpy() + model = CalibratedClassifierCV( + svm.LinearSVC( + max_iter=10, + dual="auto", + ) + ) + model.fit(reference_modality.X, labels) + model._feature_names_in = reference_modality.var.index + + model_path = tmp_path / "model.pkl" + with open(model_path, "wb") as f: + pickle.dump(model, f) + + return model_path + + +def test_simple_execution(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"] + + obs_values = output_mudata.mod["rna"].obs["svm_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_custom_out_obs_model_params(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_var_gene_names", + "ensemblid", + "--reference_obs_target", + "cell_ontology_class", + "--output_obs_prediction", + "dummy_pred", + "--output_obs_probability", + "dummy_probability", + "--max_iter", + "1000", + "--c_reg", + "0.1", + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "dummy_pred", + "dummy_probability", + ] + + obs_values = output_mudata.mod["rna"].obs["dummy_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_with_model(run_component, random_h5mu_path, dummy_model): + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--reference_obs_target", + "cell_ontology_class", + "--model", + dummy_model, + "--output", + output_file, + ] + ) + + assert os.path.exists(output_file), "Output file does not exist" + + input_mudata = mu.read_h5mu(input_file) + output_mudata = mu.read_h5mu(output_file) + + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) + + assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"] + + obs_values = output_mudata.mod["rna"].obs["svm_probability"] + assert all(0 <= value <= 1 for value in obs_values), ( + "probabilities outside the range [0, 1]" + ) + + +def test_no_model_no_reference_error(run_component, random_h5mu_path): + output_file = random_h5mu_path() + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--reference_obs_target", + "cell_ontology_class", + "--output", + output_file, + ] + ) + assert re.search( + r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/authors/angela_pisco.yaml b/src/authors/angela_pisco.yaml new file mode 100644 index 00000000..2b7e9104 --- /dev/null +++ b/src/authors/angela_pisco.yaml @@ -0,0 +1,14 @@ +name: Angela Oliveira Pisco +info: + role: Contributor + links: + github: aopisco + orcid: "0000-0003-0142-2355" + linkedin: aopisco + organizations: + - name: Insitro + href: https://insitro.com + role: Director of Computational Biology + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/authors/dorien_roosen.yaml b/src/authors/dorien_roosen.yaml new file mode 100644 index 00000000..96bb9d37 --- /dev/null +++ b/src/authors/dorien_roosen.yaml @@ -0,0 +1,11 @@ +name: Dorien Roosen +info: + role: Core Team Member + links: + email: dorien@data-intuitive.com + github: dorien-er + linkedin: dorien-roosen + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist \ No newline at end of file diff --git a/src/authors/dries_de_maeyer.yaml b/src/authors/dries_de_maeyer.yaml new file mode 100644 index 00000000..dc67ecca --- /dev/null +++ b/src/authors/dries_de_maeyer.yaml @@ -0,0 +1,11 @@ +name: Dries De Maeyer +info: + role: Core Team Member + links: + email: ddemaeyer@gmail.com + github: ddemaeyer + linkedin: dries-de-maeyer-b46a814 + organizations: + - name: Janssen Pharmaceuticals + href: https://www.janssen.com + role: Principal Scientist \ No newline at end of file diff --git a/src/authors/dries_schaumont.yaml b/src/authors/dries_schaumont.yaml new file mode 100644 index 00000000..fc28b844 --- /dev/null +++ b/src/authors/dries_schaumont.yaml @@ -0,0 +1,12 @@ +name: Dries Schaumont +info: + role: Core Team Member + links: + email: dries@data-intuitive.com + github: DriesSchaumont + orcid: "0000-0002-4389-0440" + linkedin: dries-schaumont + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist \ No newline at end of file diff --git a/src/authors/elizabeth_mlynarski.yaml b/src/authors/elizabeth_mlynarski.yaml new file mode 100644 index 00000000..9e59872d --- /dev/null +++ b/src/authors/elizabeth_mlynarski.yaml @@ -0,0 +1,6 @@ +name: Elizabeth Mlynarski +info: + role: Contributor + organizations: + - name: Janssen R&D US + role: Principal Scientist Computational Genomics \ No newline at end of file diff --git a/src/authors/isabelle_bergiers.yaml b/src/authors/isabelle_bergiers.yaml new file mode 100644 index 00000000..64e95859 --- /dev/null +++ b/src/authors/isabelle_bergiers.yaml @@ -0,0 +1,10 @@ +name: Isabelle Bergiers +info: + role: Contributor + links: + github: Isabelle-b + orcid: 0000-0001-9622-7960 + organizations: + - name: Janssen Pharmaceuticals + href: https://www.janssen.com + role: Scientist OMICS Technology \ No newline at end of file diff --git a/src/authors/jakub_majercik.yaml b/src/authors/jakub_majercik.yaml new file mode 100644 index 00000000..b1086dee --- /dev/null +++ b/src/authors/jakub_majercik.yaml @@ -0,0 +1,11 @@ +name: Jakub Majercik +info: + role: Contributor + links: + email: jakub@data-intuitive.com + github: jakubmajercik + linkedin: jakubmajercik + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatics Engineer \ No newline at end of file diff --git a/src/authors/kai_waldrant.yaml b/src/authors/kai_waldrant.yaml new file mode 100644 index 00000000..2ba24891 --- /dev/null +++ b/src/authors/kai_waldrant.yaml @@ -0,0 +1,15 @@ +name: Kai Waldrant +info: + role: Contributor + links: + email: kai@data-intuitive.com + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + linkedin: kaiwaldrant + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician + - name: Open Problems + href: https://openproblems.bio + role: Contributor diff --git a/src/authors/malte_luecken.yaml b/src/authors/malte_luecken.yaml new file mode 100644 index 00000000..ab6357cf --- /dev/null +++ b/src/authors/malte_luecken.yaml @@ -0,0 +1,16 @@ +name: Malte D. Luecken +info: + role: Core Team Member + links: + email: malte.luecken@helmholtz-muenchen.de + github: LuckyMD + orcid: "0000-0001-7464-7921" + linkedin: malte-l%C3%BCcken-b8b21049 + twitter: MDLuecken + organizations: + - name: Helmholtz Munich + href: https://www.helmholtz-munich.de + role: Group Leader + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/authors/marijke_van_moerbeke.yaml b/src/authors/marijke_van_moerbeke.yaml new file mode 100644 index 00000000..bfdd96ea --- /dev/null +++ b/src/authors/marijke_van_moerbeke.yaml @@ -0,0 +1,11 @@ +name: Marijke Van Moerbeke +info: + role: Contributor + links: + github: mvanmoerbeke + orcid: 0000-0002-3097-5621 + linkedin: marijke-van-moerbeke-84303a34 + organizations: + - name: OpenAnalytics + href: https://www.openanalytics.eu + role: Statistical Consultant \ No newline at end of file diff --git a/src/authors/matthias_beyens.yaml b/src/authors/matthias_beyens.yaml new file mode 100644 index 00000000..b2f80ce8 --- /dev/null +++ b/src/authors/matthias_beyens.yaml @@ -0,0 +1,12 @@ +name: Matthias Beyens +info: + role: Contributor + links: + github: MatthiasBeyens + orcid: "0000-0003-3304-0706" + email: matthias.beyens@gmail.com + linkedin: mbeyens + organizations: + - name: Janssen Pharmaceuticals + href: https://www.janssen.com + role: Principal Scientist \ No newline at end of file diff --git a/src/authors/mauro_saporita.yaml b/src/authors/mauro_saporita.yaml new file mode 100644 index 00000000..3cd7b06a --- /dev/null +++ b/src/authors/mauro_saporita.yaml @@ -0,0 +1,11 @@ +name: Mauro Saporita +info: + role: Contributor + links: + email: maurosaporita@gmail.com + github: mauro-saporita + linkedin: mauro-saporita-930b06a5 + organizations: + - name: Ardigen + href: https://ardigen.com + role: Lead Nextflow Developer \ No newline at end of file diff --git a/src/authors/povilas_gibas.yaml b/src/authors/povilas_gibas.yaml new file mode 100644 index 00000000..d438f9ec --- /dev/null +++ b/src/authors/povilas_gibas.yaml @@ -0,0 +1,11 @@ +name: Povilas Gibas +info: + role: Contributor + links: + email: povilasgibas@gmail.com + github: PoGibas + linkedin: povilas-gibas + organizations: + - name: Ardigen + href: https://ardigen.com + role: Bioinformatician \ No newline at end of file diff --git a/src/authors/robrecht_cannoodt.yaml b/src/authors/robrecht_cannoodt.yaml new file mode 100644 index 00000000..eb6305b8 --- /dev/null +++ b/src/authors/robrecht_cannoodt.yaml @@ -0,0 +1,15 @@ +name: Robrecht Cannoodt +info: + role: Core Team Member + links: + email: robrecht@data-intuitive.com + github: rcannood + orcid: "0000-0003-3641-729X" + linkedin: robrechtcannoodt + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Science Engineer + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/authors/samuel_d_souza.yaml b/src/authors/samuel_d_souza.yaml new file mode 100644 index 00000000..2c7ff474 --- /dev/null +++ b/src/authors/samuel_d_souza.yaml @@ -0,0 +1,10 @@ +name: Samuel D'Souza +info: + role: Contributor + links: + github: srdsam + linkedin: samuel-d-souza-887023150/ + organizations: + - name: Chan Zuckerberg Biohub + href: https://www.czbiohub.org + role: Data Engineer \ No newline at end of file diff --git a/src/authors/sarah_ouologuem.yaml b/src/authors/sarah_ouologuem.yaml new file mode 100644 index 00000000..5ed8795a --- /dev/null +++ b/src/authors/sarah_ouologuem.yaml @@ -0,0 +1,10 @@ +name: Sarah Ouologuem +info: + role: Contributor + links: + github: SarahOuologuem + orcid: 0009-0005-3398-1700 + organizations: + - name: Helmholtz Munich + href: https://www.helmholtz-munich.de + role: Student Assistant \ No newline at end of file diff --git a/src/authors/toni_verbeiren.yaml b/src/authors/toni_verbeiren.yaml new file mode 100644 index 00000000..aa78550d --- /dev/null +++ b/src/authors/toni_verbeiren.yaml @@ -0,0 +1,10 @@ +name: Toni Verbeiren +info: + role: Core Team Member + links: + github: tverbeiren + linkedin: verbeiren + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist and CEO \ No newline at end of file diff --git a/src/authors/vladimir_shitov.yaml b/src/authors/vladimir_shitov.yaml new file mode 100644 index 00000000..9dfe8227 --- /dev/null +++ b/src/authors/vladimir_shitov.yaml @@ -0,0 +1,12 @@ +name: Vladimir Shitov +info: + role: Contributor + links: + email: vladimir.shitov@helmholtz-muenchen.de + github: vladimirshitov + orcid: "0000-0002-1960-8812" + linkedin: vladimir-shitov-9a659513b + organizations: + - name: Helmholtz Munich + href: https://www.helmholtz-munich.de + role: PhD Candidate \ No newline at end of file diff --git a/src/authors/weiwei_schultz.yaml b/src/authors/weiwei_schultz.yaml new file mode 100644 index 00000000..7d4835e8 --- /dev/null +++ b/src/authors/weiwei_schultz.yaml @@ -0,0 +1,6 @@ +name: Weiwei Schultz +info: + role: Contributor + organizations: + - name: Janssen R&D US + role: Associate Director Data Sciences \ No newline at end of file diff --git a/src/authors/xichen_wu.yaml b/src/authors/xichen_wu.yaml new file mode 100644 index 00000000..935851ab --- /dev/null +++ b/src/authors/xichen_wu.yaml @@ -0,0 +1,11 @@ +name: Xichen Wu +info: + role: Contributor + links: + github: wxicu + linkedin: xichen-wu + orcid: 0009-0008-2168-4508 + organizations: + - name: Helmholtz Munich + href: https://www.helmholtz-munich.de + role: Student Assistant \ No newline at end of file diff --git a/src/base/h5_compression_argument.yaml b/src/base/h5_compression_argument.yaml new file mode 100644 index 00000000..d575860a --- /dev/null +++ b/src/base/h5_compression_argument.yaml @@ -0,0 +1,9 @@ +arguments: + - name: "--output_compression" + description: | + Compression format to use for the output AnnData and/or Mudata objects. + By default no compression is applied. + type: string + choices: ["gzip", "lzf"] + required: false + example: "gzip" \ No newline at end of file diff --git a/src/base/requirements/anndata.yaml b/src/base/requirements/anndata.yaml new file mode 100644 index 00000000..8ca4664c --- /dev/null +++ b/src/base/requirements/anndata.yaml @@ -0,0 +1,2 @@ +packages: + - anndata~=0.11.1 diff --git a/src/base/requirements/anndata_mudata.yaml b/src/base/requirements/anndata_mudata.yaml new file mode 100644 index 00000000..fb89a6e6 --- /dev/null +++ b/src/base/requirements/anndata_mudata.yaml @@ -0,0 +1,9 @@ +__merge__: [/src/base/requirements/anndata.yaml, .] +packages: + - mudata~=0.3.1 +# Make sure that awkward is not installed. Currently, support of awkward arrays +# in anndata is experimental, and it is enabled based on whether or not the package +# is available. By making sure that awkward is not installed, the functionality is +# not enabled. +script: | + exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)") diff --git a/src/base/requirements/openpipeline_testutils.yaml b/src/base/requirements/openpipeline_testutils.yaml new file mode 100644 index 00000000..ae65ae6c --- /dev/null +++ b/src/base/requirements/openpipeline_testutils.yaml @@ -0,0 +1,2 @@ +github: + - openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils diff --git a/src/base/requirements/python_test_setup.yaml b/src/base/requirements/python_test_setup.yaml new file mode 100644 index 00000000..45552f2d --- /dev/null +++ b/src/base/requirements/python_test_setup.yaml @@ -0,0 +1,8 @@ +test_setup: + - type: apt + packages: + - git + - type: python + __merge__: + - /src/base/requirements/viashpy.yaml + - /src/base/requirements/openpipeline_testutils.yaml diff --git a/src/base/requirements/scanpy.yaml b/src/base/requirements/scanpy.yaml new file mode 100644 index 00000000..5c1af07a --- /dev/null +++ b/src/base/requirements/scanpy.yaml @@ -0,0 +1,2 @@ +packages: + - scanpy~=1.10.4 diff --git a/src/base/requirements/testworkflows_setup.yaml b/src/base/requirements/testworkflows_setup.yaml new file mode 100644 index 00000000..5e2520e9 --- /dev/null +++ b/src/base/requirements/testworkflows_setup.yaml @@ -0,0 +1,10 @@ +setup: + - type: apt + packages: + - procps + - git + - type: python + __merge__: + - /src/base/requirements/anndata_mudata.yaml + - /src/base/requirements/openpipeline_testutils.yaml + - /src/base/requirements/viashpy.yaml diff --git a/src/base/requirements/viashpy.yaml b/src/base/requirements/viashpy.yaml new file mode 100644 index 00000000..f346e1e4 --- /dev/null +++ b/src/base/requirements/viashpy.yaml @@ -0,0 +1,2 @@ +packages: + - viashpy==0.8.0 diff --git a/src/cluster/leiden/config.vsh.yaml b/src/cluster/leiden/config.vsh.yaml new file mode 100644 index 00000000..58f4e36c --- /dev/null +++ b/src/cluster/leiden/config.vsh.yaml @@ -0,0 +1,94 @@ +name: leiden +namespace: "cluster" +scope: "public" +description: | + Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. + Leiden is an improved version of the [Louvain algorithm] [Blondel08]. + It has been proposed for single-cell analysis by [Levine15] [Levine15]. + This requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first. + + [Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. + [Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. + [Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. + [Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] +arguments: + # input + - name: "--input" + alternatives: [-i] + type: file + description: Input file. + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--obsp_connectivities" + type: string + description: In which .obsp slot the neighbor connectivities can be found. + default: "connectivities" + + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output file. + direction: output + required: true + example: output.h5mu + - name: "--obsm_name" + type: string + description: | + Name of the .obsm key under which to add the cluster labels. + The name of the columns in the matrix will correspond to the resolutions. + default: "leiden" + + # todo: add uns_params + # example: uns["leiden"] = {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 1.0}} + + # arguments + - name: "--resolution" + type: double + description: | + A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters. + Multiple values will result in clustering being performed multiple times. + default: [1] + required: true + multiple: true +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: + - type: docker + image: python:3.13-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - leidenalg~=0.10.0 + __merge__: [ /src/base/requirements/python_test_setup.yaml ] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, midmem, middisk] diff --git a/src/cluster/leiden/script.py b/src/cluster/leiden/script.py new file mode 100644 index 00000000..a940aa66 --- /dev/null +++ b/src/cluster/leiden/script.py @@ -0,0 +1,345 @@ +import sys +import signal +import os +import time +import logging +import logging.handlers +import warnings +import mudata as mu +import pandas as pd +import scanpy as sc +import numpy as np +import numpy.typing as npt +import anndata as ad +from multiprocessing import managers, shared_memory, get_context +from concurrent.futures import ProcessPoolExecutor, process, as_completed +from scipy.sparse import csr_matrix +from pathlib import Path +import shutil + +## VIASH START +par = { + "input": "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "output.h5mu", + "modality": "rna", + "output_format": "h5mu", + "obsm_name": "leiden", + "resolution": [ + 1, + 0.25, + 0.10, + 0.05, + 0.01, + 0.2, + 0.4, + 0.5, + 0.6, + 0.8, + 0.9, + 0.7, + 0.3, + 0.35, + 0.95, + ], + "obsp_connectivities": "connectivities", + "uns_name": "leiden", + "output_compression": "gzip", +} +meta = {"cpus": 8, "resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + +from compress_h5mu import compress_h5mu + +_shared_logger_name = "leiden" + + +# Function to check available space in /dev/shm +def get_available_shared_memory(): + shm_path = "/dev/shm" + shm_stats = os.statvfs(shm_path) + return shm_stats.f_bsize * shm_stats.f_bavail + + +class SharedNumpyMatrix: + def __init__( + self, + shared_memory: shared_memory.SharedMemory, + dtype: npt.DTypeLike, + shape: tuple[int, int], + ) -> None: + self._memory = shared_memory + self._dtype = dtype + self._shape = shape + + @classmethod + def from_numpy( + cls, memory_manager: managers.SharedMemoryManager, array: npt.ArrayLike + ): + available_shared_memory = get_available_shared_memory() + n_bytes_required = array.nbytes + if available_shared_memory < n_bytes_required: + raise ValueError( + "Not enough shared memory (/dev/shm) is available to load the data. " + f"Required amount: {n_bytes_required}, available: {available_shared_memory}." + ) + shm = memory_manager.SharedMemory(size=array.nbytes) + array_in_shared_memory = np.ndarray( + array.shape, dtype=array.dtype, buffer=shm.buf + ) + # Copy the data into shared memory + array_in_shared_memory[:] = array[:] + return cls(shm, array.dtype, array.shape) + + def to_numpy(self): + return np.ndarray(self._shape, dtype=self._dtype, buffer=self._memory.buf) + + def close(self): + self._memory.close() + + +class SharedCsrMatrix: + def __init__( + self, + data: SharedNumpyMatrix, + indices: SharedNumpyMatrix, + indptr: SharedNumpyMatrix, + shape: npt.DTypeLike, + ): + self._data = data + self._indices = indices + self._indptr = indptr + self._shape = shape + + @classmethod + def from_csr_matrix( + cls, memory_manager: managers.SharedMemoryManager, csr_matrix_obj: csr_matrix + ): + return cls( + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.data), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indices), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indptr), + csr_matrix_obj.shape, + ) + + def to_csr_matrix(self): + return csr_matrix( + (self._data.to_numpy(), self._indices.to_numpy(), self._indptr.to_numpy()), + shape=self._shape, + copy=False, + ) + + def close(self): + self._data.close() + self._indices.close() + self._indptr.close() + + +def create_empty_anndata_with_connectivities(connectivities, obs_names): + empty_anndata = ad.AnnData( + np.zeros((connectivities.shape[0], 1)), obs=pd.DataFrame(index=list(obs_names)) + ) + empty_anndata.obsp["connectivities"] = connectivities + return empty_anndata + + +def run_single_resolution(shared_csr_matrix, obs_names, resolution): + logger = logging.getLogger(_shared_logger_name) + logger.info( + "Process with PID '%s' for resolution '%s' started", os.getpid(), resolution + ) + try: + connectivities = shared_csr_matrix.to_csr_matrix() + adata = create_empty_anndata_with_connectivities(connectivities, obs_names) + with warnings.catch_warnings(): + # In the future, the default backend for leiden will be igraph instead of leidenalg. + warnings.simplefilter(action="ignore", category=FutureWarning) + adata_out = sc.tl.leiden( + adata, + resolution=resolution, + key_added=str(resolution), + obsp="connectivities", + copy=True, + ) + logger.info(f"Returning result for resolution {resolution}") + return adata_out.obs[str(resolution)] + finally: + obs_names.shm.close() + shared_csr_matrix.close() + + +def init_worker(parent_process_id, exit_event, log_queue, log_level): + import os + import threading + import time + + pid = os.getpid() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Initializing process %s", pid) + + def exit_if_orphaned(): + logger.info( + "Starting orphanned process checker for process %s, parent process %s.", + pid, + parent_process_id, + ) + while True: + # Check if parent process is gone + try: + # If sig is 0, then no signal is sent, but error checking is still performed; + # this can be used to check for the existence of a process ID + os.kill(parent_process_id, 0) + except ProcessLookupError: + logger.info("Parent process is gone, shutting down %s", pid) + # Kill self + os.kill(pid, signal.SIGTERM) + time.sleep(0.2) + # Parent process requested exit + try: + exit_event_set = exit_event.wait(timeout=1) + except BrokenPipeError: + logger.info( + "Checking for shutdown resulted in BrokenPipeError, " + "parent process is most likely gone. Shutting down %s", + pid, + ) + os.kill(pid, signal.SIGTERM) + else: + if exit_event_set: + logger.info("Exit event set, shutting down %s", pid) + os.kill(pid, signal.SIGTERM) + time.sleep(1) + + threading.Thread(target=exit_if_orphaned, daemon=True).start() + logger.info( + "Initialization of process %s is complete, process is now waiting for work.", + pid, + ) + + +def main(): + with managers.SyncManager() as syncm: + log_level = logging.INFO + log_format = "%(name)s:%(levelname)s:%(asctime)s: %(message)s" + formatter = logging.Formatter(log_format) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + log_queue = syncm.Queue() + log_listener = logging.handlers.QueueListener(log_queue, console_handler) + log_listener.start() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Reading %s.", par["input"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"], backed="r") + logger.info("Processing modality '%s'.", par["modality"]) + try: + connectivities = adata.obsp[par["obsp_connectivities"]] + except KeyError: + raise ValueError( + f'Could not find .obsp key "{par["obsp_connectivities"]}" ' + "in modality {par['modality']}" + ) + + # An event that, when triggered, will kill the child processes that are still running + exit_early_event = syncm.Event() + with managers.SharedMemoryManager() as smm: + # anndata converts the index to strings, so no worries that it cannot be stored in ShareableList + # because it has an unsupported dtype. It should always be string... + index_contents = adata.obs.index.to_list() + assert all([isinstance(item, str) for item in index_contents]) + obs_names = smm.ShareableList(index_contents) + + shared_csr_matrix = SharedCsrMatrix.from_csr_matrix(smm, connectivities) + results = {} + n_workers = ( + meta["cpus"] - 2 if (meta["cpus"] and (meta["cpus"] - 2) > 0) else 1 + ) + logger.info(f"Requesting {n_workers} workers") + executor = ProcessPoolExecutor( + max_workers=n_workers, + max_tasks_per_child=1, + mp_context=get_context("spawn"), + initializer=init_worker, + initargs=((os.getpid(), exit_early_event, log_queue, log_level)), + ) + pending_futures = { + executor.submit( + run_single_resolution, shared_csr_matrix, obs_names, resolution + ): resolution + for resolution in par["resolution"] + } + try: + logger.info("All futures sheduled") + for done_future in as_completed(pending_futures): + resolution = pending_futures[done_future] + data = done_future.result() + logger.info(f"Processed resolution '{resolution}'") + results[str(resolution)] = data + except process.BrokenProcessPool: + # This assumes that one of the child processses was killed by the kernel + # because the oom killer was activated. This the is the most likely scenario, + # other causes could be: + # * Subprocess terminates without raising a proper exception. + # * The code of the process handling the communication is broke (i.e. a python bug) + # * The return data could not be pickled. + logger.error("BrokenProcessPool is raised") + executor.shutdown(wait=False, cancel_futures=True) + time.sleep(3) + exit_early_event.set() + time.sleep(3) + sys.exit(137) + finally: + logger.info("Closing shared resources in main process") + shared_csr_matrix.close() + obs_names.shm.close() + logger.info("Shared resources closed") + logger.info( + "Shutting down logging queue and re-enabling logging from main thread." + ) + log_listener.enqueue_sentinel() + log_listener.stop() + stdout_handler = logging.StreamHandler() + stdout_handler.setFormatter(formatter) + logger.addHandler(stdout_handler) + logger.info("Re-enabled logging in main thread.") + logger.info("Waiting for shutdown of processes") + executor.shutdown() + logger.info("Executor shut down.") + adata.obsm[par["obsm_name"]] = pd.DataFrame(results) + + output_file = Path(par["output"]) + logger.info("Writing output to %s.", par["output"]) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if par["output_compression"] + else output_file + ) + shutil.copyfile(par["input"], output_file_uncompressed) + mu.write_h5ad( + filename=output_file_uncompressed, mod=par["modality"], data=adata + ) + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, + output_file, + compression=par["output_compression"], + ) + output_file_uncompressed.unlink() + logger.info("Finished.") + log_listener.enqueue_sentinel() + time.sleep(3) + + +if __name__ == "__main__": + main() diff --git a/src/cluster/leiden/test.py b/src/cluster/leiden/test.py new file mode 100644 index 00000000..f2f64de3 --- /dev/null +++ b/src/cluster/leiden/test.py @@ -0,0 +1,113 @@ +import mudata as mu +import pytest +import sys + +## VIASH START +meta = { + "name": "foo", + "resources_dir": "resources_test/", + "cpus": 2, + "config": "./src/cluster/leiden/config.vsh.yaml", + "executable": "./target/executable/cluster/leiden/leiden", +} +## VIASH END + + +@pytest.fixture() +def input_path(): + return meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture() +def input_data(input_path): + return mu.read_h5mu(input_path) + + +@pytest.fixture() +def mudata_custom_connectivities_key(input_data, random_h5mu_path): + result = input_data.copy() + result.mod["rna"].obsp["custom_connectivities"] = ( + result.mod["rna"].obsp["connectivities"].copy() + ) + del result.mod["rna"].obsp["connectivities"] + output_path = random_h5mu_path() + result.write(output_path) + return output_path + + +# @pytest.fixture +# def random_h5mu_path(tmp_path): +# def wrapper(): +# unique_filename = f"{str(uuid.uuid4())}.h5mu" +# temp_file = tmp_path / unique_filename +# return temp_file +# return wrapper + + +@pytest.mark.parametrize("compression", ["gzip", ""]) +@pytest.mark.parametrize( + "output_key,expected_output_key", [("fooleiden", "fooleiden"), ("", "leiden")] +) +def test_leiden( + input_path, + run_component, + random_h5mu_path, + compression, + output_key, + expected_output_key, +): + output_path = random_h5mu_path() + args = ["--input", input_path, "--resolution", "1;0.25", "--output", output_path] + if compression: + args.extend(["--output_compression", compression]) + if output_key: + args.extend(["--obsm_name", output_key]) + + run_component(args) + assert output_path.exists(), "No output was created." + data = mu.read_h5mu(output_path) + assert expected_output_key in data.mod["rna"].obsm, ( + f"Expected to find key '{expected_output_key}' in .obsm" + ) + # check whether leiden.custom.resolution was found + assert "1.0" in data.mod["rna"].obsm[expected_output_key].columns, ( + "Output should contain resolution 1.0." + ) + assert "0.25" in data.mod["rna"].obsm[expected_output_key].columns, ( + "Output should contain resolution 0.25." + ) + + +def test_leiden_custom_connectivities_key( + mudata_custom_connectivities_key, run_component, random_h5mu_path +): + output_path = random_h5mu_path() + run_component( + [ + "--input", + mudata_custom_connectivities_key, + "--obsm_name", + "fooleiden", + "--resolution", + "1;0.25", + "--output", + output_path, + "--obsp_connectivities", + "custom_connectivities", + "--output_compression", + "gzip", + ] + ) + assert output_path.exists(), "No output was created." + data = mu.read_h5mu(output_path) + # check whether leiden.custom.resolution was found + assert "1.0" in data.mod["rna"].obsm["fooleiden"].columns, ( + "Output should contain resolution 1.0." + ) + assert "0.25" in data.mod["rna"].obsm["fooleiden"].columns, ( + "Output should contain resolution 0.25." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"])) diff --git a/src/compression/compress_h5mu/config.vsh.yaml b/src/compression/compress_h5mu/config.vsh.yaml new file mode 100644 index 00000000..d72377a1 --- /dev/null +++ b/src/compression/compress_h5mu/config.vsh.yaml @@ -0,0 +1,47 @@ +name: compress_h5mu +namespace: "compression" +scope: "public" +description: | + Compress a MuData file. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to the input .h5mu. + required: true + example: sample_path + direction: input + - name: "--output" + type: file + description: location of output file. + required: true + direction: output +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: ../../utils/compress_h5mu.py +test_resources: + - type: python_script + path: run_test.py + - path: /resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + +engines: + - type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [ singlecpu, lowmem ] \ No newline at end of file diff --git a/src/compression/compress_h5mu/run_test.py b/src/compression/compress_h5mu/run_test.py new file mode 100644 index 00000000..d2b720c7 --- /dev/null +++ b/src/compression/compress_h5mu/run_test.py @@ -0,0 +1,64 @@ +import sys +import pytest +import mudata as mu +from pathlib import Path +import pandas as pd + +## VIASH START +meta = { + "executable": "./target/executable/compression/compress_h5mu/compress_h5mu", + "resources_dir": "resources_test/concat_test_data/", + "config": "src/compression/compress_h5mu/config.vsh.yaml", +} +## VIASH END + + +input_file = Path( + f"{meta['resources_dir']}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +) + + +def compare_anndata(first, second): + for attr_name in ("obs", "var"): + pd.testing.assert_frame_equal( + getattr(first, attr_name), getattr(second, attr_name) + ) + + +@pytest.mark.parametrize("compression_type", ["gzip", "lzf"]) +def test_compress_h5mu(run_component, tmp_path, compression_type): + output_file = tmp_path / "output.h5mu" + + run_component( + [ + "--input", + str(input_file), + "--output", + str(output_file), + "--output_compression", + compression_type, + ] + ) + + # check whether file exists + assert output_file.is_file(), "Output file does not exist" + + # read output mudata + output = mu.read_h5mu(output_file) + uncompressed_h5mu = mu.read_h5mu(input_file) + for attr_name in ("obs", "var"): + pd.testing.assert_frame_equal( + getattr(output, attr_name), getattr(uncompressed_h5mu, attr_name) + ) + for mod_name in uncompressed_h5mu.mod: + assert mod_name in output.mod, ( + f"{mod_name} found in uncompressed file, but not in compressed output file." + ) + mod_compressed = output.mod[mod_name] + mod_uncompressed = uncompressed_h5mu.mod[mod_name] + compare_anndata(mod_compressed, mod_uncompressed) + assert output_file.stat().st_size < input_file.stat().st_size + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/compression/compress_h5mu/script.py b/src/compression/compress_h5mu/script.py new file mode 100644 index 00000000..d65408be --- /dev/null +++ b/src/compression/compress_h5mu/script.py @@ -0,0 +1,16 @@ +import sys + +### VIASH START +par = { + "input": "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + "output": "test.h5mu", + "compression": "gzip", +} +meta = {} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu + +if __name__ == "__main__": + compress_h5mu(par["input"], par["output"], compression=par["output_compression"]) diff --git a/src/compression/tar_extract/config.vsh.yaml b/src/compression/tar_extract/config.vsh.yaml new file mode 100644 index 00000000..95272eae --- /dev/null +++ b/src/compression/tar_extract/config.vsh.yaml @@ -0,0 +1,49 @@ +name: tar_extract +namespace: compression +scope: public +status: deprecated +description: Extract files from a tar archive +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input file + direction: input + example: input.tar.gz + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Folder to restore file(s) to. + direction: output + example: output_folder + required: true + - name: "--strip_components" + alternatives: ["-s"] + type: integer + description: Strip this amount of leading components from file names on extraction. For example, to extract only 'myfile.txt' from an archive containing the structure `this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'. + example: 1 + required: false + - name: "--exclude" + alternatives: ["-e"] + type: string + description: Prevents any file or member whose name matches the shell wildcard (pattern) from being extracted. + example: "docs/figures" + required: false +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../LICENSE + +engines: + - type: docker + image: ubuntu:latest + +runners: + - type: executable + - type: nextflow + directives: + label: [ singlecpu, lowmem ] diff --git a/src/compression/tar_extract/script.sh b/src/compression/tar_extract/script.sh new file mode 100755 index 00000000..523a8fb1 --- /dev/null +++ b/src/compression/tar_extract/script.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +## VIASH START + +par_input='temp/test.tar' +par_output='temp/output_folder' +par_strip_components='0' +par_exclude='docs/figures' + +## VIASH END + +extra_params=() +mkdir -p $par_output # Create output directory if it doesn't exist already + +if [ "$par_strip_components" != "" ]; then + extra_params+=("--strip-components=$par_strip_components") +fi + +if [ "$par_exclude" != "" ]; then + extra_params+=("--exclude=$par_exclude") +fi + +echo "Extracting $par_input to $par_output..." +echo "" +tar "${extra_params[@]}" -xvf "$par_input" -C "$par_output" diff --git a/src/compression/tar_extract/test.sh b/src/compression/tar_extract/test.sh new file mode 100644 index 00000000..3bb93c55 --- /dev/null +++ b/src/compression/tar_extract/test.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -ex + +INPUT_FILE="LICENSE" +OUTPUT_DIR="output/files/" +OUTPUT_FILE="${OUTPUT_DIR}${INPUT_FILE}" + +echo ">>> Check whether test file exists" +[[ ! -f ${INPUT_FILE} ]] && echo "Test file could not be found!" && exit 1 + +echo ">>> Creating tar.gz..." +tar czvf ${INPUT_FILE}.tar.gz ${INPUT_FILE} + +echo ">>> Check whether tar.gz can be extracted" +$meta_executable \ + --input "${INPUT_FILE}.tar.gz" \ + --output "$OUTPUT_DIR" + +echo ">>> Check whether extracted file exists" +[[ ! -f $OUTPUT_FILE ]] && echo "Output file could not be found!" && exit 1 + +echo ">>> Check whether input and output file are the same" +cmp $INPUT_FILE $OUTPUT_FILE || (echo "Input and output files are different!" && exit 1) + +echo ">>> Test finished successfully" diff --git a/src/convert/from_10xh5_to_h5mu/api_output.yaml b/src/convert/from_10xh5_to_h5mu/api_output.yaml new file mode 100644 index 00000000..d1db2969 --- /dev/null +++ b/src/convert/from_10xh5_to_h5mu/api_output.yaml @@ -0,0 +1,33 @@ +info: + slots: + mod: + - name: rna + required: true + description: Gene expression counts. + slots: + var: &global_var + - name: gene_symbol + type: string + description: Identification of the gene. + required: true + - name: feature_types + type: string + description: The full name of the modality. + required: true + - name: genome + type: string + description: Reference that was used to generate the data. + required: true + - name: prot + required: false + description: Protein abundancy + slots: + var: *global_var + - name: "vdj" + required: false + description: VDJ transcript counts + slots: + var: *global_var + + + diff --git a/src/convert/from_10xh5_to_h5mu/config.vsh.yaml b/src/convert/from_10xh5_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..7e299669 --- /dev/null +++ b/src/convert/from_10xh5_to_h5mu/config.vsh.yaml @@ -0,0 +1,71 @@ +name: "from_10xh5_to_h5mu" +namespace: "convert" +scope: "public" +description: | + Converts a 10x h5 into an h5mu file. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: A 10x h5 file as generated by Cell Ranger. + example: raw_feature_bc_matrix.h5 + direction: input + required: true + - name: "--input_metrics_summary" + type: file + description: A metrics summary csv file as generated by Cell Ranger. + example: metrics_cellranger.h5 + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + example: output.h5mu + direction: output + __merge__: api_output.yaml + - name: "--uns_metrics" + type: string + description: Name of the .uns slot under which to QC metrics (if any). + default: "metrics_cellranger" + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--min_genes" + type: integer + example: 100 + description: Minimum number of counts required for a cell to pass filtering. + - name: "--min_counts" + type: integer + example: 1000 + description: Minimum number of genes expressed required for a cell to pass filtering. +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: +- type: executable +- type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_10xh5_to_h5mu/script.py b/src/convert/from_10xh5_to_h5mu/script.py new file mode 100755 index 00000000..d83b55f7 --- /dev/null +++ b/src/convert/from_10xh5_to_h5mu/script.py @@ -0,0 +1,72 @@ +import mudata +import scanpy as sc +import sys +import pandas as pd + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5", + "input_metrics_summary": None, + "uns_metrics": "metrics_cellranger", + "output": "foo.h5mu", + "min_genes": None, + "min_counts": None, + "output_compression": "gzip", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_h5(par["input"], gex_only=False) + +# set the gene ids as var_names +logger.info("Renaming var columns") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# parse metrics summary file and store in .uns +if par["input_metrics_summary"] and par["uns_metrics"]: + logger.info("Reading metrics summary file '%s'", par["input_metrics_summary"]) + + def read_percentage(val): + try: + return float(val.strip("%")) / 100 + except AttributeError: + return val + + metrics_summary = pd.read_csv( + par["input_metrics_summary"], decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + logger.info("Storing metrics summary in .uns['%s']", par["uns_metrics"]) + adata.uns[par["uns_metrics"]] = metrics_summary +else: + is_none = ( + "input_metrics_summary" if not par["input_metrics_summary"] else "uns_metrics" + ) + logger.info("Not storing metrics summary because par['%s'] is None", is_none) + +# might perform basic filtering to get rid of some data +# applicable when starting from the raw counts +if par["min_genes"]: + logger.info("Filtering with min_genes=%d", par["min_genes"]) + sc.pp.filter_cells(adata, min_genes=par["min_genes"]) + +if par["min_counts"]: + logger.info("Filtering with min_counts=%d", par["min_counts"]) + sc.pp.filter_cells(adata, min_counts=par["min_counts"]) + +# generate output +logger.info("Convert to mudata") +mdata = mudata.MuData(adata) + +# override root .obs and .uns +mdata.obs = adata.obs +mdata.uns = adata.uns + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/convert/from_10xh5_to_h5mu/test.py b/src/convert/from_10xh5_to_h5mu/test.py new file mode 100644 index 00000000..8f7925bf --- /dev/null +++ b/src/convert/from_10xh5_to_h5mu/test.py @@ -0,0 +1,82 @@ +from os import path +from mudata import read_h5mu +import pytest +import sys + +## VIASH START +meta = { + "resources_dir": "resources_test/", + "config": "./src/convert/from_10xh5_to_h5mu/config.vsh.yaml", + "executable": "./target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu", +} +## VIASH END + +input = ( + meta["resources_dir"] + + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" +) +metrics = ( + meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv" +) + + +def test_run(run_component, random_h5mu_path): + output = random_h5mu_path() + cmd_pars = [ + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + # check if file exists + assert path.exists(output), "No output was created." + + # read it with scanpy + data = read_h5mu(output) + + # check whether gex was found + assert data.mod["rna"].var["feature_types"].unique() == ["Gene Expression"], ( + "Output X should only contain Gene Expression vars." + ) + + # check whether ab counts were found + assert "prot" in data.mod, 'Output should contain data.mod["rna"].' + + # check whether gene was found + assert "CD3" in data.mod["prot"].var_names, ( + 'Output should contain antibody column "CD3".' + ) + + +def test_run_with_metrics(run_component, random_h5mu_path): + output = random_h5mu_path() + cmd_pars = [ + "--input", + input, + "--output", + output, + "--input_metrics_summary", + metrics, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + # check if file exists + assert path.exists(output), "No output was created." + + # read it with scanpy + data = read_h5mu(output) + + # check whether uns slot was found + assert "metrics_cellranger" in data.uns, ( + "Output mudata object should contain an .uns slot with cellranger metrics." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_10xmtx_to_h5mu/config.vsh.yaml b/src/convert/from_10xmtx_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..13f9024a --- /dev/null +++ b/src/convert/from_10xmtx_to_h5mu/config.vsh.yaml @@ -0,0 +1,47 @@ +name: "from_10xmtx_to_h5mu" +namespace: "convert" +scope: "public" +description: | + Converts a 10x mtx into an h5mu file. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input mtx folder + example: input_dir_containing_gz_files + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + example: output.h5mu + direction: output +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: run_test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] diff --git a/src/convert/from_10xmtx_to_h5mu/run_test.py b/src/convert/from_10xmtx_to_h5mu/run_test.py new file mode 100644 index 00000000..72b05aed --- /dev/null +++ b/src/convert/from_10xmtx_to_h5mu/run_test.py @@ -0,0 +1,53 @@ +from os import path +from mudata import read_h5mu +import pytest +import sys + +## VIASH START +meta = { + "resources_dir": "resources_test/", + "config": "./src/convert/from_10xmtx_to_h5mu/config.vsh.yaml", + "executable": "./target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu", +} +## VIASH END + +input = ( + meta["resources_dir"] + + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix" +) + + +def test_run(run_component, random_h5mu_path): + output = random_h5mu_path() + cmd_pars = [ + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + # check if file exists + assert path.exists(output), "No output was created." + + # read it with scanpy + data = read_h5mu(output) + + # check whether gex was found + assert data.mod["rna"].var["feature_types"].unique() == ["Gene Expression"], ( + "Output X should only contain Gene Expression vars." + ) + + # check whether ab counts were found + assert "prot" in data.mod, 'Output should contain data.mod["rna"].' + + # check whether gene was found + assert "CD3" in data.mod["prot"].var_names, ( + 'Output should contain antibody column "CD3".' + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_10xmtx_to_h5mu/script.py b/src/convert/from_10xmtx_to_h5mu/script.py new file mode 100755 index 00000000..b4fe5e38 --- /dev/null +++ b/src/convert/from_10xmtx_to_h5mu/script.py @@ -0,0 +1,32 @@ +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix", + "output": "foo.h5mu", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_mtx(par["input"], gex_only=False) + +logger.info("Renaming keys.") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# generate output +logger.info("Convert to mudata") +mdata = mu.MuData(adata) + +# override root .obs +mdata.obs = adata.obs + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml b/src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml new file mode 100644 index 00000000..e8c9147f --- /dev/null +++ b/src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml @@ -0,0 +1,48 @@ +name: "from_bd_to_10x_molecular_barcode_tags" +namespace: "convert" +scope: "public" +description: | + Convert the molecular barcode sequence SAM tag from BD format (MA) to 10X format (UB). +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + must_exist: true + description: Input SAM or BAM file. + example: input.bam + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output alignment file. + example: output.sam + direction: output + - name: "--bam" + type: boolean_true + description: Output a BAM file. + - name: "--threads" + alternatives: ["-t"] + type: integer + description: Number of threads +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: run_test.sh + - path: /resources_test/bdrhap_5kjrt/processed/output_raw +engines: + - type: docker + image: ubuntu:latest + setup: + - type: apt + packages: [ samtools ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] diff --git a/src/convert/from_bd_to_10x_molecular_barcode_tags/run_test.sh b/src/convert/from_bd_to_10x_molecular_barcode_tags/run_test.sh new file mode 100644 index 00000000..07956d5a --- /dev/null +++ b/src/convert/from_bd_to_10x_molecular_barcode_tags/run_test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + + + +echo "> Testing missing input file can be detected." +"$meta_executable" \ + --input "oops.BAM" \ + --output "foo.bam" \ + -t 8 \ + --bam > /dev/null + +exit_code=$? +if [ $exit_code -eq 0 ]; then + echo "Test failed: non-existent input file was not detected." && exit 1 +fi + +echo "> Testing if BAM output can be created" +"$meta_executable" \ + --input "$meta_resources_dir/output_raw/Combined_sample_Bioproduct.bam" \ + --output "foo.bam" \ + -t 8 \ + --bam + +[ ! -f "foo.bam" ] && { echo Output file could not be found; exit 1; } +readarray -t output_tags < <( samtools view --no-header "foo.bam" | grep -oP "(?<=UB:Z:).*[\s]" ) +readarray -t input_tags < <( samtools view --no-header "$meta_resources_dir/output_raw/Combined_sample_Bioproduct.bam" | grep -oP "(?<=MA:Z:).*[\s]" ) +[ "${output_tags[*]}" == "${input_tags[*]}" ] || { echo "Input tags differ from output tags!"; exit 1; } + +echo "> Testing if SAM output can be created" +"$meta_executable" \ + --input "$meta_resources_dir/output_raw/Combined_sample_Bioproduct.bam" \ + --output "foo.sam" \ + -t 8 +[ ! -f "foo.sam" ] && { echo Output file could not be found; exit 1; } + +readarray -t output_tags_sam < <( samtools view --no-header "foo.sam" | grep -oP "(?<=UB:Z:).*[\s]" ) +[ "${output_tags_sam[*]}" == "${input_tags[*]}" ] || { echo "Input tags differ from output tags!"; exit 1; } + + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/convert/from_bd_to_10x_molecular_barcode_tags/script.sh b/src/convert/from_bd_to_10x_molecular_barcode_tags/script.sh new file mode 100644 index 00000000..ca9d9e70 --- /dev/null +++ b/src/convert/from_bd_to_10x_molecular_barcode_tags/script.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -eo pipefail + +# Sam tags added by BD Rhapsody Pipeline +# From: https://www.bd.com/documents/guides/user-guides/GMX_BD-Rhapsody-genomics-informatics_UG_EN.pdf +# +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | A number between 1 and 96 3 (884,736) representing a unique cell label sequence | +# | | (CB = 0 when no cell label sequence is detected) | +# ----------------------------------------------------------------------------------------- +# | MR | Raw molecular identifier sequence | +# ----------------------------------------------------------------------------------------- +# | MA | RSEC-adjusted molecular identifier sequence. If not a true cell, the raw UMI is | +# | | repeated in this tag. | +# ----------------------------------------------------------------------------------------- +# | PT | T if a poly(T) tail was found in the expected position on R1, or F if poly(T) | +# | | was not found | +# ----------------------------------------------------------------------------------------- +# | CN | Indicates if a sequence is derived from a putative cell, as determined by the | +# | | cell label filtering algorithm (T: putative cell; x: invalid cell label or noise | +# | | cell) Note: You can distinguish between an invalid cell label and a noise cell | +# | | with the CB tag (invalid cell labels are 0). | +# ----------------------------------------------------------------------------------------- +# | ST | The value is 1-12, indicating the Sample Tag of the called putative cell, or M | +# | | for multiplet, or x for undetermined. | +# ========================================================================================= + + +# SAM tags added by 10X +# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | Chromium cellular barcode sequence that is error-corrected and confirmed against | +# | | a list of known-good barcode sequences. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. | +# ----------------------------------------------------------------------------------------- +# | CR | Chromium cellular barcode sequence as reported by the sequencer. For multiplex | +# | | Fixed RNA Profiling, the cellular barcode is a combination of the 10x GEM | +# | | Barcode and Probe Barcode sequences. | +# ----------------------------------------------------------------------------------------- +# | CY | Chromium cellular barcode read quality. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | UB | Chromium molecular barcode sequence that is error-corrected among other | +# | | molecular barcodes with the same cellular barcode and gene alignment. | +# ----------------------------------------------------------------------------------------- +# | UR | Chromium molecular barcode sequence as reported by the sequencer. | +# ----------------------------------------------------------------------------------------- +# | UY | Chromium molecular barcode read quality. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | TR | Trimmed sequence. For the Single Cell 3' v1 chemistry, this is trailing sequence | +# | | following the UMI on Read 2. For the Single Cell 3' v2 chemistry, this is | +# | | trailing sequence following the cell and molecular barcodes on Read 1. | +# ========================================================================================= + +extra_params=() + +if [ "$par_bam" == "true" ]; then + extra_params+=("--bam") +fi + +cat \ + <(samtools view -SH "$par_input") \ + <(samtools view "$par_input" | grep "MA:Z:*" | sed "s/MA:Z:/UB:Z:/" ) | \ +samtools view -Sh "${extra_params[@]}" -@"$par_threads" - > "$par_output" \ No newline at end of file diff --git a/src/convert/from_bdrhap_to_h5mu/config.vsh.yaml b/src/convert/from_bdrhap_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..3968e7b7 --- /dev/null +++ b/src/convert/from_bdrhap_to_h5mu/config.vsh.yaml @@ -0,0 +1,56 @@ +name: "from_bdrhap_to_h5mu" +namespace: "convert" +scope: "public" +description: | + Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: string + description: "A sample ID." + example: my_id + required: true + - name: "--input" + alternatives: [-i] + type: file + description: The output h5mu of a BD Rhapsody workflow. + required: true + example: sample.h5mu + - name: Outputs + arguments: + - name: "--output" + alternatives: [-o] + direction: output + type: file + description: "Output h5mu file." + required: true + example: output.h5mu +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/bdrhap_5kjrt/processed/output_raw/sample.h5mu +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] diff --git a/src/convert/from_bdrhap_to_h5mu/script.py b/src/convert/from_bdrhap_to_h5mu/script.py new file mode 100644 index 00000000..a1d9b917 --- /dev/null +++ b/src/convert/from_bdrhap_to_h5mu/script.py @@ -0,0 +1,50 @@ +import mudata as mu + +## VIASH START +par = { + "id": "sample", + "input": "resources_test/bdrhap_5kjrt/processed/output_raw/sample.h5mu", + "output": "bd_rhap_to_h5mu_test.h5mu", + "output_compression": None, +} +## VIASH END + +print(">> Reading input file", flush=True) +mdata = mu.read_h5mu(par["input"]) + +# Check if modalities are present +modalities = list(mdata.mod.keys()) +assert len(modalities) > 0, "No modalities found in input data" + + +def process_modality_inline(adata, modality): + adata.obs["library_id"] = " & ".join(adata.uns["Pipeline_Inputs"]["Libraries"]) + adata.obs["cell_id"] = adata.obs.index + adata.obs["run_id"] = par["id"] + + adata.obs.rename( + columns={"Sample_Tag": "sample_tag", "Sample_Name": "sample_id"}, inplace=True + ) + + adata.var["gene_ids"] = adata.var.index + adata.var["gene_name"] = adata.var.index + + if modality == "rna": + adata.var["feature_type"] = "Gene Expression" + adata.var["reference_file"] = adata.uns["Pipeline_Inputs"]["Reference_Archive"] + + elif modality == "prot": + adata.var["feature_type"] = "Antibody Capture" + adata.var["reference_file"] = " & ".join( + adata.uns["Pipeline_Inputs"]["AbSeq_Reference"] + ) + + # TODO: add other modalities + + +for key, value in mdata.mod.items(): + print(">> Processing modality:", key, flush=True) + process_modality_inline(value, key) + +print(">> Writing output file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/convert/from_bdrhap_to_h5mu/test.py b/src/convert/from_bdrhap_to_h5mu/test.py new file mode 100644 index 00000000..9e4f218a --- /dev/null +++ b/src/convert/from_bdrhap_to_h5mu/test.py @@ -0,0 +1,81 @@ +import subprocess +from os import path +import mudata as mu +import numpy as np + +## VIASH START +meta = { + "executable": "target/docker/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu", + "resources_dir": "resources_test/bdrhap_5kjrt/processed/output_raw/", +} +## VIASH END + +input = meta["resources_dir"] + "/sample.h5mu" +output = "output1.h5mu" + +cmd_pars = [ + meta["executable"], + "--input", + input, + "--output", + output, + "--id", + "foo", + "--output_compression", + "gzip", +] +out = subprocess.check_output(cmd_pars).decode("utf-8") + +print(">> Check if output exists", flush=True) +assert path.exists(output), "No output was created." + + +print(">> Check contents of output", flush=True) +data = mu.read_h5mu(output) +rna_adata = data.mod["rna"] +prot_adata = data.mod["prot"] + +# check whether correct feature types are detected +assert np.array_equal(rna_adata.var["feature_type"].unique(), ["Gene Expression"]), ( + "RNA expression should only contain Gene Expression vars." +) +assert np.array_equal( + rna_adata.var["reference_file"].unique(), ["reference_bd_rhapsody.tar.gz"] +), "Wrong reference file detected for Gene Expression vars." +assert "ADAMTSL4" in rna_adata.var_names, 'RNA modality should contain gene "ADAMTS4".' +assert np.array_equal( + rna_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"] +), "Gene Expression .obs library_id should equal '12ABC & 12WTA." +assert "sample_tag" in rna_adata.obs.keys(), ( + "RNA modality should contain column 'sample_id'." +) +assert "sample_id" in rna_adata.obs.keys(), ( + "RNA modality should contain column 'sample_name'." +) + +assert np.array_equal(prot_adata.var["feature_type"].unique(), ["Antibody Capture"]), ( + "RNA expression should only contain Antibody Capture vars." +) +assert np.array_equal( + prot_adata.var["reference_file"].unique(), ["BDAbSeq_ImmuneDiscoveryPanel.fasta"] +), "Wrong reference file detected for Antibody Capture vars." +assert "CD279:EH12-1|PDCD1|AHS0014|pAbO" in prot_adata.var_names, ( + 'Protein modality should contain protein "CD279:EH12-1|PDCD1|AHS0014|pAbO".' +) +assert np.array_equal( + prot_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"] +), "Antibody Capture .obs library_id should equal '12ABC & 12WTA." +assert "sample_tag" in prot_adata.obs.keys(), ( + "Protein modality should contain column 'sample_id'." +) +assert "sample_id" in prot_adata.obs.keys(), ( + "Protein modality should contain column 'sample_name'." +) + +# check whether gene was found +assert "PDE4DIP" in data.var_names, 'Output should contain gex column "PDE4DIP".' +assert "CD279:EH12-1|PDCD1|AHS0014|pAbO" in data.var_names, ( + 'Output should contain abc column "CD279:EH12-1|PDCD1|AHS0014|pAbO".' +) + +print("> Test successful", flush=True) diff --git a/src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml b/src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..1764322c --- /dev/null +++ b/src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml @@ -0,0 +1,88 @@ +name: "from_cellranger_multi_to_h5mu" +namespace: "convert" +scope: "public" +description: | + Converts the output from cellranger multi to a single .h5mu file. + By default, will map the following library type names to modality names: + - Gene Expression: rna + - Peaks: atac + - Antibody Capture: prot + - VDJ: vdj + - VDJ-T: vdj_t + - VDJ-B: vdj_b + - CRISPR Guide Capture: crispr + - Multiplexing Capture: hashing + + Other library types have their whitepace removed and dashes replaced by + underscores to generate the modality name. + + Currently does not allow parsing the output from cell barcode demultiplexing. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input folder. Must contain the output from a cellranger multi run. + example: input_dir_containing_modalities + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + multiple: true + description: | + Locations for the output files. Must contain a wildcard (*) character, + which will be replaced with the sample name. + example: "*.h5mu" + direction: output + - name: "--sample_csv" + type: file + description: CSV file describing the sample name per output file + direction: output + example: "samples.csv" + - name: "--uns_metrics" + type: string + description: Name of the .uns slot under which to QC metrics (if any). + default: "metrics_cellranger" +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/10x_5k_anticmv + - path: /resources_test/10x_5k_lung_crispr + - path: /resources_test/10x_5k_beam + - path: /resources_test/10x_5k_fixed + - path: /resources_test/10x_4plex_dtc +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + # Starting from version 13, scirpy started using awkward arrays + # https://scirpy.scverse.org/en/latest/changelog.html#v0-13-0-new-data-structure-based-on-awkward-arrays + # But awkward arrays are still considered experimental by anndata: + # https://anndata.readthedocs.io/en/latest/tutorials/notebooks/awkward-arrays.html + # Officially, scirpy supports numpy 2.0 from version 17 onwards, but pinning <13 seems to work. + - scirpy~=0.12.0 + - pandas~=2.2.3 + - pytest # Scirpy has patsy as a dependency and it requires pytest + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] diff --git a/src/convert/from_cellranger_multi_to_h5mu/script.py b/src/convert/from_cellranger_multi_to_h5mu/script.py new file mode 100644 index 00000000..8ab1abda --- /dev/null +++ b/src/convert/from_cellranger_multi_to_h5mu/script.py @@ -0,0 +1,453 @@ +from pathlib import Path +import sys +import scanpy +import pandas as pd +import mudata +import numpy as np +from scirpy.io import read_10x_vdj +from collections import defaultdict +from functools import partial +import json +import csv +import tempfile + + +## VIASH START +par = { + "input": "resources_test/10x_4plex_dtc/processed/10x_4plex_dtc.cellranger_multi.output", + "output": "*.h5mu", + "uns_metrics": "metrics_cellranger", + "output_compression": "gzip", +} +meta = {"resources_dir": "./src/utils/"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +POSSIBLE_LIBRARY_TYPES = ( + "vdj_t", + "vdj_b", + "vdj_t_gd", + "count", + "antigen_analysis", + "multiplexing_analysis", +) + +FEATURE_TYPES_NAMES = { + "Gene Expression": "rna", + "Peaks": "atac", + "Antibody Capture": "prot", + "VDJ": "vdj", + "VDJ-T": "vdj_t", + "VDJ-B": "vdj_b", + "CRISPR Guide Capture": "gdo", + "Multiplexing Capture": "hto", + "Antigen Capture": "antigen", + "Custom": "custom", +} + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. `IntegerArray` and `BooleanArray` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def gather_input_data(dir: Path): + # / + # +-- multi + # | +-- count (raw output) + # | | +-- feature_reference.csv + # | | +-- raw_feature_bc_matrix.h5 + # | +-- vdj_t + # | | +-- all_contig_annotations.json + # | +-- vdj_b + # | | +-- all_contig_annotations.json + # | +-- vdj_t_gd + # | | +-- all_contig_annotations.json + # | +-- multiplexing_analysis + # | +-- cells_per_tag.json + # +-- per_sample_outs (filtered outputs) + # +-- example_1 + # +-- antigen_analysis + # | +-- per_barcode.csv + # | +-- antigen_specificity_scores.csv + # +-- count + # | +-- antibody_analysis + # | +-- crispr_analysis + # | +-- perturbation_efficiencies_by_feature.csv + # | +-- perturbation_efficiencies_by_target.csv + # +-- vdj_t (unused) + # +-- vdj_b (unused) + # +-- vdj_t_gd (unused) + # +-- metrics_summary.csv + + if not dir.is_dir(): + raise ValueError("Specified input is not a directory.") + folder_contents = list(dir.iterdir()) + config = dir / "config.csv" + if config not in folder_contents: + logger.warning( + "Config.csv not found in input directory, this folder might not be a valid cellranger multi output." + ) + + required_subfolders = [ + dir / subfolder_name for subfolder_name in ("multi", "per_sample_outs") + ] + found_input = {key_: {} for key_ in POSSIBLE_LIBRARY_TYPES} + for required_subfolder in required_subfolders: + if required_subfolder not in folder_contents: + raise ValueError( + f"Input folder must contain the subfolder {required_subfolder} please make " + "sure that the specified input folder is a valid cellranger multi output." + ) + + multi_dir = dir / "multi" + for library_type in multi_dir.iterdir(): + if not library_type.is_dir(): + logger.warning( + "%s is not a directory. Contents of the multi folder " + "must be directories to be recognized as valid input data", + library_type, + ) + continue + if library_type.name not in POSSIBLE_LIBRARY_TYPES: + raise ValueError( + f"Contents of the 'multi' folder must be found one of the following: {','.join(POSSIBLE_LIBRARY_TYPES)}." + ) + + found_input[library_type.name] = library_type + + per_sample_outs_dir = dir / "per_sample_outs" + samples_dirs = [ + samplepath + for samplepath in per_sample_outs_dir.iterdir() + if samplepath.is_dir() + ] + for samples_dir in samples_dirs: + for file_part in ( + "metrics_summary.csv", + "count/feature_reference.csv", + "count/crispr_analysis/perturbation_efficiencies_by_feature.csv", + "count/crispr_analysis/perturbation_efficiencies_by_target.csv", + "antigen_analysis", + ): + found_file = samples_dir / file_part + if found_file.exists(): + file_name = found_file.name.removesuffix(".csv") + found_input.setdefault(file_name, {})[samples_dir.name] = found_file + + return found_input + + +def proces_perturbation( + key_name: str, mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample_name, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample_name] + assert "gdo" in mudata_obj.mod + eff_df = pd.read_csv( + efficiency_file, + index_col="Perturbation", + sep=",", + decimal=".", + quotechar='"', + ) + mudata_obj.mod["gdo"].uns[key_name] = eff_df + return mudatas + + +def process_feature_reference( + mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample] + df = pd.read_csv( + efficiency_file, index_col="id", sep=",", decimal=".", quotechar='"' + ) + assert "feature_type" in df.columns, ( + "Columns 'feature_type' should be present in features_reference file." + ) + feature_types = df["feature_type"] + missing_features = set(feature_types) - set(FEATURE_TYPES_NAMES) + if missing_features: + raise ValueError( + "Not all feature types present in the features_reference file are supported by this component.\n" + f"Missing support for features: {','.join(missing_features)}." + ) + for feature_type in feature_types: + modality = FEATURE_TYPES_NAMES[feature_type] + subset_df = df.loc[df["feature_type"] == feature_type] + mudata_obj.mod[modality].uns["feature_reference"] = subset_df + return mudatas + + +def process_counts(counts_folder: Path, multiplexing_info, metrics_files): + counts_matrix_file = counts_folder / "raw_feature_bc_matrix.h5" + logger.info("Reading %s.", counts_matrix_file) + adata = scanpy.read_10x_h5(counts_matrix_file, gex_only=False) + + # set the gene ids as var_names + logger.info("Renaming var columns") + adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + + # generate output + logger.info("Convert to mudata") + + def modality_name_factory(library_type): + return ("".join(library_type.replace("-", "_").split())).lower() + + feature_types = defaultdict(modality_name_factory, FEATURE_TYPES_NAMES) + mudata_all_samples = mudata.MuData(adata, feature_types_names=feature_types) + if multiplexing_info: + # Get the mapping between the barcode and the sample ID from one of the metrics files + metrics_file = pd.read_csv( + list(metrics_files.values())[0], decimal=".", quotechar='"', thousands="," + ) + sample_ids = metrics_file[metrics_file["Metric Name"] == "Sample ID"] + barcode_sample_mapping = ( + sample_ids.loc[:, ["Group Name", "Metric Value"]] + .set_index("Group Name") + .squeeze() + .to_dict() + ) + # When probe barcodes are combined with multiplexing barcodes; the probe barcode (BC) + # is paired with other sample with other sample information using the syntax "{BC}+{antibody}" + # (or even "{BC}+{antibody}+{crispr guide}"). The cells_per_tag + # file only encodes the BCs; so we need to strip the other information. + barcode_sample_mapping = { + key_.partition("+")[0]: value_ + for key_, value_ in barcode_sample_mapping.items() + } + return split_samples( + mudata_all_samples, multiplexing_info, barcode_sample_mapping + ) + return {"run": mudata_all_samples} + + +def split_samples(mudata_obj, multiplexing_analysis_folder, barcode_sample_mapping): + result = {} + cells_per_tag_file = multiplexing_analysis_folder / "cells_per_tag.json" + with cells_per_tag_file.open("r") as open_json: + sample_cell_mapping = json.load(open_json) + + for barcode, indices in sample_cell_mapping.items(): + if indices: + sample_mudata = mudata_obj[indices] + sample = barcode_sample_mapping[barcode] + result[sample] = sample_mudata.copy() + return result + + +def process_metrics_summary( + mudatas: dict[str, mudata.MuData], metrics_files: dict[str, Path] +): + def read_percentage(val): + try: + if str(val).endswith("%"): + return float(val.strip("%")) / 100 + else: + return val + except (AttributeError, ValueError): + return val + + for sample, mudata_obj in mudatas.items(): + metrics_file = metrics_files[sample] + metrics_summary = pd.read_csv( + metrics_file, decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + mudata_obj.uns[par["uns_metrics"]] = metrics_summary + for colname, coldata in metrics_summary.items(): + try: + new_column = coldata.astype(str, copy=True).astype( + {colname: "category"} + ) + metrics_summary[colname] = new_column + except (ValueError, TypeError): + logger.warning(f"Could not store column {colname} from metrics.") + pass + return mudatas + + +def process_antigen_analysis( + mudatas: dict[str, mudata.MuData], antigen_analysis_folder_paths: dict[str, Path] +): + for sample_id, mudata_obj in mudatas.items(): + antigen_analysis_folder_path = antigen_analysis_folder_paths[sample_id] + assert "antigen" in mudata_obj.mod + per_barcodes_file = antigen_analysis_folder_path / "per_barcode.csv" + assert per_barcodes_file.is_file(), ( + "Expected a per_barcode.csv file to be present." + ) + per_barcodes_df = pd.read_csv( + per_barcodes_file, index_col="barcode", sep=",", decimal=".", quotechar='"' + ) + is_gex_cell = per_barcodes_df["is_gex_cell"] + assert len(set(is_gex_cell.unique().tolist()) - set([False, True])) == 0, ( + "Expected 'is_gex_cell' column to be boolean. Please report this as a bug." + ) + barcodes_in_gex = per_barcodes_df[is_gex_cell] + # All of the barcodes listed in the per_barcode.csv with is_gex_cell set to 'True' + # must be in the 'rna' (an thus also 'antigen') modality + assert barcodes_in_gex.index.difference(mudata_obj["rna"].obs_names).empty + orig_obs_names = mudata_obj["antigen"].obs_names.copy() + mudata_obj["antigen"].obs = cast_to_writeable_dtype( + pd.concat( + [mudata_obj["antigen"].obs, barcodes_in_gex], + axis="columns", + join="outer", + verify_integrity=True, + sort=False, + ) + ) + assert orig_obs_names.equals(mudata_obj["antigen"].obs_names) + del orig_obs_names + + # The antigen_specificity_scores.csv file is only present when cellranger + # multi was run with a [antigen-specificity] section in config + specificity_file = ( + antigen_analysis_folder_path / "antigen_specificity_scores.csv" + ) + if specificity_file.is_file(): + antigen_scores_df = pd.read_csv( + specificity_file, + index_col=["barcode", "antigen"], + sep=",", + decimal=".", + quotechar='"', + ) + score = antigen_scores_df.unstack() + assert score.index.difference(mudata_obj["rna"].obs_names).empty + antigens = score.columns.unique(level="antigen") + for antigen in antigens: + score_antigen = score.loc[:, (slice(None), antigen)].droplevel( + "antigen", axis=1 + ) + score_antigen = score_antigen.reindex(mudata_obj["rna"].obs_names) + mudata_obj["antigen"].obsm[f"antigen_specificity_scores_{antigen}"] = ( + cast_to_writeable_dtype(score_antigen) + ) + return mudatas + + +def process_vdj(mudatas: dict[str, mudata.MuData], vdj_folder_path: str): + # https://scverse.org/scirpy/latest/generated/scirpy.io.read_10x_vdj.html#scirpy-io-read-10x-vdj + # According to docs, using the json is preferred as this file includes intron info. + all_config_json_file = vdj_folder_path / "all_contig_annotations.json" + vdj_type = vdj_folder_path.name + with all_config_json_file.open("r") as open_json: + json_obj = json.load(open_json) + for _, mudata_obj in mudatas.items(): + json_for_sample = [ + entry for entry in json_obj if entry["barcode"] in mudata_obj.obs_names + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as tfile: + json.dump(json_for_sample, tfile, indent=4) + tfile.flush() + vdj_anndata = read_10x_vdj(tfile.name) + mudata_obj.mod[vdj_type] = vdj_anndata + return mudatas + + +def get_modalities(input_data): + dispatcher = { + "multiplexing_analysis": split_samples, + "vdj_t": process_vdj, + "vdj_b": process_vdj, + "vdj_t_gd": process_vdj, + "metrics_summary": process_metrics_summary, + "feature_reference": process_feature_reference, + "perturbation_efficiencies_by_feature": partial( + proces_perturbation, "perturbation_efficiencies_by_feature" + ), + "perturbation_efficiencies_by_target": partial( + proces_perturbation, "perturbation_efficiencies_by_target" + ), + "antigen_analysis": process_antigen_analysis, + } + mudata_per_sample = process_counts( + input_data["count"], + input_data["multiplexing_analysis"], + input_data["metrics_summary"], + ) + for modality_name, modality_data_path in input_data.items(): + if ( + modality_name in ("count", "multiplexing_analysis") + or not modality_data_path + ): + continue + try: + parser_function = dispatcher[modality_name] + except KeyError as e: + raise ValueError( + "This component does not support the " + f"parsing of the '{modality_name}' yet." + ) from e + mudata_per_sample = parser_function(mudata_per_sample, modality_data_path) + return mudata_per_sample + + +def main(): + cellranger_multi_dir = Path(par["input"]) + # TODO: remove when issue https://github.com/viash-io/viash/issues/706 is resolved. + if isinstance(par["output"], (list, set, tuple)): + assert len(par["output"]) == 1, ( + "A single output file template should have been provided." + ) + par["output"] = par["output"][0] + assert par["output"].count("*") == 1, ( + f"Expected exactly one wildcard character (*) in output " + f"files template ({par['output']}). Found {par['output'].count('*')}" + ) + input_data = gather_input_data(cellranger_multi_dir) + result = get_modalities(input_data) + output_files = { + par["output"].replace("*", sample_name) for sample_name in result.keys() + } + assert len(output_files) == len(result.keys()), ( + "Replacing the wildcard in the output files " + "template did not produce unique file paths." + ) + logger.info( + "Writing output for samples: '%s' to '%s'", + "".join(result.keys()), + par["output"], + ) + with Path(par["sample_csv"]).open("w", newline="") as open_csv: + csvwriter = csv.DictWriter(open_csv, fieldnames=["sample_name", "file"]) + csvwriter.writeheader() + for sample_name, mudata_obj in result.items(): + output_file = Path(par["output"].replace("*", sample_name)) + mudata_obj.write_h5mu(output_file, compression=par["output_compression"]) + csvwriter.writerow({"sample_name": sample_name, "file": output_file.name}) + + +if __name__ == "__main__": + main() diff --git a/src/convert/from_cellranger_multi_to_h5mu/test.py b/src/convert/from_cellranger_multi_to_h5mu/test.py new file mode 100644 index 00000000..2be73b36 --- /dev/null +++ b/src/convert/from_cellranger_multi_to_h5mu/test.py @@ -0,0 +1,235 @@ +import sys +import pytest +from mudata import read_h5mu + +## VIASH START +meta = { + "executable": "./target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu", + "resources_dir": "resources_test/", + "config": "src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml", +} +## VIASH END + +input_anticmv = f"{meta['resources_dir']}/10x_5k_anticmv/processed/10x_5k_anticmv.cellranger_multi.output" +input_lung_crispr = f"{meta['resources_dir']}/10x_5k_lung_crispr/processed/10x_5k_lung_crispr.cellranger_multi.output.output" +input_beam = ( + f"{meta['resources_dir']}/10x_5k_beam/processed/10x_5k_beam.cellranger_multi.output" +) +input_fixed_rna = f"{meta['resources_dir']}/10x_5k_fixed/processed/10x_5k_fixed.cellranger_multi.output" + + +def test_cellranger_multi_basic(run_component, tmp_path): + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + # run component + run_component( + [ + "--input", + input_anticmv, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + assert len(samples) == 1 + output_path = samples[0] + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(converted_data.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + converted_data.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) + # Check that a metric that is stored as percentage (e.g "85.69%") is correctly represented + # as a floating point number + metrics_df_with_index = converted_data.uns["metrics_cellranger"].set_index( + ["Metric Name", "Library Type", "Category"] + ) + percentage = metrics_df_with_index.loc[ + ("Confidently mapped reads in cells", "Gene Expression", "Cells"), + "Metric Value", + ] + assert percentage.iloc[0] == "0.8569" + + thousand_delimited_number = metrics_df_with_index.loc[ + ("Cells", "Gene Expression", "Cells"), "Metric Value" + ] + thousand_delimited_number == "3,798" + + smaller_number = metrics_df_with_index.loc[ + ("Median genes per cell", "Gene Expression", "Cells"), "Metric Value" + ] + smaller_number == "6" + + +def test_cellranger_multi_to_h5mu_crispr(run_component, tmp_path): + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + + # run component + run_component( + [ + "--input", + input_lung_crispr, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + assert len(samples) == 1 + output_path = samples[0] + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "gdo"] + assert list(converted_data.uns.keys()) == ["metrics_cellranger"] + assert "perturbation_efficiencies_by_feature" in converted_data.mod["gdo"].uns + assert "perturbation_efficiencies_by_target" in converted_data.mod["gdo"].uns + assert "feature_reference" not in converted_data.mod["rna"].uns + assert "feature_reference" in converted_data.mod["gdo"].uns + + +def test_cellranger_multi_to_h5mu_beam(run_component, tmp_path): + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + + # run component + run_component( + [ + "--input", + input_beam, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + assert len(samples) == 1 + output_path = samples[0] + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "antigen", "vdj_t"] + assert "antigen_specificity_scores_CMV_B0702" in converted_data["antigen"].obsm + assert "antigen_specificity_scores_Flu_A0201" in converted_data["antigen"].obsm + + +def test_cellranger_multi_to_h5mu_fixed_rna(run_component, tmp_path): + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + + # run component + run_component( + [ + "--input", + input_fixed_rna, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + sample_names = {item.name.removesuffix(".h5mu") for item in samples} + assert sample_names == { + "Colorectal_BC3", + "Liver_BC1", + "Ovarian_BC2", + "Pancreas_BC4", + } + for output_path in samples: + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "prot"] + + +def test_custom_modality(run_component, tmp_path): + input_dir = f"{meta['resources_dir']}/10x_5k_anticmv/processed_with_custom/10x_5k_anticmv.cellranger_multi.output" + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + # run component + run_component( + [ + "--input", + input_dir, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + assert len(samples) == 1 + output_path = samples[0] + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "custom", "vdj_t"] + assert "feature_reference" in converted_data.mod["custom"].uns + + +def test_antibody_and_cell_barcode_probes(run_component, tmp_path): + input_dir = f"{meta['resources_dir']}/10x_4plex_dtc/processed/10x_4plex_dtc.cellranger_multi.output" + output_dir = tmp_path / "converted" + output_path_template = output_dir / "*.h5mu" + samples_csv = tmp_path / "samples.csv" + # run component + run_component( + [ + "--input", + input_dir, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) + assert output_dir.is_dir() + + # check output + samples = [item for item in output_dir.iterdir() if item.is_file()] + assert len(samples) == 4 + output_path = samples[0] + converted_data = read_h5mu(output_path) + assert list(converted_data.mod.keys()) == ["rna", "prot"] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5ad_to_h5mu/config.vsh.yaml b/src/convert/from_h5ad_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..33c43d7c --- /dev/null +++ b/src/convert/from_h5ad_to_h5mu/config.vsh.yaml @@ -0,0 +1,58 @@ +name: from_h5ad_to_h5mu +namespace: "convert" +scope: "public" +description: | + Converts a single layer h5ad file into a single MuData object +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5ad files + default: input.h5ad + direction: input + required: true + multiple: true + - name: "--modality" + description: | + List of names to use for the modalities. Will be used as the keys in the .mod attribute in the output MuData object + The number of items provided for this argument equal the number of input files (--input) and their order should match. + type: string + default: "rna" + required: false + multiple: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output MuData file. + default: output.h5mu + direction: output +__merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: +- type: executable +- type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5ad_to_h5mu/script.py b/src/convert/from_h5ad_to_h5mu/script.py new file mode 100755 index 00000000..21a8ae8b --- /dev/null +++ b/src/convert/from_h5ad_to_h5mu/script.py @@ -0,0 +1,42 @@ +import mudata as mu +import anndata +import sys + +## VIASH START +par = { + "input": [ + "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5ad" + ], + "modality": ["rna"], + "output": "output.h5mu", + "output_compression": "gzip", + "conversions_obsm": '{"counts_antibody":"prot", "counts_custom": "custom"}', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +assert len(par["input"]) == len(par["modality"]), ( + "Number of input files should be the same length as the number of modalities" +) + +logger.info("Reading input files") +data = { + key: anndata.read_h5ad(path) for key, path in zip(par["modality"], par["input"]) +} + +logger.info("Converting to mudata") +mudata = mu.MuData(data) + +try: + mudata.var_names_make_unique() +except (TypeError, ValueError): + pass + +logger.info("Writing to %s.", par["output"]) +mudata.write_h5mu(par["output"], compression=par["output_compression"]) + +logger.info("Finished") diff --git a/src/convert/from_h5ad_to_h5mu/test.py b/src/convert/from_h5ad_to_h5mu/test.py new file mode 100644 index 00000000..2612197a --- /dev/null +++ b/src/convert/from_h5ad_to_h5mu/test.py @@ -0,0 +1,53 @@ +import sys +import pytest +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu", + "config": "./src/convert/from_h5ad_to_h5mu/config.vsh.yaml", +} +## VIASH END + +input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_run(run_component, random_h5mu_path, random_path): + mdata = mu.read_h5mu(input) + tmp_rna = random_path(extension="h5ad") + tmp_prot = random_path(extension="h5ad") + mdata.mod["rna"].write_h5ad(tmp_rna) + mdata.mod["prot"].write_h5ad(tmp_prot) + + tmp_output = random_h5mu_path() + + cmd_pars = [ + "--modality", + "rna", + "--input", + tmp_rna, + "--modality", + "prot", + "--input", + tmp_prot, + "--output", + tmp_output, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert tmp_output.is_file(), "No output was created." + + mdata2 = mu.read_h5mu(tmp_output) + + assert list(mdata2.mod.keys()) == ["rna", "prot"] + + assert_annotation_objects_equal(mdata2.mod["rna"], tmp_rna) + assert_annotation_objects_equal(mdata2.mod["prot"], tmp_prot) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5ad_to_seurat/config.vsh.yaml b/src/convert/from_h5ad_to_seurat/config.vsh.yaml new file mode 100644 index 00000000..325f4dc5 --- /dev/null +++ b/src/convert/from_h5ad_to_seurat/config.vsh.yaml @@ -0,0 +1,53 @@ +name: "from_h5ad_to_seurat" +namespace: "convert" +scope: "public" +description: | + Converts an h5ad file into a Seurat file. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5ad file + direction: input + required: true + example: input.h5ad + - name: "--assay" + type: string + default: "RNA" + description: Name of the assay to be created. + - name: "--output" + alternatives: ["-o"] + type: file + description: Output Seurat file + direction: output + required: true + example: output.rds +resources: + - type: r_script + path: script.R +test_resources: + - type: r_script + path: test.R + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad +engines: + - type: docker + image: rocker/r2u:24.04 + setup: + - type: apt + packages: + - libhdf5-dev + - libgeos-dev + - type: r + cran: [ hdf5r, Seurat, SeuratObject ] + github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a + test_setup: + - type: r + cran: [ testthat ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5ad_to_seurat/script.R b/src/convert/from_h5ad_to_seurat/script.R new file mode 100644 index 00000000..804ecbfb --- /dev/null +++ b/src/convert/from_h5ad_to_seurat/script.R @@ -0,0 +1,19 @@ +library(anndataR) + +### VIASH START +par <- list( + input = "data_split/split_data.h5ad", + output = "data_split/split_data_anndatar.rds", + assay = "RNA" +) +### VIASH END + + +seurat_obj <- read_h5ad( + par$input, + mode = "r", + as = "Seurat", + assay_name = par$assay +) + +saveRDS(seurat_obj, file = par$output) diff --git a/src/convert/from_h5ad_to_seurat/test.R b/src/convert/from_h5ad_to_seurat/test.R new file mode 100644 index 00000000..a8689407 --- /dev/null +++ b/src/convert/from_h5ad_to_seurat/test.R @@ -0,0 +1,46 @@ +library(testthat, warn.conflicts = FALSE) +library(hdf5r) + +## VIASH START +meta <- list( + executable = "target/docker/convert/from_h5mu_to_seurat/from_h5ad_to_seurat", + resources_dir = "resources_test", + name = "from_h5mu_to_seurat" +) +## VIASH END + +cat("> Checking whether output is correct\n") + +in_h5ad <- paste0( + meta[["resources_dir"]], + "/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad" +) +out_rds <- "output.rds" + +cat("> Running ", meta[["name"]], "\n", sep = "") +out <- processx::run( + meta[["executable"]], + c( + "--input", in_h5ad, + "--output", out_rds + ) +) + +cat("> Checking whether output file exists\n") +expect_equal(out$status, 0) +expect_true(file.exists(out_rds)) + +cat("> Reading output file\n") +obj <- readRDS(file = out_rds) + +cat("> Checking whether Seurat object is in the right format\n") +expect_is(obj, "Seurat") +expect_equal(names(slot(obj, "assays")), "RNA") + +open_file <- H5File$new(in_h5ad, mode = "r+") + +dim_rds <- dim(obj) +dim_ad <- open_file[["X"]]$attr_open("shape")$read() + +expect_equal(dim_rds[1], dim_ad[2]) +expect_equal(dim_rds[2], dim_ad[1]) diff --git a/src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml new file mode 100644 index 00000000..20839548 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml @@ -0,0 +1,56 @@ +name: "from_h5mu_or_h5ad_to_seurat" +namespace: "convert" +scope: "public" +description: | + Converts an h5ad file or a single modality of an h5mu file into a Seurat file. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5ad or h5mu file + direction: input + required: true + example: input.h5ad + - name: "--modality" + type: string + default: "rna" + required: false + description: Modality to be converted if the input file is an h5mu file. + - name: "--assay" + type: string + default: "RNA" + description: Name of the assay to be created. + - name: "--output" + alternatives: ["-o"] + type: file + description: Output Seurat file + direction: output + required: true + example: output.rds +resources: + - type: r_script + path: script.R +test_resources: + - type: r_script + path: test.R + - path: /resources_test/pbmc_1k_protein_v3/ +engines: + - type: docker + image: rocker/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, hdf5-tools ] + - type: r + cran: [ anndata, hdf5r, Seurat, SeuratObject ] + github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a + test_setup: + - type: r + cran: [ testthat ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5mu_or_h5ad_to_seurat/script.R b/src/convert/from_h5mu_or_h5ad_to_seurat/script.R new file mode 100644 index 00000000..86515882 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_seurat/script.R @@ -0,0 +1,81 @@ +library(anndataR) +library(hdf5r) + + +### VIASH START +par <- list( + input = "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + output = "resources_test/annotation_test_data/TS_Blood_filtered.rds", + assay = "RNA", + modality = "rna" +) +### VIASH END + +is_mudata_file <- function(file_path) { + con <- file(file_path, "rb") + tryCatch({ + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + header <- readBin(con, "raw", n = 6) + identical(header, charToRaw("MuData")) + }, finally = { + close(con) + }) +} + +h5mu_to_h5ad <- function(h5mu_path, modality_name) { + tmp_path <- tempfile(fileext = ".h5ad") + mod_location <- paste("mod", modality_name, sep = "/") + h5src <- hdf5r::H5File$new(h5mu_path, "r") + h5dest <- hdf5r::H5File$new(tmp_path, "w") + # Copy over the child objects and the child attributes from root + # Root cannot be copied directly because it always exists and + # copying does not allow overwriting. + children <- hdf5r::list.objects(h5src, + path = mod_location, + full.names = FALSE, recursive = FALSE + ) + for (child in children) { + h5dest$obj_copy_from( + h5src, paste(mod_location, child, sep = "/"), + paste0("/", child) + ) + } + # Also copy the root attributes + root_attrs <- hdf5r::h5attr_names(x = h5src) + for (attr in root_attrs) { + h5a <- h5src$attr_open(attr_name = attr) + robj <- h5a$read() + h5dest$create_attr_by_name( + attr_name = attr, + obj_name = ".", + robj = robj, + space = h5a$get_space(), + dtype = h5a$get_type() + ) + } + h5src$close() + h5dest$close() + + tmp_path +} + +# If the input is a MuData file, use a single modality as input instead +if (is_mudata_file(par$input)) { + if (is.null(par$modality) || par$modality == "") { + stop("'modality' argument must be set if the input is a MuData file.") + } + h5ad_path <- h5mu_to_h5ad(par$input, par$modality) +} else { + h5ad_path <- par$input +} + +seurat_obj <- read_h5ad( + h5ad_path, + mode = "r", + as = "Seurat", + assay_name = par$assay +) + +saveRDS(seurat_obj, file = par$output) diff --git a/src/convert/from_h5mu_or_h5ad_to_seurat/test.R b/src/convert/from_h5mu_or_h5ad_to_seurat/test.R new file mode 100644 index 00000000..09367171 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_seurat/test.R @@ -0,0 +1,82 @@ +library(testthat, warn.conflicts = FALSE) +library(hdf5r) + + +## VIASH START +meta <- list( + executable = "target/executable/convert/from_h5ad_to_seurat", + resources_dir = "resources_test", + name = "from_h5ad_to_seurat" +) +## VIASH END + +cat("> Checking conversion of h5ad file\n") + +in_h5ad <- paste0( + meta[["resources_dir"]], + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad" +) +out_rds <- "output.rds" + +cat("> Running ", meta[["name"]], "\n", sep = "") +out <- processx::run( + meta[["executable"]], + c( + "--input", in_h5ad, + "--output", out_rds + ) +) + +cat("> Checking whether output file exists\n") +expect_equal(out$status, 0) +expect_true(file.exists(out_rds)) + +cat("> Reading output file\n") +obj <- readRDS(file = out_rds) + +cat("> Checking whether Seurat object is in the right format\n") +expect_is(obj, "Seurat") +expect_equal(names(slot(obj, "assays")), "RNA") + +open_file <- H5File$new(in_h5ad, mode = "r+") + +dim_rds <- dim(obj) +dim_ad <- open_file[["X"]]$attr_open("shape")$read() + +expect_equal(dim_rds[1], dim_ad[2]) +expect_equal(dim_rds[2], dim_ad[1]) + +cat("> Checking conversion of h5mu file\n") + +in_h5mu <- paste0( + meta[["resources_dir"]], + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) +out_rds <- "output.rds" + +cat("> Running ", meta[["name"]], "\n", sep = "") +out <- processx::run( + meta[["executable"]], + c( + "--input", in_h5mu, + "--output", out_rds + ) +) + +cat("> Checking whether output file exists\n") +expect_equal(out$status, 0) +expect_true(file.exists(out_rds)) + +cat("> Reading output file\n") +obj <- readRDS(file = out_rds) + +cat("> Checking whether Seurat object is in the right format\n") +expect_is(obj, "Seurat") +expect_equal(names(slot(obj, "assays")), "RNA") + +dim_rds <- dim(obj) +mu_in <- H5File$new(in_h5mu, mode = "r") +dim_ad <- mu_in[["/mod/rna/X"]]$attr_open("shape")$read() + +expect_equal(dim_rds[1], dim_ad[2]) +expect_equal(dim_rds[2], dim_ad[1]) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml new file mode 100644 index 00000000..63c3d98c --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml @@ -0,0 +1,159 @@ +name: "from_h5mu_or_h5ad_to_tiledb" +namespace: "convert" +scope: "public" +description: | + Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported. + + NOTE: The functionality provided by this component is experimental and may be subject to change. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Input" + arguments: + - name: --input + description: | + Input AnnData or MuData file. When an AnnData file is provided, it is automatically assumed to + contain transcriptome counts. + type: file + required: true + example: "input.h5mu" + direction: input + - name: "RNA modality" + arguments: + - name: --rna_modality + type: string + default: rna + description: | + The name used for the RNA modality. Used when input file is a MuData object. + - name: --rna_raw_layer_input + type: string + required: true + example: X + description: | + Location of the layer containing the raw transcriptome counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_normalized_layer_input + type: string + required: true + example: log_normalized + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_var_gene_names_input + type: string + required: false + example: "gene_symbol" + description: | + Column in .var that provides the gene names. If not specified, the index from the input is used. + + - name: "Protein modality" + arguments: + - name: --prot_modality + description: | + The name used for the protein modality. Used when input file is a MuData object. + When not specified, the protein modality will not be processed. + type: string + required: false + example: prot + - name: --prot_raw_layer_input + type: string + example: X + description: | + Location of the layer containing the raw protein counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --prot_normalized_layer_input + type: string + example: clr + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + + - name: "Output slots" + arguments: + - name: "--rna_modality_output" + type: string + default: "rna" + description: | + TileDB Measurement name where the RNA modality will be stored. + - name: "--prot_modality_output" + type: string + default: "prot" + description: | + Name of the TileDB Measurement where the protein modality will be stored. + - name: "--obs_index_name_output" + description: | + Name of the index that is used to describe the cells (observations). + type: string + default: cell_id + - name: --rna_var_index_name_output + description: | + Output name of the index that is used to describe the genes. + type: string + default: rna_index + - name: --rna_raw_layer_output + description: | + Output location for the raw transcriptomics counts. + type: string + default: "X" + - name: --rna_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized RNA counts. + - name: --rna_var_gene_names_output + type: string + default: "gene_symbol" + description: | + Name of the .var column that specifies the gene games. + - name: --prot_var_index_name_output + description: | + Output name of the index that is used to describe the proteins. + type: string + default: prot_index + - name: --prot_raw_layer_output + type: string + default: "X" + description: | + Output location for the raw protein counts. + - name: --prot_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized protein counts. + + - name: "Output arguments" + arguments: + - name: "--tiledb_dir" + type: file + direction: output + description: | + Directory where the TileDB output will be written to. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - tiledbsoma + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu] \ No newline at end of file diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py new file mode 100644 index 00000000..7712e5d0 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py @@ -0,0 +1,711 @@ +import sys +import anndata +import numpy as np +import tiledbsoma + +# This import seems redundant, but it is required! +import tiledbsoma.io +import pandas as pd +from functools import singledispatch +from collections.abc import Mapping +import h5py +from shutil import copy2, rmtree +from collections.abc import Callable +from functools import partial +from pathlib import Path +from tempfile import NamedTemporaryFile +import pyarrow as pa +from collections import defaultdict +import json + +anndata.settings.allow_write_nullable_strings = True + + +## VIASH START +par = { + "input": "/home/di/code/openpipelines-multisample/pmbc_process_samples.h5mu", + # modality names + "rna_modality": "rna", + "rna_modality_output": "rna", + "prot_modality": "prot", + "prot_modality_output": "prot", + # rna var + "rna_var_gene_names_input": "gene_symbol", + "rna_var_gene_names_output": "gene_foo", + "rna_var_index_name_output": "rna_index", + # rna layers + "rna_raw_layer_input": "X", + "rna_raw_layer_output": "X", + "rna_normalized_layer_input": "log_normalized", + "rna_normalized_layer_output": "log_normalized", + # prot layers + "prot_raw_layer_input": "X", + "prot_raw_layer_output": "X", + "prot_normalized_layer_input": "clr", + "prot_normalized_layer_output": "log_normalized", + # prot var + "prot_var_index_name_output": "prot_index", + # observation index name + "obs_index_name_output": "cell_id", + # TileDB output directory + "tiledb_dir": "output", +} + + +meta = { + "name": "from_h5mu_or_h5ad_to_tiledb", + "resources_dir": "src/utils/", + "temp_dir": "/tmp", +} +## VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +LAYER_ARGUMENTS = ( + "rna_raw_layer_input", + "rna_raw_layer_output", + "rna_normalized_layer_input", + "rna_normalized_layer_output", + "prot_raw_layer_input", + "prot_raw_layer_output", + "prot_normalized_layer_input", + "prot_normalized_layer_output", +) + + +multidim_column_indices = {} + + +def _convert_to_array(mod_name, _, group): + """ + tiledb SOMA does not support pandas dataframes in varm and obsm + Convert those to structured arrays and store column index as metadata + """ + entries = group.keys() + for entry in entries: + encoding = group[entry].attrs["encoding-type"] + if encoding in ("array", "csr_matrix", "csc_matrix"): + return + if encoding in { + "dataframe", + "nullable-integer", + "nullable-boolean", + "nullable-string-array", + }: + df = anndata.io.read_elem(group[entry]) + data = df.to_numpy() + anndata.io.write_elem(group, entry, data) + index_to_write = df.columns + # No need to store write the index to tileDB in the case its just the integer index + if not isinstance(index_to_write, pd.RangeIndex): + multidim_column_indices.setdefault(mod_name, {}).setdefault( + group.name.strip("/"), {} + ).update({entry: index_to_write.to_list()}) + return + logger.info( + "Deleting %s because it is of encoding %s, which is not supported.", + f"{group.name}/{entry}", + encoding, + ) + del group[entry] + + +def _read_obs(anndata_file): + with h5py.File(anndata_file, "r") as open_rna: + return _to_pandas_nullable(anndata.io.read_elem(open_rna["obs"])) + + +def _write_obs(anndata_file, obs_df): + with h5py.File(anndata_file, "r+") as open_h5: + anndata.io.write_elem(open_h5, "obs", obs_df) + + +def _log_arguments(function_obj, arg_dict): + """ + Format a dictionairy of arguments into a string that is put into the script logs. + """ + args_str = [f"\t{param}: {param_val}\n" for param, param_val in arg_dict.items()] + logger.info( + "Calling %s with arguments:\n%s", + function_obj.__name__, + "".join(args_str).rstrip(), + ) + + +def _to_pandas_nullable(table_or_df: pa.Table | pd.DataFrame) -> pd.DataFrame: + """ + Convert pyarrow Table or pandas dataframe to pandas dataframe that uses nullable data types. + + This function takes the following into account: + 1. AnnData supports writing BooleanDtype, IntegerDtype and StringDtype (which uses pd.NA) + 2. AnnData does *not* support FloatingDtype + 3. AnnData does support numpy-stype floating and integer dtypes (which use np.nan). + 4. TileDB SOMA's `update_obs` does not allow casting floats to integers (potential information loss). + 5. When pandas's `convert_dtypes` convert_integer argument is set to True, + floats will be converted to integer if they can be 'faithfully' casted. + + Giving the previous constraints, it is *not* possible to only convert integers using `convert_types` because + it might cause some float columns to be converted to integers (which is incompatible with tiledbsoma). + Therefore, this function will leave integers and floats as-is. + + Based on the conversion provided by this function, a suitable na representation (pd.NA or np.NA) for a column can + be chosen using `pd.api.types.is_extension_array_dtype(column_dtype)` + """ + try: + df = table_or_df.to_pandas( + types_mapper=defaultdict(None, {pa.large_string(): pd.StringDtype()}).get + ) + except AttributeError: + df = table_or_df.convert_dtypes( + infer_objects=False, + convert_string=True, + convert_integer=False, + convert_boolean=False, + convert_floating=False, + ) + return df.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_integer=False, + convert_boolean=True, + convert_floating=False, + ) + + +def _get_temp_h5ad(): + return Path(NamedTemporaryFile(suffix=".h5ad", dir=meta["temp_dir"]).name) + + +def _h5mu_to_h5ad(h5mu_path, modality_name): + """ + Create an anndata file from a modality in a mudata file by + copying over the relevant h5 objects. + """ + result = _get_temp_h5ad() + with h5py.File(h5mu_path, "r") as open_anndata: + with h5py.File(result, "w") as file_to_write: + for h5_key in open_anndata["mod"][modality_name].keys(): + open_anndata.copy( + source=f"/mod/{modality_name}/{h5_key}", dest=file_to_write + ) + return result + + +def _handle_layer_location(layer_name): + """ + Handle the special case where 'X' is specified as layer + this layer is not present in `/layers/X` but in `/X`. + """ + prefix = "layers/" if layer_name != "X" else "" + return f"{prefix}{layer_name}" + + +def _check_input_args(par, input, is_mudata): + """ + Check the input arguments: + * Arguments for protein modalities are not allowed when the input is not a MuData file. + * Arguments for input layers may not point to the same layer + * Output locations for layers may not be the same + * Input modalities must not be the same + * Input modalities must exist + """ + if not is_mudata: + for par_name in ( + "prot_modality", + "prot_raw_layer_input", + "prot_normalized_layer_input", + ): + if par[par_name]: + raise ValueError( + f"'{par_name}' can not be used when the input is not a MuData file." + ) + + if par["prot_modality"]: + for prot_arg in ("prot_raw_layer_input", "prot_raw_layer_output"): + if not par[prot_arg]: + ValueError( + f"When providing 'prot_modality', '{prot_arg}' must also be set." + ) + + NOT_SAME_VALUE = ( + ("rna_raw_layer_input", "rna_normalized_layer_input"), + ("prot_raw_layer_input", "prot_normalized_layer_input"), + ("rna_raw_layer_output", "rna_normalized_layer_output"), + ("prot_raw_layer_output", "prot_normalized_layer_output"), + ("rna_modality", "prot_modality"), + ) + for layer1_arg, layer2_arg in NOT_SAME_VALUE: + arg1_val, arg2_val = par[layer1_arg], par[layer2_arg] + if (arg1_val == arg2_val) and arg1_val is not None: + raise ValueError( + f"The value for argument '{layer1_arg}' ({arg1_val}) must not be the same as for argument '{layer2_arg}' ({arg2_val})" + ) + + with h5py.File(input, "r") as open_h5: + if "mod" in open_h5: + available_modalities = tuple(open_h5["mod"].keys()) + for mod in (par["rna_modality"], par["prot_modality"]): + if mod is not None and f"/mod/{mod}" not in open_h5: + raise ValueError( + f"Modality '{mod}' was not found in object, available modalities: {available_modalities}" + ) + + +def _copy_layer(source: str, dest: str, input_h5ad: Path, mod_obj: h5py.Group): + """ + Copy a h5py object from one location (source) to a new location (dest). + The source location must be located in the input AnnData file, the destination + will be written to a h5py Group object. + """ + logger.info("Copying layer '%s' from '%s' to '%s'", source, input_h5ad, dest) + if source == f"{mod_obj.name}/{dest}": + logger.info("Layer is already at the correct location. Nothing to do.") + return + if not input_h5ad.is_file(): + raise FileNotFoundError(f"Could not link to {input_h5ad}: file does not exist.") + with h5py.File(input_h5ad) as open_input: + if source not in open_input: + raise ValueError(f"Layer {source} was not found in {input_h5ad}.") + if dest in mod_obj: + logger.info(f"{dest} is already present. Removing before creating link.") + del mod_obj[dest] + # ExternalLink could have been used instead of 'copy', but it does not seem to work with tileDB + with h5py.File(str(input_h5ad), "r") as open_input: + open_input.copy( + source=open_input[source], dest=mod_obj, name=dest, expand_external=True + ) + logger.info("Copying complete.") + + +def _copy_column(src, dest, _, df_h5_obj): + """ + Duplicate a column in a dataframe and give a new name. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Copying column '%s' to '%s' for '%s'.", src, dest, df_h5_obj.name) + columns = df_h5_obj.attrs["column-order"] + if src == dest: + logger.info("Source and destination for the column are the same, not copying.") + return None + if dest in columns: + raise ValueError(f"Column {dest} already exists in {df_h5_obj.name}.") + df_h5_obj.attrs["column-order"] = np.append(columns, dest) + if src is None: + # Use the index as source. + src = df_h5_obj.attrs["_index"] + # Both the index and columns are written as keys to df_h5_obj + # An index name may be allowed to also be column name _only_ when + # the column and the index have the same contents (this is an anndata requirement). + # Here, this is always the case. + if src in df_h5_obj.attrs["column-order"]: + return None + df_h5_obj.copy(src, dest, expand_refs=True, expand_soft=True) + + +def _set_index_name(name, _, df_h5_obj): + """ + Set the index for a dataframe to an existing column. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Setting index name of '%s' to '%s'.", df_h5_obj.name, name) + original_index_name = df_h5_obj.attrs["_index"] + # An index and a column may share a key in df_h5_obj + # In this case we must not remove the original key from the object + operator = ( + df_h5_obj.move + if original_index_name not in df_h5_obj.attrs["column-order"] + else df_h5_obj.copy + ) + operator(original_index_name, name) + df_h5_obj.attrs["_index"] = name + logger.info("Done setting index name") + + +def _delete_all_keys(_, group_obj, exception=None): + """ + Delete all keys from a h5py.Group object; which optional exceptions. + """ + if exception is None: + exception = () + logger.info( + "Deleting all keys from '%s'%s.", + group_obj.name, + f" except {', '.join(exception)}" if exception else "", + ) + keys_to_delete = set(group_obj.keys()) - set(exception) + if not keys_to_delete: + logger.info("No keys to delete.") + return + for key in keys_to_delete: + del group_obj[key] + logger.info("Done deleting keys %s.", ", ".join(keys_to_delete)) + + +def _set_index_to_string_dtype(_, df_obj): + """ + Change the index dataype for a dataframe to string when it is encoded as categorical. + The dataframe must be provided as a h5py Group object that has been encoded by AnnData. + """ + logger.info( + "Checking if index of object '%s' is encoded as a categorical.", df_obj.name + ) + index_col_name = df_obj.attrs["_index"] + index_col = df_obj[index_col_name] + logger.info("Column '%s' is being used as the index.", index_col_name) + index_col_dtype = index_col.attrs["encoding-type"] + if index_col_dtype == "categorical": + logger.info( + "Column '%s' is encoded as categorical, converting to string.", + index_col_name, + ) + is_ordered = bool(index_col.attrs.get("ordered", False)) + codes, categories = index_col["codes"], index_col["categories"] + column = pd.Categorical.from_codes(codes, categories, is_ordered) + column_str = column.astype(str).astype(object) + del df_obj[index_col_name] + df_obj.create_dataset(index_col_name, data=column_str) + index_col.attrs["encoding-type"] = "string-array" + logger.info("Conversion to string dtype complete.") + return + logger.info( + "Column '%s' not encoded as categorical. Leaving as is.", index_col_name + ) + + +def _get_rna_conversion_specification(par): + rna_spec = [ + partial(_copy_layer, par["rna_raw_layer_input"], par["rna_raw_layer_output"]), + partial( + _copy_layer, + par["rna_normalized_layer_input"], + par["rna_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "rna"), + "obsm": partial(_convert_to_array, "rna"), + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + "var": [ + partial( + _copy_column, + par["rna_var_gene_names_input"], + par["rna_var_gene_names_output"], + ), + partial(_set_index_name, par["rna_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "obs": partial(_set_index_name, par["obs_index_name_output"]), + "layers": partial( + _delete_all_keys, + exception=( + par["rna_raw_layer_output"].removeprefix("layers/"), + par["rna_normalized_layer_output"].removeprefix("layers/"), + ), + ), + }, + ] + return rna_spec + + +def _get_prot_conversion_specification(par): + prot_spec = [ + partial(_copy_layer, par["prot_raw_layer_input"], par["prot_raw_layer_output"]), + partial( + _copy_layer, + par["prot_normalized_layer_input"], + par["prot_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "prot"), + "obsm": partial(_convert_to_array, "prot"), + "var": [ + partial(_set_index_name, par["prot_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + # "varm": _delete_all_keys, + "layers": partial( + _delete_all_keys, + exception=( + par["prot_raw_layer_output"].removeprefix("layers/"), + par["prot_normalized_layer_output"].removeprefix("layers/"), + ), + ), + "obs": partial(_set_index_name, par["obs_index_name_output"]), + }, + ] + + return prot_spec + + +def _copy_anndata_and_convert_inplace(anndata_path, conversion_specification): + converted_anndata = _get_temp_h5ad() + copy2(anndata_path, converted_anndata) + with h5py.File(converted_anndata, "r+") as open_prot_h5: + apply_conversion(conversion_specification, anndata_path, open_prot_h5) + return converted_anndata + + +@singledispatch +def apply_conversion(conversion_specification, input_h5ad, open_h5): + raise NotImplementedError( + f"Error: function 'apply_conversion' is not implemented for type '{type(conversion_specification)}'" + ) + + +@apply_conversion.register +def _(conversion_specification: list, input_h5ad, open_h5): + for item in conversion_specification: + apply_conversion(item, input_h5ad, open_h5) + + +@apply_conversion.register +def _(conversion_specifications: Mapping, input_h5ad, open_h5): + for h5_key, conversion_spec in conversion_specifications.items(): + if hasattr(open_h5, h5_key): + h5_obj = getattr(open_h5, h5_key) + else: + h5_obj = open_h5[h5_key] + apply_conversion(conversion_spec, input_h5ad, h5_obj) + + +@apply_conversion.register +def _(conversion_specifications: Callable, input_h5ad, open_h5): + conversion_specifications(input_h5ad, open_h5) + + +def _write_varm_obsm_indices(output_dir, mod_name, multidim_column_indices): + # TileDB SOMA does not support column indices for .varm and obsm matrices + # As a workaround, we will store these in the metadata slot + for multidim_key in ("varm", "obsm"): + collection_name = f"{str(output_dir)}/ms/{mod_name}/{multidim_key}" + if tiledbsoma.Collection.exists(collection_name): + with tiledbsoma.Collection.open( + collection_name, "w" + ) as multidim_collection: + for item in multidim_collection: + index_to_write = ( + multidim_column_indices.get(mod_name, {}) + .get(multidim_key, {}) + .get(item, None) + ) + if index_to_write: + multidim_collection[item].metadata["column_index"] = json.dumps( + index_to_write + ) + + +def _remove_directory(dir_path): + """ + Check if a directory exists and remove it does. + """ + if dir_path.exists(): + try: + # Delete the directory and all its contents + for item in dir_path.iterdir(): + if item.is_file() or item.is_symlink(): + item.unlink() # Removes files or symbolic links + elif item.is_dir(): + rmtree(item) # Removes directories recursively + logger.info("Directory '%s' has been deleted successfully.", dir_path) + except FileNotFoundError: + logger.info("Directory '%s' not found.", dir_path) + except PermissionError as e: + raise ValueError( + f"Permission denied: Unable to delete '{dir_path}'." + ) from e + + +def _align_obs_columns(experiment_df, anndata_df): + def _create_na_columns(df_to_write, columns, dtype_df): + """ + Create new columns in a dataframe with NA's as content while + making sure that the datatype of the newly created columns match + with those from another dataframe. + """ + for col in columns: + dtype = dtype_df[col].dtype + na = pd.NA if pd.api.types.is_extension_array_dtype(dtype) else np.nan + df_to_write.loc[:, col] = pd.Series( + na, index=df_to_write.index, dtype=dtype + ) + return df_to_write + + logger.info("Making sure obs columns are aligned.") + contents_columns = set(experiment_df.columns) + logger.info("Already ingested .obs columns: %s", ", ".join(contents_columns)) + logger.info("Retreiving obs columns to be added.") + + obs_to_add_columns = set(anndata_df.columns) + + logger.info("Checking dtype of common columns") + common_columns = obs_to_add_columns.intersection(contents_columns) + common_dtypes = {} + for column_name in common_columns: + # Combine the two columns, but only keep the calculated dtype. + # We'll update the data types in the two original frames and let tiledb do the joining. + new_series = experiment_df[column_name].combine_first(anndata_df[column_name]) + result_dtype = new_series.dtype + # Nullable floats are not supported by anndata... + if pd.api.types.is_float_dtype( + result_dtype + ) and pd.api.types.is_extension_array_dtype(result_dtype): + result_dtype = result_dtype.numpy_dtype + common_dtypes[column_name] = result_dtype + + logger.info( + "Updating the dtypes to comply with the following schema: %s", common_dtypes + ) + experiment_df, anndata_df = ( + experiment_df.astype(common_dtypes), + anndata_df.astype(common_dtypes), + ) + + logger.info("Will add the following columns: %s", ", ".join(obs_to_add_columns)) + missing_columns_in_old = obs_to_add_columns - contents_columns + logger.info( + "Adding the following columns to the schema: %s.", + ", ".join(missing_columns_in_old), + ) + experiment_df = _create_na_columns( + experiment_df, missing_columns_in_old, anndata_df + ) + + logger.info("Adjusting .obs succeeded!") + missing_columns_in_new = ( + contents_columns + - obs_to_add_columns + - {"soma_joinid", par["obs_index_name_output"]} + ) + logger.info( + "Adding the following columns to modality to be ingested: %s", + ", ".join(missing_columns_in_new), + ) + anndata_df = _create_na_columns(anndata_df, missing_columns_in_new, experiment_df) + return experiment_df, anndata_df + + +def main(par): + logger.info(f"Component {meta['name']} started.") + par["input"], par["tiledb_dir"] = Path(par["input"]), Path(par["tiledb_dir"]) + + # Handle case where input is a mudat file + with open(par["input"], "rb") as open_input_file: + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + is_mudata = open_input_file.read(6) == b"MuData" + + _check_input_args(par, par["input"], is_mudata) + + # Reference layers other than "X" refer as "layer/" + for arg_name in LAYER_ARGUMENTS: + par[arg_name] = _handle_layer_location(par[arg_name]) + + rna_input = par["input"] + # Create a temporary file that holds the converted RNA h5ad + if is_mudata: + if not par["rna_modality"]: + raise ValueError( + "'rna_modality' argument must be set if the input is a MuData file." + ) + # If the input is a MuData file, take the RNA modality and use that as input instead + rna_input = _h5mu_to_h5ad(par["input"], par["rna_modality"]) + + # Put a copy of the RNA input in the output file and convert it in-place + rna_converted = _copy_anndata_and_convert_inplace( + rna_input, _get_rna_conversion_specification(par) + ) + + if par["prot_modality"]: + prot_input = _h5mu_to_h5ad(par["input"], par["prot_modality"]) + prot_converted = _copy_anndata_and_convert_inplace( + prot_input, _get_prot_conversion_specification(par) + ) + + logger.info( + "Making sure that obs columns are shared between the two modalities." + ) + contents, obs_to_add = _read_obs(rna_converted), _read_obs(prot_converted) + contents, obs_to_add = _align_obs_columns(contents, obs_to_add) + _write_obs(prot_converted, obs_to_add) + _write_obs(rna_converted, contents) + + tiledbsoma.logging.info() + _remove_directory(par["tiledb_dir"]) + + logger.info("Ingesting RNA modality") + rna_from_h5ad_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": rna_converted, + "measurement_name": par["rna_modality_output"], + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, rna_from_h5ad_args) + func_to_call(**rna_from_h5ad_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "rna", multidim_column_indices) + + logger.info("Done ingesting RNA modality") + + if par["prot_modality"]: + logger.info("Ingesting protein modality") + + logger.info("Initalizing prot modality schema.") + # Note the 'schema_only' + prot_from_h5ad_schema_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "uns_keys": [], + "ingest_mode": "schema_only", + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_from_h5ad_schema_args) + func_to_call(**prot_from_h5ad_schema_args) + + logger.info("Registering prot modality anndata") + # Register the second anndata object in the protein measurement + register_prot_args = { + "experiment_uri": str(par["tiledb_dir"]), + "h5ad_file_names": [prot_converted], + "measurement_name": par["prot_modality_output"], + "obs_field_name": par["obs_index_name_output"], + "var_field_name": par["prot_var_index_name_output"], + "append_obsm_varm": True, + } + func_to_call = tiledbsoma.io.register_h5ads + _log_arguments(func_to_call, register_prot_args) + rd = func_to_call(**register_prot_args) + + rd.prepare_experiment(str(par["tiledb_dir"])) + + # Ingest the second anndata object into the protein measurement + prot_write_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "registration_mapping": rd, + "ingest_mode": "write", + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_write_args) + func_to_call(**prot_write_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "prot", multidim_column_indices) + + logger.info("Finished!") + + +if __name__ == "__main__": + main(par) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py new file mode 100644 index 00000000..6ef6bc0e --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py @@ -0,0 +1,594 @@ +import sys +import pytest +import re +import json +import anndata as ad +import mudata as md +import numpy as np +import pandas as pd +import tiledbsoma +import pyarrow as pa +from subprocess import CalledProcessError + +ad.settings.allow_write_nullable_strings = True + + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb", + "config": "./src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def sample_1_modality_1(): + """ + >>> ad1.obs + Obs1 Shared_obs_col + obs1 A B + obs2 C D + + >>> ad1.var + Feat1 Shared_feat_col + var1 a b + var2 c d + overlapping_var_mod1 e f + + >>> ad1.X + array([[1, 2, 3], + [4, 5, 6]]) + """ + + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["obs1", "obs2"], + columns=["var1", "var2", "overlapping_var_mod1"], + ) + + layer_1 = np.array([[94, 81, 60], [50, 88, 69]]) + + obs = pd.DataFrame( + [["A", "B", True, 0.9, 1, pd.NA], ["C", "D", False, 0.2, 2, 8]], + index=df.index, + columns=[ + "Obs1", + "Shared_obs_col", + "bool_column", + "float_column", + "int_column", + "Shared_obs_col_int", + ], + ).astype( + { + "Obs1": "object", + "Shared_obs_col": "object", + "bool_column": "bool", + "float_column": "float64", + "int_column": "int64", + "Shared_obs_col_int": "Int64", + } + ) + + var = pd.DataFrame( + [["a", "b"], ["c", "d"], ["e", "f"]], + index=df.columns, + columns=["Feat1", "Shared_feat_col"], + ) + varm = np.random.rand(df.columns.size, 5) + varm2 = pd.DataFrame( + np.random.rand(df.columns.size, 5), + index=var.index, + columns=["varmcol1", "varmcol2", "varmcol3", "varmcol4", "varmcol5"], + ) + ad1 = ad.AnnData( + X=df, + layers={"layer1": layer_1}, + obs=obs, + var=var, + varm={"random_vals_mod1": varm, "df_varm": varm2}, + uns={ + "uns_unique_to_sample1": pd.DataFrame( + ["foo"], index=["bar"], columns=["col1"] + ), + "overlapping_uns_key": pd.DataFrame( + ["jing"], index=["jang"], columns=["col2"] + ), + }, + ) + return ad1 + + +@pytest.fixture +def sample_1_modality_2(): + """ + >>> ad2.X + array([[ 7, 8], + [ 9, 10], + [11, 12]]) + + >>> ad2.obs + Obs2 Obs3 Shared_obs_col + obs3 E F G + obs2 H I J + obs5 K L M + + >>> ad2.var + Feat2 Shared_feat_col + overlapping_var_mod1 d e + var5 f g + + """ + df = pd.DataFrame( + [[7, 8], [9, 10], [11, 12]], + index=["obs3", "obs2", "obs5"], + columns=["overlapping_var_mod1", "var4"], + ) + + layer_1 = np.array([[20, 35], [76, 93], [100, 38]]) + + obs = pd.DataFrame( + [["E", "F", "G", 1], ["H", "I", "J", 2.0], ["K", "L", "M", 3.0]], + index=df.index, + columns=["Obs2", "Obs3", "Shared_obs_col", "Shared_obs_col_int"], + ) + var = pd.DataFrame( + [["d", "e"], ["f", "g"]], index=df.columns, columns=["Feat2", "Shared_feat_col"] + ) + ad2 = ad.AnnData(df, obs=obs, var=var, layers={"layer_foo": layer_1}) + return ad2 + + +@pytest.fixture +def sample_1_mudata(sample_1_modality_1, sample_1_modality_2): + return md.MuData({"mod1": sample_1_modality_1, "mod2": sample_1_modality_2}) + + +@pytest.fixture +def random_h5ad_path(random_path): + def wrapper(): + return random_path(extension="h5ad") + + return wrapper + + +@pytest.fixture +def sample_1_mudata_file(sample_1_mudata, random_h5mu_path): + output_path = random_h5mu_path() + sample_1_mudata.write(output_path) + return output_path + + +@pytest.fixture +def sample_1_modality_1_anndata_file(sample_1_modality_1, random_h5ad_path): + output_path = random_h5ad_path() + sample_1_modality_1.write(output_path) + return output_path + + +def test_convert_anndata(run_component, sample_1_modality_1_anndata_file, tmp_path): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + pa.field("Shared_obs_col_int", pa.int64()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1]), + pa.array(["obs1", "obs2"]), + pa.array(["A", "C"]), + pa.array(["B", "D"]), + pa.array([True, False]), + pa.array([0.9, 0.2]), + pa.array([1, 2]), + pa.array([None, 8]), + ], + schema=expected_schema, + ) + assert expected.equals(obs_contents) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["a", "c", "e"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 2 + data = query.to_anndata("raw").X.todense() + expected = ( + [ + [1, 2, 3], + [4, 5, 6], + ], + ) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [ + [94, 81, 60], + [50, 88, 69], + ] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + +@pytest.mark.parametrize( + "extra_arg", + [ + ("--prot_modality", "mod2"), + ("--prot_raw_layer_input", "X"), + ("--prot_normalized_layer_input", "layer_foo"), + ], +) +def test_convert_anndata_extra_prot_arguments_raises( + run_component, sample_1_modality_1_anndata_file, tmp_path, extra_arg +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += extra_arg + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"'{extra_arg[0].lstrip('--')}' can not be used when the input is not a MuData file\.", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.parametrize( + "params", + [ + [("--rna_raw_layer_input", "X"), ("--rna_normalized_layer_input", "X")], + [("--rna_raw_layer_output", "X"), ("--rna_normalized_layer_output", "X")], + [("--rna_modality", "rna"), ("--prot_modality", "rna")], + [("--prot_raw_layer_input", "X"), ("--prot_normalized_layer_input", "X")], + [("--prot_raw_layer_output", "X"), ("--prot_normalized_layer_output", "X")], + ], +) +def test_arg_values_should_not_be_the_same( + run_component, sample_1_mudata_file, tmp_path, params +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_mudata_file, + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += [param for param_tup in params for param in param_tup] + # The following avoids complaints about misszing input argument + if "--rna_raw_layer_input" not in args: + args += [ + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "log_normalized", + ] + if "--rna_modality" not in args: + args += ["--rna_modality", "mod1"] + + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"The value for argument '{params[0][0].lstrip('-')}' \({params[0][1]}\) must not be the same as for argument '{params[1][0].lstrip('-')}' \({params[1][1]}\)", + err.value.stdout.decode("utf-8"), + ) + + +def test_output_directory_already_exists(run_component, sample_1_mudata_file, tmp_path): + output_path = tmp_path / "output" + output_path.mkdir() + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + +def test_convert_mudata( + run_component, sample_1_mudata_file, sample_1_modality_1, tmp_path +): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + assert output_path.is_dir() + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + pa.field("Shared_obs_col_int", pa.float64()), + pa.field("Obs2", pa.large_string()), + pa.field("Obs3", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2, 3]), + pa.array(["obs1", "obs2", "obs3", "obs5"]), + pa.array(["A", "C", None, None]), + pa.array(["B", "D", "G", "M"]), + pa.array([True, False, None, None]), + pa.array([0.9, 0.2, None, None]), + pa.array([1, 2, None, None]), + pa.array([None, 8, 1, 3]), + pa.array([None, None, "E", "K"]), + pa.array([None, None, "F", "L"]), + ], + schema=expected_schema, + ) + # Convert to pandas in order to allow the order of the fields to differ + pd.testing.assert_frame_equal( + expected.to_pandas(), obs_contents.to_pandas(), check_like=True + ) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["a", "c", "e"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[94, 81, 60], [50, 88, 69], [0, 0, 0], [0, 0, 0]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="prot" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[0, 0], [9, 10], [7, 8], [11, 12]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[0, 0], [76, 93], [20, 35], [100, 38]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + with tiledbsoma.Collection.open( + f"{output_path}/ms/rna/varm", "r" + ) as multidim_collection: + assert set(multidim_collection.keys()) == {"df_varm", "random_vals_mod1"} + for varm_key in multidim_collection: + expected_data = sample_1_modality_1.varm[varm_key] + try: + expected = expected_data.to_numpy() + expected_index = sample_1_modality_1.varm[varm_key].columns.to_list() + col_index = json.loads( + multidim_collection[varm_key].metadata["column_index"] + ) + assert expected_index == col_index + except AttributeError: + expected = expected_data + contents = ( + multidim_collection[varm_key] + .read() + .coos() + .concat() + .to_scipy() + .todense() + ) + np.testing.assert_allclose(expected, contents) + + +def test_gene_synbol_column_name_use_index( + run_component, sample_1_modality_1, random_h5ad_path, tmp_path +): + output_path = tmp_path / "output" + input_h5ad = random_h5ad_path() + sample_1_modality_1.var.index.name = "gene_symbol" + sample_1_modality_1.write(input_h5ad) + run_component( + [ + "--input", + input_h5ad, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--tiledb_dir", + output_path, + ] + ) + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + + +def test_gene_symbol_column_already_at_correct_location( + run_component, sample_1_modality_1_anndata_file, tmp_path +): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_var_gene_names_input", + "Feat1", + "--rna_var_gene_names_output", + "Feat1", + "--rna_normalized_layer_input", + "layer1", + "--tiledb_dir", + output_path, + ] + ) + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5mu_to_h5ad/config.vsh.yaml b/src/convert/from_h5mu_to_h5ad/config.vsh.yaml new file mode 100644 index 00000000..7570d306 --- /dev/null +++ b/src/convert/from_h5mu_to_h5ad/config.vsh.yaml @@ -0,0 +1,56 @@ +name: from_h5mu_to_h5ad +namespace: "convert" +scope: "public" +description: | + Converts a h5mu file into a h5ad file. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input MuData file + default: input.h5mu + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--output" + alternatives: ["-o"] + type: file + description: Output AnnData file. + default: output.h5ad + direction: output +__merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml] + __merge__: [ /src/base/requirements/python_test_setup.yaml, . ] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowmem, singlecpu] diff --git a/src/convert/from_h5mu_to_h5ad/script.py b/src/convert/from_h5mu_to_h5ad/script.py new file mode 100755 index 00000000..f3ecc72c --- /dev/null +++ b/src/convert/from_h5mu_to_h5ad/script.py @@ -0,0 +1,26 @@ +import mudata as mu +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "output_compression": "gzip", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# TODO: Merge modalities into one layer + +logger.info("Reading input h5mu file %s, modality %s", par["input"], par["modality"]) +adat = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Writing to %s.", par["output"]) +adat.write_h5ad(par["output"], compression=par["output_compression"]) + +logger.info("Finished") diff --git a/src/convert/from_h5mu_to_h5ad/test.py b/src/convert/from_h5mu_to_h5ad/test.py new file mode 100644 index 00000000..61db527e --- /dev/null +++ b/src/convert/from_h5mu_to_h5ad/test.py @@ -0,0 +1,42 @@ +import sys +import pytest +import anndata as ad +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad", + "resources_dir": "resources_test", +} +## VIASH END + +input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_run(run_component, tmp_path): + output = tmp_path / "output.h5ad" + + cmd_pars = [ + "--modality", + "rna", + "--input", + input, + "--output", + str(output), + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert output.is_file(), "No output was created." + + adata = ad.read_h5ad(output) + mdata = mu.read_h5mu(input) + + assert "rna" in mdata.mod.keys() + assert_annotation_objects_equal(mdata.mod["rna"], adata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5mu_to_seurat/config.vsh.yaml b/src/convert/from_h5mu_to_seurat/config.vsh.yaml new file mode 100644 index 00000000..f67791a5 --- /dev/null +++ b/src/convert/from_h5mu_to_seurat/config.vsh.yaml @@ -0,0 +1,55 @@ +name: "from_h5mu_to_seurat" +namespace: "convert" +scope: "public" +status: "deprecated" +description: | + Converts an h5mu file into a Seurat file. + + Restrictions: + - Only the intersection of cells is currently loaded into the Seurat object due to the object structure limitation. + - Multimodal embeddings (global .obsm slot) are loaded with the assay.used field set to the default assay. + - Embeddings names are changed in order to comply with R & Seurat requirements and conventions. + - Feature names with underscores ('_') are automatically replaced with dashes ('-') + - Seurat does not support global variables metadata /var. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--output" + alternatives: ["-o"] + type: file + description: Output Seurat file + direction: output + required: true + example: output.rds +resources: + - type: r_script + path: script.R +test_resources: + - type: r_script + path: run_test.R + - path: /resources_test/10x_5k_anticmv/ +engines: + - type: docker + image: rocker/r2u:24.04 + setup: + - type: apt + packages: + - libhdf5-dev + - libgeos-dev + - type: r + cran: [ anndata, hdf5r, testthat, SeuratObject, Seurat ] + - type: r + github: pmbio/MuDataSeurat@empty-tables-and-nullable +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5mu_to_seurat/run_test.R b/src/convert/from_h5mu_to_seurat/run_test.R new file mode 100644 index 00000000..2a492b5a --- /dev/null +++ b/src/convert/from_h5mu_to_seurat/run_test.R @@ -0,0 +1,42 @@ +library(testthat, warn.conflicts = FALSE) + +## VIASH START +meta <- list( + executable = "target/docker/convert/from_h5mu_to_seurat/from_h5mu_to_seurat", + resources_dir = "resources_test", + name = "from_h5mu_to_seurat" +) +## VIASH END + +cat("> Checking whether output is correct\n") + +in_h5mu <- paste0( + meta[["resources_dir"]], + "/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_mms.h5mu" +) +out_rds <- "output.rds" + +cat("> Running ", meta[["name"]], "\n", sep = "") +out <- processx::run( + meta[["executable"]], + c( + "--input", in_h5mu, + "--output", out_rds + ) +) + +cat("> Checking whether output file exists\n") +expect_equal(out$status, 0) +expect_true(file.exists(out_rds)) + +cat("> Reading output file\n") +obj <- readRDS(file = out_rds) + +cat("> Checking whether Seurat object is in the right format\n") +expect_is(obj, "Seurat") +expect_equal(sort(names(slot(obj, "assays"))), sort(c("prot", "rna"))) + +obj_rna <- slot(obj, "assays")$rna +obj_prot <- slot(obj, "assays")$prot + +# todo: check whether obj_rna and obj_prot have correct properties diff --git a/src/convert/from_h5mu_to_seurat/script.R b/src/convert/from_h5mu_to_seurat/script.R new file mode 100644 index 00000000..5a90bbd5 --- /dev/null +++ b/src/convert/from_h5mu_to_seurat/script.R @@ -0,0 +1,80 @@ +library(MuDataSeurat) +library(hdf5r) + +## VIASH START +par <- list( + input = paste0( + "resources_test/10x_5k_anticmv/", + "5k_human_antiCMV_T_TBNK_connect_mms.h5mu" + ), + output = "output.rds" +) +## VIASH END + + +temp_h5mu <- tempfile(fileext = ".h5mu") +file.copy(par$input, temp_h5mu) + +delete_modality <- function(open_h5, modality_path) { + open_h5$link_delete(modality_path) + mod_name <- sub("/mod/", "", modality_path) + if ("mod-order" %in% names(hdf5r::h5attributes(open_h5[["mod"]]))) { + current_attributes <- hdf5r::h5attributes(open_h5[["mod"]])$`mod-order` + current_attributes <- current_attributes[current_attributes != mod_name] + hdf5r::h5attr(open_h5[["mod"]], "mod-order") <- current_attributes + } + for (obj_prefix in c("obsm/", "varm/", "varmap/", "obsmap/")) { + obj_path <- paste0(obj_prefix, mod_name) + if (hdf5r::existsGroup(open_h5, obj_path)) { + open_h5$link_delete(obj_path) + } + } +} + +open_file <- H5File$new(temp_h5mu, mode = "r+") +modalities <- list.groups(open_file[["mod"]], + full.names = TRUE, recursive = FALSE +) +to_delete <- c() + +determine_matrix_dims <- function(dataset, indexpointers, indices, rowwise) { + if ("shape" %in% hdf5r::h5attr_names(dataset)) { + x_dims <- hdf5r::h5attr(dataset, "shape") + } else { + x_dims <- c(length(indexpointers) - 1, max(indices) + 1) + if (rowwise) { + x_dims <- rev(x_dims) + } + } + x_dims +} +for (modality_path in modalities) { + dataset <- open_file[[modality_path]][["X"]] + dataset_names <- names(dataset) + if ( + "data" %in% dataset_names && + "indices" %in% dataset_names && + "indptr" %in% dataset_names + ) { + indexpointers <- dataset[["indptr"]]$read() + indices <- dataset[["indices"]]$read() + rowwise <- FALSE + if ("encoding-type" %in% h5attr_names(dataset)) { + rowwise <- h5attr(dataset, "encoding-type") == "csr_matrix" + } + x_dims <- determine_matrix_dims(dataset, indexpointers, indices, rowwise) + if (x_dims[2] < 1) { + delete_modality(open_file, modality_path) + } + } else if (dataset$dims[1] < 1) { + delete_modality(open_file, modality_path) + } +} + +open_file$close_all() + +cat("Reading input file\n") +obj <- ReadH5MU(temp_h5mu) + +cat("Writing output file\n") +saveRDS(obj, file = par$output, compress = TRUE) diff --git a/src/correction/cellbender_remove_background/config.vsh.yaml b/src/correction/cellbender_remove_background/config.vsh.yaml new file mode 100644 index 00000000..b6172c58 --- /dev/null +++ b/src/correction/cellbender_remove_background/config.vsh.yaml @@ -0,0 +1,306 @@ +name: cellbender_remove_background +namespace: "correction" +scope: "public" +description: | + Eliminating technical artifacts from high-throughput single-cell RNA sequencing data. + + This module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. + At the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols + will be added in the future. A quick start tutorial can be found here. + + Fleming et al. 2022, bioRxiv. +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered: it should include empty droplets." + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: List of modalities to process. + type: string + default: "rna" + required: false + - name: Outputs + arguments: + - name: "--output" + alternatives: [-o] + type: file + description: Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes. + direction: output + required: true + example: output.h5mu + - name: "--layer_output" + description: Output layer + type: string + default: "cellbender_corrected" + - name: "--obs_background_fraction" + type: string + default: "cellbender_background_fraction" + - name: "--obs_cell_probability" + type: string + default: "cellbender_cell_probability" + - name: "--obs_cell_size" + type: string + default: "cellbender_cell_size" + - name: "--obs_droplet_efficiency" + description: | + Name of the column in the .obs dataframe to store the droplet efficiencies in. + type: string + default: "cellbender_droplet_efficiency" + - name: "--obs_latent_scale" + type: string + default: "cellbender_latent_scale" + - name: "--var_ambient_expression" + type: string + default: "cellbender_ambient_expression" + - name: "--obsm_gene_expression_encoding" + type: string + default: "cellbender_gene_expression_encoding" + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + # custom arg + - name: "--expected_cells_from_qc" + type: boolean + description: Will use the Cell Ranger QC to determine the estimated number of cells + default: false + - name: "--expected_cells" + type: integer + description: Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient). + example: 1000 + - name: "--total_droplets_included" + type: integer + description: | + The number of droplets from the rank-ordered UMI plot + that will have their cell probabilities inferred as an + output. Include the droplets which might contain cells. + Droplets beyond TOTAL_DROPLETS_INCLUDED should be + 'surely empty' droplets. + example: 25000 + - name: "--force_cell_umi_prior" + type: integer + description: "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in cells." + - name: "--force_empty_umi_prior" + type: integer + description: "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in empty droplets." + - name: "--model" + type: string + choices: [naive, simple, ambient, swapping, full] + description: | + Which model is being used for count data. + + * 'naive' subtracts the estimated ambient profile. + * 'simple' does not model either ambient RNA or random barcode swapping (for debugging purposes -- not recommended). + * 'ambient' assumes background RNA is incorporated into droplets. + * 'swapping' assumes background RNA comes from random barcode swapping (via PCR chimeras). + * 'full' uses a combined ambient and swapping model. + default: full + - name: "--epochs" + type: integer + description: Number of epochs to train. + default: 150 + - name: "--low_count_threshold" + type: integer + description: | + Droplets with UMI counts below this number are completely + excluded from the analysis. This can help identify the correct + prior for empty droplet counts in the rare case where empty + counts are extremely high (over 200). + default: 5 + - name: "--z_dim" + type: integer + description: | + Dimension of latent variable z. + default: 64 + - name: "--z_layers" + type: integer + description: | + Dimension of hidden layers in the encoder for z. + multiple: true + default: [512] + - name: "--training_fraction" + type: double + description: | + Training detail: the fraction of the data used for training. + The rest is never seen by the inference algorithm. Speeds up learning. + default: 0.9 + - name: "--empty_drop_training_fraction" + type: double + description: | + Training detail: the fraction of the training data each epoch that + is drawn (randomly sampled) from surely empty droplets. + default: 0.2 + - name: "--ignore_features" + type: integer + description: | + Integer indices of features to ignore entirely. In the output + count matrix, the counts for these features will be unchanged. + multiple: true + - name: "--fpr" + type: double + description: | + Target 'delta' false positive rate in [0, 1). Use 0 for a cohort + of samples which will be jointly analyzed for differential expression. + A false positive is a true signal count that is erroneously removed. + More background removal is accompanied by more signal removal at + high values of FPR. You can specify multiple values, which will + create multiple output files. + multiple: true + default: [0.01] + - name: "--exclude_feature_types" + type: string + description: | + Feature types to ignore during the analysis. These features will + be left unchanged in the output file. + multiple: true + - name: "--projected_ambient_count_threshold" + type: double + description: | + Controls how many features are included in the analysis, which + can lead to a large speedup. If a feature is expected to have less + than PROJECTED_AMBIENT_COUNT_THRESHOLD counts total in all cells + (summed), then that gene is excluded, and it will be unchanged + in the output count matrix. For example, + PROJECTED_AMBIENT_COUNT_THRESHOLD = 0 will include all features + which have even a single count in any empty droplet. + default: 0.1 + - name: "--learning_rate" + type: double + description: | + Training detail: lower learning rate for inference. + A OneCycle learning rate schedule is used, where the + upper learning rate is ten times this value. (For this + value, probably do not exceed 1e-3). + default: 0.0001 + - name: "--final_elbo_fail_fraction" + type: double + description: | + Training is considered to have failed if + (best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION. + Training will automatically re-run if --num-training-tries > 1. + By default, will not fail training based on final_training_ELBO. + - name: "--epoch_elbo_fail_fraction" + type: double + description: | + Training is considered to have failed if + (previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) > EPOCH_ELBO_FAIL_FRACTION. + Training will automatically re-run if --num-training-tries > 1. + By default, will not fail training based on epoch_training_ELBO. + - name: "--num_training_tries" + type: integer + description: | + Number of times to attempt to train the model. At each subsequent attempt, + the learning rate is multiplied by LEARNING_RATE_RETRY_MULT. + default: 1 + - name: "--learning_rate_retry_mult" + type: double + description: | + Learning rate is multiplied by this amount each time a new training + attempt is made. (This parameter is only used if training fails based + on EPOCH_ELBO_FAIL_FRACTION or FINAL_ELBO_FAIL_FRACTION and + NUM_TRAINING_TRIES is > 1.) + default: 0.2 + - name: --posterior_batch_size + type: integer + description: | + Training detail: size of batches when creating the posterior. + Reduce this to avoid running out of GPU memory creating the posterior + (will be slower). + default: 128 + - name: --posterior_regulation + type: string + description: | + Posterior regularization method. (For experts: not required for normal usage, + see documentation). + + * PRq is approximate quantile-targeting. + * PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0). + * PRmu_gene is approximate mean-targeting per gene. + choices: [PRq, PRmu, PRmu_gene] + - name: "--alpha" + type: double + description: | + Tunable parameter alpha for the PRq posterior regularization method + (not normally used: see documentation). + - name: "--q" + type: double + description: | + Tunable parameter q for the CDF threshold estimation method (not + normally used: see documentation). + - name: "--estimator" + type: string + description: | + Output denoised count estimation method. (For experts: not required + for normal usage, see documentation). + default: mckp + choices: [map, mean, cdf, sample, mckp] + - name: "--estimator_multiple_cpu" + type: boolean_true + description: | + Including the flag --estimator-multiple-cpu will use more than one + CPU to compute the MCKP output count estimator in parallel (does nothing + for other estimators). + - name: "--constant_learning_rate" + type: boolean + description: | + Including the flag --constant-learning-rate will use the ClippedAdam + optimizer instead of the OneCycleLR learning rate schedule, which is + the default. Learning is faster with the OneCycleLR schedule. + However, training can easily be continued from a checkpoint for more + epochs than the initial command specified when using ClippedAdam. On + the other hand, if using the OneCycleLR schedule with 150 epochs + specified, it is not possible to pick up from that final checkpoint + and continue training until 250 epochs. + - name: "--debug" + type: boolean_true + description: | + Including the flag --debug will log extra messages useful for debugging. + - name: "--cuda" + type: boolean_true + description: | + Including the flag --cuda will run the inference on a + GPU. +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + # normally cellbender should only be run on unfiltered data, but for the purposes of the unit test the filtered data will do. + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu + +engines: +- type: docker + image: nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04 + # run_args: ["--gpus all"] + setup: + - type: docker + env: + - PYENV_ROOT="/root/.pyenv" + - PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH" + - PYTHON_VERSION=3.7.16 + run: | + apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 git \ + && curl https://pyenv.run | bash \ + && pyenv update \ + && pyenv install $PYTHON_VERSION \ + && pyenv global $PYTHON_VERSION \ + && apt-get clean + - type: python + packages: + # Use this version of mudata because it is the last one that supports python 3.7 + - lxml~=4.8.0 + - mudata~=0.2.1 + - cellbender~=0.3.0 + +runners: +- type: executable +- type: nextflow + directives: + label: [midcpu, midmem, gpu] diff --git a/src/correction/cellbender_remove_background/script.py b/src/correction/cellbender_remove_background/script.py new file mode 100644 index 00000000..4948b011 --- /dev/null +++ b/src/correction/cellbender_remove_background/script.py @@ -0,0 +1,251 @@ +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix +from cellbender.remove_background.downstream import anndata_from_h5 + +## VIASH START +file_input = "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + +par = { + # inputs + "input": file_input, + "modality": "rna", + # outputs + "output": "output.h5mu", + "layer_output": "corrected", + "obs_background_fraction": "background_fraction", + "obs_cell_probability": "cell_probability", + "obs_cell_size": "cell_size", + "obs_droplet_efficiency": "droplet_efficiency", + "obs_latent_scale": "latent_scale", + "var_ambient_expression": "ambient_expression", + "obsm_gene_expression_encoding": "gene_expression_encoding", + # args + "expected_cells_from_qc": False, + "expected_cells": 1000, + "total_droplets_included": 25000, + "force_cell_umi_prior": None, + "force_empty_umi_prior": None, + "model": "full", + "epochs": 150, + "low_count_threshold": 5, + "z_dim": 64, + "z_layers": [512], + "training_fraction": 0.9, + "empty_drop_training_fraction": 0.2, + "ignore_features": [], + "fpr": [0.01], + "exclude_feature_types": [], + "projected_ambient_count_threshold": 0.1, + "learning_rate": 1.0e-4, + "final_elbo_fail_fraction": None, + "epoch_elbo_fail_fraction": None, + "num_training_tries": 1, + "learning_rate_retry_mult": 0.2, + "posterior_batch_size": 128, + "posterior_regulation": None, + "alpha": None, + "q": None, + "estimator": "mckp", + "estimator_multiple_cpu": False, + "constant_learning_rate": True, + "debug": False, + "cuda": False, +} +meta = { + "temp_dir": os.getenv("VIASH_TEMP"), + "resources_dir": "src/correction/cellbender_remove_background", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# import pathlib +# with pathlib.Path(os.path.dirname(par["output"])) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + # don't create checkpoints because they're not used / returned anyways + "--checkpoint-mins", + "99999999", + ] + + if meta.get("cpus") is not None: + cmd_pars += ["--cpu-threads", str(meta["cpus"])] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--force-cell-umi-prior", "force_cell_umi_prior", True), + ("--force-empty-umi-prior", "force_empty_umi_prior", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ("--ignore-features", "ignore_features", True), + ("--fpr", "fpr", True), + ("--exclude-feature-types", "exclude_feature_types", True), + ( + "--projected-ambient-count-threshold", + "projected_ambient_count_threshold", + True, + ), + ("--learning-rate", "learning_rate", True), + ("--final-elbo-fail-fraction", "final_elbo_fail_fraction", True), + ("--epoch-elbo-fail-fraction", "epoch_elbo_fail_fraction", True), + ("--num-training-tries", "num_training_tries", True), + ("--learning-rate-retry-mult", "learning_rate_retry_mult", True), + ("--posterior-batch-size", "posterior_batch_size", True), + ("--posterior-regulation", "posterior_regulation", True), + ("--alpha", "alpha", True), + ("--q", "q", True), + ("--estimator", "estimator", True), + ("--estimator-multiple-cpu", "estimator_multiple_cpu", False), + ("--constant-learning-rate", "constant_learning_rate", False), + ("--debug", "debug", False), + ("--cuda", "cuda", False), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("CellBender output format:", adata_out) + + # AnnData object with n_obs x n_vars = 6794880 x 33538 + # obs: 'cellbender_analyzed' + # var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed' + # uns: 'background_fraction', 'barcode_indices_for_latents', 'cell_probability', 'cell_size', 'droplet_efficiency', 'gene_expression_encoding', + # 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', + # 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', + # 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', + # 'target_false_positive_rate' + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_background_fraction": "background_fraction", + "obs_cell_probability": "cell_probability", + "obs_cell_size": "cell_size", + "obs_droplet_efficiency": "droplet_efficiency", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_gene_expression_encoding output to MuData") + obsm_store = {"obsm_gene_expression_encoding": "gene_expression_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/correction/cellbender_remove_background/test.py b/src/correction/cellbender_remove_background/test.py new file mode 100644 index 00000000..bc6864d1 --- /dev/null +++ b/src/correction/cellbender_remove_background/test.py @@ -0,0 +1,45 @@ +from os import path +from mudata import read_h5mu +import subprocess + +## VIASH START +meta = { + "executable": "target/executable/correction/cellbender_remove_background/cellbender_remove_background", + "resources_dir": "resources_test/pbmc_1k_protein_v3", +} +## VIASH END + +file_input = ( + meta["resources_dir"] + "/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) +file_output = "output.h5mu" + +print("> Check whether cellbender works when it should be working") + +# run cellbender +cmd_pars = [ + meta["executable"], + "--input", + file_input, + "--output", + file_output, + "--epochs", + "5", + "--output_compression", + "gzip", +] +# todo: if cuda is available, add --cuda +out = subprocess.check_output(cmd_pars).decode("utf-8") + +# check if file exists +assert path.exists(file_output), "No output was created." + +data = read_h5mu(file_output) + +# check whether gex was found +assert data.mod["rna"].var["feature_types"].unique() == ["Gene Expression"], ( + "Output X should only contain Gene Expression vars." +) + +# check whether ab counts were found +assert "prot" in data.mod, 'Output should contain data.mod["rna"].' diff --git a/src/correction/cellbender_remove_background_v0_2/config.vsh.yaml b/src/correction/cellbender_remove_background_v0_2/config.vsh.yaml new file mode 100644 index 00000000..bbeaa15d --- /dev/null +++ b/src/correction/cellbender_remove_background_v0_2/config.vsh.yaml @@ -0,0 +1,187 @@ +name: cellbender_remove_background_v0_2 +namespace: "correction" +scope: "public" +description: | + Eliminating technical artifacts from high-throughput single-cell RNA sequencing data. + + This module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. + At the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols + will be added in the future. A quick start tutorial can be found here. + + Fleming et al. 2022, bioRxiv. +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: [-i] + type: file + description: Input h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: List of modalities to process. + type: string + default: "rna" + required: false + - name: Outputs + arguments: + - name: "--output" + alternatives: [-o] + type: file + description: Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes. + direction: output + required: true + example: output.h5mu + - name: "--layer_output" + description: Output layer + type: string + default: "corrected" + - name: "--obs_latent_rt_efficiency" + type: string + default: "latent_rt_efficiency" + - name: "--obs_latent_cell_probability" + type: string + default: "latent_cell_probability" + - name: "--obs_latent_scale" + type: string + default: "latent_scale" + - name: "--var_ambient_expression" + type: string + default: "ambient_expression" + - name: "--obsm_latent_gene_encoding" + type: string + default: "cellbender_latent_gene_encoding" + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--expected_cells" + type: integer + description: Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient). + example: 1000 + - name: "--total_droplets_included" + type: integer + description: | + The number of droplets from the rank-ordered UMI plot + that will be analyzed. The largest 'total_droplets' + droplets will have their cell probabilities inferred + as an output. + example: 25000 + - name: "--expected_cells_from_qc" + type: boolean + description: Will use the Cell Ranger QC to determine the estimated number of cells + default: true + - name: "--model" + type: string + choices: [simple, ambient, swapping, full] + description: | + Which model is being used for count data. 'simple' + does not model either ambient RNA or random barcode + swapping (for debugging purposes -- not recommended). + 'ambient' assumes background RNA is incorporated into + droplets. 'swapping' assumes background RNA comes from + random barcode swapping. 'full' uses a combined + ambient and swapping model. + default: full + - name: "--epochs" + type: integer + description: Number of epochs to train. + default: 150 + - name: "--low_count_threshold" + type: integer + description: | + Droplets with UMI counts below this number are completely + excluded from the analysis. This can help identify the correct + prior for empty droplet counts in the rare case where empty + counts are extremely high (over 200). + default: 15 + - name: "--z_dim" + type: integer + description: | + Dimension of latent variable z. + default: 100 + - name: "--z_layers" + type: integer + description: | + Dimension of hidden layers in the encoder for z. + multiple: true + default: [500] + - name: "--training_fraction" + type: double + description: | + Training detail: the fraction of the data used for training. + The rest is never seen by the inference algorithm. Speeds up learning. + default: 0.9 + - name: "--empty_drop_training_fraction" + type: double + description: | + Training detail: the fraction of the training data each epoch that + is drawn (randomly sampled) from surely empty droplets. + default: 0.5 + - name: "--fpr" + type: double + description: | + Target false positive rate in (0, 1). A false positive + is a true signal count that is erroneously removed. + More background removal is accompanied by more signal + removal at high values of FPR. You can specify + multiple values, which will create multiple output + files. + multiple: true + default: [0.01] + - name: "--exclude_antibody_capture" + type: boolean_true + description: | + Including the flag --exclude-antibody-capture will + cause remove-background to operate on gene counts + only, ignoring other features. + - name: "--learning_rate" + type: double + description: | + Training detail: lower learning rate for inference. A + OneCycle learning rate schedule is used, where the + upper learning rate is ten times this value. (For this + value, probably do not exceed 1e-3). + example: 0.0001 + - name: "--cuda" + type: boolean_true + description: | + Including the flag --cuda will run the inference on a + GPU. +resources: + - type: python_script + path: script.py + - path: helper.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + # normally cellbender should only be run on unfiltered data, but for the purposes of the unit test the filtered data will do. + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 +engines: + - type: docker + image: nvcr.io/nvidia/pytorch:23.12-py3 + # run_args: ["--gpus all"] + # image: nvcr.io/nvidia/cuda:11.6.0-base-ubuntu20.04 + # image: pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime + setup: + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + packages: + - muon==0.1.5 + - cellbender==0.2.1 + # - torch + # - torchvision + # - torchaudio + # - "--extra-index-url" + # - "https://download.pytorch.org/whl/cu116" + test_setup: + - type: python + packages: + - muon~=0.1.4 +runners: + - type: executable + - type: nextflow + directives: + label: [gpu] diff --git a/src/correction/cellbender_remove_background_v0_2/helper.py b/src/correction/cellbender_remove_background_v0_2/helper.py new file mode 100644 index 00000000..55e0cd36 --- /dev/null +++ b/src/correction/cellbender_remove_background_v0_2/helper.py @@ -0,0 +1,166 @@ +# This file is copied from https://github.com/broadinstitute/CellBender/issues/128#issuecomment-1175336065 +# to solve an issue with scanpy not being able to read in the 10x h5 files produced by cellbender. +# +# Note: If something doesn't work in this helper function, it may be interesting to +# take a look at the comments by Dries: https://github.com/openpipelines-bio/openpipeline/pull/115 +# I'm not going to apply them for now -- if it ain't broke, don't fix it. +import tables +import numpy as np +import scipy.sparse as sp +import anndata +from typing import Dict + + +def anndata_from_h5( + file: str, analyzed_barcodes_only: bool = True +) -> "anndata.AnnData": + """Load an output h5 file into an AnnData object for downstream work. + + Args: + file: The h5 file + analyzed_barcodes_only: False to load all barcodes, so that the size of + the AnnData object will match the size of the input raw count matrix. + True to load a limited set of barcodes: only those analyzed by the + algorithm. This allows relevant latent variables to be loaded + properly into adata.obs and adata.obsm, rather than adata.uns. + + Returns: + adata: The anndata object, populated with inferred latent variables + and metadata. + + """ + + d = dict_from_h5(file) + X = ( + sp.csc_matrix( + (d.pop("data"), d.pop("indices"), d.pop("indptr")), shape=d.pop("shape") + ) + .transpose() + .tocsr() + ) + + # check and see if we have barcode index annotations, and if the file is filtered + barcode_key = [k for k in d.keys() if (("barcode" in k) and ("ind" in k))] + if len(barcode_key) > 0: + max_barcode_ind = d[barcode_key[0]].max() + filtered_file = max_barcode_ind >= X.shape[0] + else: + filtered_file = True + + if analyzed_barcodes_only: + if filtered_file: + # filtered file being read, so we don't need to subset + print('Assuming we are loading a "filtered" file that contains only cells.') + pass + elif "barcode_indices_for_latents" in d.keys(): + X = X[d["barcode_indices_for_latents"], :] + d["barcodes"] = d["barcodes"][d["barcode_indices_for_latents"]] + elif "barcodes_analyzed_inds" in d.keys(): + X = X[d["barcodes_analyzed_inds"], :] + d["barcodes"] = d["barcodes"][d["barcodes_analyzed_inds"]] + else: + print( + "Warning: analyzed_barcodes_only=True, but the key " + '"barcodes_analyzed_inds" or "barcode_indices_for_latents" ' + "is missing from the h5 file. " + "Will output all barcodes, and proceed as if " + "analyzed_barcodes_only=False" + ) + + # Construct the anndata object. + adata = anndata.AnnData( + X=X, + obs={"barcode": d.pop("barcodes").astype(str)}, + var={ + "gene_name": ( + d.pop("gene_names") if "gene_names" in d.keys() else d.pop("name") + ).astype(str) + }, + dtype=X.dtype, + ) + adata.obs.set_index("barcode", inplace=True) + adata.var.set_index("gene_name", inplace=True) + + # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this + if "genes" in d.keys(): + d["id"] = d.pop("genes") + + # For purely aesthetic purposes, rename "id" to "gene_id" + if "id" in d.keys(): + d["gene_id"] = d.pop("id") + + # If genomes are empty, try to guess them based on gene_id + if "genome" in d.keys(): + if np.array([s.decode() == "" for s in d["genome"]]).all(): + if "_" in d["gene_id"][0].decode(): + print( + "Genome field blank, so attempting to guess genomes based on gene_id prefixes" + ) + d["genome"] = np.array( + [s.decode().split("_")[0] for s in d["gene_id"]], dtype=str + ) + + # Add other information to the anndata object in the appropriate slot. + _fill_adata_slots_automatically(adata, d) + + # Add a special additional field to .var if it exists. + if "features_analyzed_inds" in adata.uns.keys(): + adata.var["cellbender_analyzed"] = [ + True if (i in adata.uns["features_analyzed_inds"]) else False + for i in range(adata.shape[1]) + ] + + if analyzed_barcodes_only: + for col in adata.obs.columns[ + adata.obs.columns.str.startswith("barcodes_analyzed") + | adata.obs.columns.str.startswith("barcode_indices") + ]: + try: + del adata.obs[col] + except Exception: + pass + else: + # Add a special additional field to .obs if all barcodes are included. + if "barcodes_analyzed_inds" in adata.uns.keys(): + adata.obs["cellbender_analyzed"] = [ + True if (i in adata.uns["barcodes_analyzed_inds"]) else False + for i in range(adata.shape[0]) + ] + + return adata + + +def dict_from_h5(file: str) -> Dict[str, np.ndarray]: + """Read in everything from an h5 file and put into a dictionary.""" + d = {} + with tables.open_file(file) as f: + # read in everything + for array in f.walk_nodes("/", "Array"): + d[array.name] = array.read() + return d + + +def _fill_adata_slots_automatically(adata, d): + """Add other information to the adata object in the appropriate slot.""" + + for key, value in d.items(): + try: + if value is None: + continue + value = np.asarray(value) + if len(value.shape) == 0: + adata.uns[key] = value + elif value.shape[0] == adata.shape[0]: + if (len(value.shape) < 2) or (value.shape[1] < 2): + adata.obs[key] = value + else: + adata.obsm[key] = value + elif value.shape[0] == adata.shape[1]: + if value.dtype.name.startswith("bytes"): + adata.var[key] = value.astype(str) + else: + adata.var[key] = value + else: + adata.uns[key] = value + except Exception: + print("Unable to load data into AnnData: ", key, value, type(value)) diff --git a/src/correction/cellbender_remove_background_v0_2/script.py b/src/correction/cellbender_remove_background_v0_2/script.py new file mode 100644 index 00000000..0f271e16 --- /dev/null +++ b/src/correction/cellbender_remove_background_v0_2/script.py @@ -0,0 +1,207 @@ +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix + +## VIASH START +import muon + +file_raw = ( + "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" +) +mdat = muon.read_10x_h5(file_raw) +mdat = mdat[0:100000,] # subsample to reduce computational time +file_input = "cellbender_remove_background_input.h5mu" +mdat.write_h5mu(file_input) + +par = { + # inputs + "input": file_input, + "modality": "rna", + # outputs + "output": "output.h5mu", + "layer_output": "corrected", + "obs_latent_rt_efficiency": "latent_rt_efficiency", + "obs_latent_cell_probability": "latent_cell_probability", + "obs_latent_scale": "latent_scale", + "var_ambient_expression": "ambient_expression", + # "obsm_latent_gene_encoding": "latent_gene_encoding", + # args + "total_droplets_included": None, + "min_counts": 1000, + "epochs": 5, + "fpr": 0.01, + "exclude_antibody_capture": False, + "learning_rate": 0.001, + "layer_corrected": "corrected", + "cuda": False, + "expected_cells": None, + "model": "full", + "low_count_threshold": 15, + "z_dim": 100, + "z_layers": [500], + "training_fraction": 0.9, + "empty_drop_training_fraction": 0.5, + "expected_cells_from_qc": True, + "output_compression": "gzip", + "obsm_latent_gene_encoding": "cellbender_latent_gene_encoding", +} +meta = { + "temp_dir": os.getenv("VIASH_TEMP"), + "resources_dir": "src/correction/cellbender_remove_background", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +from helper import anndata_from_h5 + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# with pathlib.Path(meta["temp_dir"]) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + ] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--cuda", "cuda", False), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--exclude-antibody-capture", "exclude_antibody_capture", False), + ("--learning-rate", "learning_rate", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + # have to use custom read_10x_h5 function for now + # will be fixed when https://github.com/scverse/scanpy/pull/2344 is merged + # adata_out = sc.read_10x_h5(output_file, gex_only=False) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_latent_rt_efficiency": "latent_RT_efficiency", + "obs_latent_cell_probability": "latent_cell_probability", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_latent_gene_encoding output to MuData") + obsm_store = {"obsm_latent_gene_encoding": "latent_gene_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/correction/cellbender_remove_background_v0_2/test.py b/src/correction/cellbender_remove_background_v0_2/test.py new file mode 100644 index 00000000..3a6812e8 --- /dev/null +++ b/src/correction/cellbender_remove_background_v0_2/test.py @@ -0,0 +1,52 @@ +from os import path +import muon as mu +import pytest + +## VIASH START +meta = { + "executable": "target/executable/correction/cellbender_remove_background/cellbender_remove_background", + "resources_dir": "resources_test/pbmc_1k_protein_v3", +} +## VIASH END + +file_raw = meta["resources_dir"] + "/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + + +@pytest.fixture +def subsampled_input(write_mudata_to_file): + mdat = mu.read_10x_h5(file_raw) + mdat = mdat[0:100000,] + return write_mudata_to_file(mdat) + + +def test_run(run_component, random_h5mu_path, subsampled_input): + print("> Check whether cellbender works when it should be working") + + # run cellbender + output_file = random_h5mu_path() + cmd_pars = [ + "--input", + subsampled_input, + "--output", + output_file, + "--epochs", + "5", + "--output_compression", + "gzip", + ] + # todo: if cuda is available, add --cuda + run_component(cmd_pars) + + # check if file exists + assert path.exists(output_file), "No output was created." + + # read it with scanpy + data = mu.read_h5mu(output_file) + + # check whether gex was found + assert data.mod["rna"].var["feature_types"].unique() == ["Gene Expression"], ( + "Output X should only contain Gene Expression vars." + ) + + # check whether ab counts were found + assert "prot" in data.mod, 'Output should contain data.mod["rna"].' diff --git a/src/dataflow/concatenate_h5mu/config.vsh.yaml b/src/dataflow/concatenate_h5mu/config.vsh.yaml new file mode 100644 index 00000000..8559549a --- /dev/null +++ b/src/dataflow/concatenate_h5mu/config.vsh.yaml @@ -0,0 +1,97 @@ +name: concatenate_h5mu +namespace: "dataflow" +scope: "public" +description: | + Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + multiple: true + description: Paths to the different samples to be concatenated. + required: true + example: sample_paths + - name: "--modality" + type: string + multiple: true + description: "Only output concatenated objects for the provided modalities. Outputs all modalities by default." + required: false + - name: "--input_id" + type: string + multiple: true + description: | + Names of the different samples that have to be concatenated. Must be specified when using '--mode move'. + In this case, the ids will be used for the columns names of the dataframes registring the conflicts. + If specified, must be of same length as `--input`. + required: false + - name: "--output" + description: | + Output location for the concatenated MuData object file. + alternatives: ["-o"] + type: file + direction: output + example: "output.h5mu" + - name: "--obs_sample_name" + type: string + description: Name of the .obs key under which to add the sample names. + default: "sample_id" + - name: "--other_axis_mode" + type: string + choices: [same, unique, first, only, concat, move] + default: move + description: | + How to handle the merging of other axis (var, obs, ...). + + - None: keep no data + - same: only keep elements of the matrices which are the same in each of the samples + - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples) + - first: keep the annotation from the first sample + - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample) + - move: identical to 'same', but moving the conflicting values to .varm or .obsm + - name: "--uns_merge_mode" + description: | + How to handle the merging of .uns across modalities + - None: keep no data + - same: only keep elements of the matrices which are the same in each of the samples + - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples) + - first: keep the annotation from the first sample + - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample) + - make_unique: identical to 'unique', but keys which are not unique are made unique by prefixing them with the sample id. + type: string + choices: ["same", "unique", "first", "only", "make_unique"] + default: make_unique +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + - path: /resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + packages: + - pandas~=2.1.1 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [midcpu, highmem] \ No newline at end of file diff --git a/src/dataflow/concatenate_h5mu/script.py b/src/dataflow/concatenate_h5mu/script.py new file mode 100644 index 00000000..de958769 --- /dev/null +++ b/src/dataflow/concatenate_h5mu/script.py @@ -0,0 +1,397 @@ +from __future__ import annotations +import sys +import anndata +import mudata as mu +import pandas as pd +import numpy as np +from collections.abc import Iterable +from multiprocessing import Pool +from pathlib import Path +from h5py import File as H5File +from typing import Literal +import shutil + +### VIASH START +par = { + "input": [ + "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + ], + "output": "foo.h5mu", + "input_id": ["mouse", "human"], + "other_axis_mode": "move", + "output_compression": "gzip", + "uns_merge_mode": "make_unique", +} +meta = {"cpus": 10, "resources_dir": "resources_test/"} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu +from setup_logger import setup_logger + +logger = setup_logger() + + +def nunique(row): + unique = pd.unique(row) + unique_without_na = pd.core.dtypes.missing.remove_na_arraylike(unique) + return len(unique_without_na) > 1 + + +def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) -> bool: + """ + Check if any row contains duplicate values, that are not NA. + """ + numpy_array = frame.to_numpy() + with Pool(n_processes) as pool: + is_duplicated = pool.map(nunique, iter(numpy_array)) + return any(is_duplicated) + + +def concatenate_matrices( + n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index +) -> tuple[ + dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype] +]: + """ + Merge matrices by combining columns that have the same name. + Columns that contain conflicting values (e.i. the columns have different values), + are not merged, but instead moved to a new dataframe. + """ + column_names = set(column_name for var in matrices.values() for column_name in var) + logger.debug("Trying to concatenate columns: %s.", ",".join(column_names)) + if not column_names: + return {}, pd.DataFrame(index=align_to) + conflicts, concatenated_matrix = split_conflicts_and_concatenated_columns( + n_processes, matrices, column_names, align_to + ) + concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix) + conflicts = { + conflict_name: cast_to_writeable_dtype(conflict_df) + for conflict_name, conflict_df in conflicts.items() + } + return conflicts, concatenated_matrix + + +def get_first_non_na_value_vector(df): + numpy_arr = df.to_numpy() + n_rows, n_cols = numpy_arr.shape + col_index = pd.isna(numpy_arr).argmin(axis=1) + flat_index = n_cols * np.arange(n_rows) + col_index + return pd.Series(numpy_arr.ravel()[flat_index], index=df.index, name=df.columns[0]) + + +def make_uns_keys_unique(mod_data, concatenated_data): + """ + Check if the uns keys across samples are unique before adding them + to the final concatenated object. If a conflict occurs between the samples, + add the sample ID to make the key unique again. + """ + all_uns_keys = {} + for sample_id, mod in mod_data.items(): + for uns_key, _ in mod.uns.items(): + all_uns_keys.setdefault(uns_key, []).append(sample_id) + for uns_key, samples_ids in all_uns_keys.items(): + assert samples_ids + if len(samples_ids) == 1: + sample_id = samples_ids[0] + concatenated_data.uns[uns_key] = mod_data[sample_id].uns[uns_key] + else: + for sample_id in samples_ids: + concatenated_data.uns[f"{sample_id}_{uns_key}"] = mod_data[ + sample_id + ].uns[uns_key] + return concatenated_data + + +def split_conflicts_and_concatenated_columns( + n_processes: int, + matrices: dict[str, pd.DataFrame], + column_names: Iterable[str], + align_to: pd.Index, +) -> tuple[dict[str, pd.DataFrame], pd.DataFrame]: + """ + Retrieve columns with the same name from a list of dataframes which are + identical across all the frames (ignoring NA values). + Columns which are not the same are regarded as 'conflicts', + which are stored in seperate dataframes, one per columns + with the same name that store conflicting values. + """ + conflicts = {} + concatenated_matrix = [] + for column_name in column_names: + columns = { + input_id: var[column_name] + for input_id, var in matrices.items() + if column_name in var + } + assert columns, "Some columns should have been found." + concatenated_columns = pd.concat( + columns.values(), axis=1, join="outer", sort=False + ) + if any_row_contains_duplicate_values(n_processes, concatenated_columns): + concatenated_columns.columns = ( + columns.keys() + ) # Use the sample id as column name + concatenated_columns = concatenated_columns.reindex(align_to, copy=False) + conflicts[f"conflict_{column_name}"] = concatenated_columns + else: + unique_values = get_first_non_na_value_vector(concatenated_columns) + concatenated_matrix.append(unique_values) + if not concatenated_matrix: + return conflicts, pd.DataFrame(index=align_to) + concatenated_matrix = pd.concat( + concatenated_matrix, join="outer", axis=1, sort=False + ) + concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False) + return conflicts, concatenated_matrix + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. `IntegerArray` and `BooleanArray` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def split_conflicts_modalities( + n_processes: int, samples: dict[str, anndata.AnnData], output: anndata.AnnData +) -> anndata.AnnData: + """ + Merge .var and .obs matrices of the anndata objects. Columns are merged + when the values (excl NA) are the same in each of the matrices. + Conflicting columns are moved to a separate dataframe (one dataframe for each column, + containing all the corresponding column from each sample). + """ + matrices_to_parse = ("var", "obs") + for matrix_name in matrices_to_parse: + matrices = { + sample_id: getattr(sample, matrix_name) + for sample_id, sample in samples.items() + } + output_index = getattr(output, matrix_name).index + conflicts, concatenated_matrix = concatenate_matrices( + n_processes, matrices, output_index + ) + if concatenated_matrix.empty: + concatenated_matrix.index = output_index + + # Even though we did not touch the varm and obsm matrices that were already present, + # the joining of observations might have caused a dtype change in these matrices as well + # so these also need to be casted to a writable dtype... + for multidim_name, multidim_data in getattr(output, f"{matrix_name}m").items(): + new_data = ( + cast_to_writeable_dtype(multidim_data) + if isinstance(multidim_data, pd.DataFrame) + else multidim_data + ) + getattr(output, f"{matrix_name}m")[multidim_name] = new_data + + # Write the conflicts to the output + for conflict_name, conflict_data in conflicts.items(): + getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data + + # Set other annotation matrices in the output + setattr(output, matrix_name, concatenated_matrix) + + return output + + +def concatenate_modality( + n_processes: int, + mod: str | None, + input_files: Iterable[str | Path], + other_axis_mode: str, + uns_merge_mode: str, + input_ids: tuple[str], +) -> anndata.AnnData: + concat_modes = { + "move": "unique", + } + other_axis_mode_to_apply = concat_modes.get(other_axis_mode, other_axis_mode) + + uns_merge_modes = {"make_unique": None} + uns_merge_mode_to_apply = uns_merge_modes.get(uns_merge_mode, uns_merge_mode) + + mod_data = {} + mod_indices_combined = pd.Index([]) + for input_id, input_file in zip(input_ids, input_files): + if mod is not None: + try: + data = mu.read_h5ad(input_file, mod=mod) + mod_data[input_id] = data + mod_indices_combined = mod_indices_combined.append(data.obs.index) + except KeyError as e: # Modality does not exist for this sample, skip it + if ( + f"Unable to synchronously open object (object '{mod}' doesn't exist)" + not in str(e) + ): + raise e + pass + else: # When mod=None, process the 'global' h5mu state + with H5File(input_file, "r") as input_h5: + if "uns" in input_h5.keys(): + uns_data = anndata.experimental.read_elem(input_h5["uns"]) + if uns_data: + mod_data[input_id] = anndata.AnnData(uns=uns_data) + + if not mod_indices_combined.is_unique: + raise ValueError("Observations are not unique across samples.") + + if not mod_data: + return anndata.AnnData() + + concatenated_data = anndata.concat( + mod_data.values(), + join="outer", + merge=other_axis_mode_to_apply, + uns_merge=uns_merge_mode_to_apply, + ) + + if other_axis_mode == "move": + concatenated_data = split_conflicts_modalities( + n_processes, mod_data, concatenated_data + ) + + if uns_merge_mode == "make_unique": + concatenated_data = make_uns_keys_unique(mod_data, concatenated_data) + + return concatenated_data + + +def concatenate_modalities( + n_processes: int, + modalities: list[str], + input_files: Path | str, + other_axis_mode: str, + uns_merge_mode: str, + output_file: Path | str, + compression: Literal["gzip"] | Literal["lzf"], + input_ids: tuple[str] | None = None, +) -> None: + """ + Join the modalities together into a single multimodal sample. + """ + logger.info("Concatenating samples.") + output_file, input_files = ( + Path(output_file), + [Path(input_file) for input_file in input_files], + ) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + # Create empty mudata file + mdata = mu.MuData({modality: anndata.AnnData() for modality in modalities}) + mdata.write(output_file_uncompressed, compression=compression) + + # Use "None" for the global slots (not assigned to any modality) + for mod_name in modalities + [ + None, + ]: + new_mod = concatenate_modality( + n_processes, + mod_name, + input_files, + other_axis_mode, + uns_merge_mode, + input_ids, + ) + if mod_name is None: + if new_mod.uns: + with H5File(output_file_uncompressed, "r+") as open_h5mu_file: + anndata.experimental.write_elem( + open_h5mu_file, "uns", dict(new_mod.uns) + ) + continue + logger.info( + "Writing out modality '%s' to '%s' with compression '%s'.", + mod_name, + output_file_uncompressed, + compression, + ) + mu.write_h5ad(output_file_uncompressed, data=new_mod, mod=mod_name) + + if compression: + compress_h5mu(output_file_uncompressed, output_file, compression=compression) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + logger.info("Concatenation successful.") + + +def main() -> None: + # Get a list of all possible modalities + mods = set() + for path in par["input"]: + try: + with H5File(path, "r") as f_root: + mods = mods | set(f_root["mod"].keys()) + except OSError: + raise OSError(f"Failed to load {path}. Is it a valid h5 file?") + + input_ids = None + if par["input_id"]: + input_ids: tuple[str] = tuple(i.strip() for i in par["input_id"]) + if len(input_ids) != len(par["input"]): + raise ValueError( + "The number of sample names must match the number of sample files." + ) + + if len(set(input_ids)) != len(input_ids): + raise ValueError("The sample names should be unique.") + + logger.info("\nConcatenating data from paths:\n\t%s", "\n\t".join(par["input"])) + + if par["other_axis_mode"] == "move" and not input_ids: + raise ValueError("--mode 'move' requires --input_ids.") + + n_processes = meta["cpus"] if meta["cpus"] else 1 + + if par["modality"]: + par["modality"] = set(par["modality"]) + if not par["modality"].issubset(mods): + mods_joined, input_mods_joined = ", ".join(mods), ", ".join(par["modality"]) + raise ValueError( + f"One of the modalities provided ({input_mods_joined}) is not available in the input data {mods_joined}" + ) + mods = par["modality"] + + concatenate_modalities( + n_processes, + list(mods), + par["input"], + par["other_axis_mode"], + par["uns_merge_mode"], + par["output"], + par["output_compression"], + input_ids=input_ids, + ) + + +if __name__ == "__main__": + main() diff --git a/src/dataflow/concatenate_h5mu/test.py b/src/dataflow/concatenate_h5mu/test.py new file mode 100644 index 00000000..5d95597f --- /dev/null +++ b/src/dataflow/concatenate_h5mu/test.py @@ -0,0 +1,1197 @@ +import mudata as md +import anndata as ad +import subprocess +from pathlib import Path +import pandas as pd +import numpy as np +import pytest +import re +import sys +from openpipeline_testutils.utils import remove_annotation_column +from operator import attrgetter + +## VIASH START +meta = { + "executable": "./target/docker/dataflow/concatenate_h5mu/concatenate_h5mu", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/dataflow/concatenate_h5mu/config.vsh.yaml", +} +## VIASH END + +meta["cpus"] = 1 if not meta["cpus"] else meta["cpus"] + + +@pytest.fixture +def sample_1_modality_1(): + """ + >>> ad1.obs + Obs1 Shared_obs + obs1 A B + obs2 C D + + >>> ad1.var + Feat1 Shared_feat + var1 a b + var2 c d + overlapping_var_mod1 e f + + >>> ad1.X + array([[1, 2, 3], + [4, 5, 6]]) + """ + + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["obs1", "obs2"], + columns=["var1", "var2", "overlapping_var_mod1"], + ) + obs = pd.DataFrame( + [["A", "B"], ["C", "D"]], index=df.index, columns=["Obs1", "Shared_obs"] + ) + var = pd.DataFrame( + [["a", "b"], ["c", "d"], ["e", "f"]], + index=df.columns, + columns=["Feat1", "Shared_feat"], + ) + varm = np.random.rand(df.columns.size, 5) + ad1 = ad.AnnData( + df, + obs=obs, + var=var, + varm={"random_vals_mod1": varm}, + uns={ + "uns_unique_to_sample1": pd.DataFrame( + ["foo"], index=["bar"], columns=["col1"] + ), + "overlapping_uns_key": pd.DataFrame( + ["jing"], index=["jang"], columns=["col2"] + ), + }, + ) + # ad1 = ad.AnnData(df, obs=obs, var=var) + return ad1 + + +@pytest.fixture +def sample_1_input_modality_2(): + """ + >>> ad2.X + array([[ 7, 8], + [ 9, 10], + [11, 12]]) + + >>> ad2.obs + Obs2 Obs3 Shared_obs + obs3 E F G + obs4 H I J + obs5 K L M + + >>> ad2.var + Feat2 Shared_feat + var4 d e + var5 f g + + """ + df = pd.DataFrame( + [[7, 8], [9, 10], [11, 12]], + index=["obs3", "obs4", "obs5"], + columns=["var3", "var4"], + ) + obs = pd.DataFrame( + [["E", "F", "G"], ["H", "I", "J"], ["K", "L", "M"]], + index=df.index, + columns=["Obs2", "Obs3", "Shared_obs"], + ) + var = pd.DataFrame( + [["d", "e"], ["f", "g"]], index=df.columns, columns=["Feat2", "Shared_feat"] + ) + ad2 = ad.AnnData(df, obs=obs, var=var) + return ad2 + + +@pytest.fixture +def sample_1_h5mu(sample_1_modality_1, sample_1_input_modality_2): + tmp_mudata = md.MuData( + {"mod1": sample_1_modality_1, "mod2": sample_1_input_modality_2} + ) + return tmp_mudata + + +@pytest.fixture +def sample_2_modality_1(): + """ + >>> ad3.X + array([[13, 14], + [15, 16], + [17, 18]]) + + >>> ad3.var + Feat3 Shared_feat + var5 h i + overlapping_var_mod1 j k + + >>> ad3.obs + Obs4 Obs5 Shared_obs + obs6 O P Q + obs7 R S T + obs8 U V W + """ + df = pd.DataFrame( + [[13, 14], [15, 16], [17, 18]], + index=["obs6", "obs7", "obs8"], + columns=["var5", "overlapping_var_mod1"], + ) + obs = pd.DataFrame( + [["O", "P", "Q"], ["R", "S", "T"], ["U", "V", "W"]], + index=df.index, + columns=["Obs4", "Obs5", "Shared_obs"], + ) + var = pd.DataFrame( + [["h", "i"], ["j", "k"]], index=df.columns, columns=["Feat3", "Shared_feat"] + ) + ad3 = ad.AnnData( + df, + obs=obs, + var=var, + uns={ + "uns_unique_to_sample2": pd.DataFrame( + ["baz"], index=["qux"], columns=["col3"] + ), + "overlapping_uns_key": pd.DataFrame( + ["ping"], index=["pong"], columns=["col4"] + ), + }, + ) + return ad3 + + +@pytest.fixture +def sample_2_modality_2(): + """ + >>> ad4.X + array([[19, 20, 21], + [22, 23, 24]]) + + >>> ad4.obs + Obs6 Shared_obs + obs8 X Y + obs9 Z AA + + >>> ad4.var + Feat4 Shared_feat + var6 l m + var7 n o + var8 p q + """ + df = pd.DataFrame( + [[19, 20, 21], [22, 23, 24]], + index=["obs8", "obs9"], + columns=["var6", "var7", "var8"], + ) + obs = pd.DataFrame( + [["X", "Y"], ["Z", "AA"]], index=df.index, columns=["Obs6", "Shared_obs"] + ) + var = pd.DataFrame( + [["l", "m"], ["n", "o"], ["p", "q"]], + index=df.columns, + columns=["Feat4", "Shared_feat"], + ) + varm = np.random.rand(df.columns.size, 3) + ad4 = ad.AnnData(df, obs=obs, var=var, varm={"random_vals_mod2": varm}) + # ad4 = ad.AnnData(df, obs=obs, var=var) + return ad4 + + +@pytest.fixture +def sample_2_h5mu(sample_2_modality_1, sample_2_modality_2): + tmp_mudata = md.MuData({"mod1": sample_2_modality_1, "mod2": sample_2_modality_2}) + return tmp_mudata + + +@pytest.fixture +def sample_3_modality_1(): + """ + >>> ad3.X + array([[25]]) + + >>> ad3.obs + Obs7 + obs10 AB + + >>> ad3.var + Feat4 + var9 r + + """ + df = pd.DataFrame([[25]], index=["obs10"], columns=["var9"]) + obs = pd.DataFrame([["AB"]], index=df.index, columns=["Obs7"]) + var = pd.DataFrame([["r"]], index=df.columns, columns=["Feat4"]) + ad5 = ad.AnnData(df, obs=obs, var=var) + return ad5 + + +@pytest.fixture +def sample_3_modality_3(): + """ + >>> ad6.X + array([[ 26, 32, 33, 453], + [ 34, 35, 36, 543]]) + + >>> ad6.var + Feat5 Feat6 Feat7 Feat8 + var10 s t u v + var11 w x y z + var12 aa ab ac ad + var13 ae af ag ah + + >>> ad6.obs + Obs8 Obs9 obs10 obs11 + obs11 AC AD AE AF + obs12 AG AH AI AJ + """ + df = pd.DataFrame( + [[26, 32, 33, 453], [34, 35, 36, 543]], + index=["obs11", "obs12"], + columns=["var10", "var11", "var12", "var13"], + ) + obs = pd.DataFrame( + [["AC", "AD", "AE", "AF"], ["AG", "AH", "AI", "AJ"]], + index=df.index, + columns=["Obs8", "Obs9", "obs10", "obs11"], + ) + var = pd.DataFrame( + [ + ["s", "t", "u", "v"], + ["w", "x", "y", "z"], + ["aa", "ab", "ac", "ad"], + ["ae", "af", "ag", "ah"], + ], + index=df.columns, + columns=["Feat5", "Feat6", "Feat7", "Feat8"], + ) + ad6 = ad.AnnData(df, obs=obs, var=var) + return ad6 + + +@pytest.fixture +def sample_3_h5mu(sample_3_modality_1, sample_3_modality_3): + tmp_mudata = md.MuData({"mod1": sample_3_modality_1, "mod3": sample_3_modality_3}) + return tmp_mudata + + +@pytest.fixture +def wrap_anndata_to_mudata(): + def wrapper(anndata_obj, mod_name="mod"): + return md.MuData({mod_name: anndata_obj}) + + return wrapper + + +@pytest.fixture +def change_column_contents(): + def wrapper(mudata_obj, annotation_frame_name, column_name, values_per_modality): + mudata_obj.update() + get_frame = attrgetter(annotation_frame_name) + modality_columns = [] + for mod_name, col_value in values_per_modality.items(): + modality = mudata_obj.mod[mod_name] + annotation_frame = get_frame(modality) + annotation_frame[column_name] = col_value + modality_columns.append(annotation_frame[column_name]) + mudata_obj.update() + global_annotation_frame = get_frame(mudata_obj) + if column_name in global_annotation_frame.columns: + updated_global_column = pd.concat(modality_columns, copy=True, join="inner") + no_duplicates = updated_global_column.reset_index().drop_duplicates( + subset=["index"] + ) + no_duplicates = no_duplicates.set_index("index") + global_annotation_frame[column_name] = no_duplicates + setattr( + mudata_obj, + annotation_frame_name, + global_annotation_frame.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ), + ) + + return wrapper + + +def test_concatenate_samples_with_same_observation_ids_raises( + run_component, + wrap_anndata_to_mudata, + write_mudata_to_file, + sample_1_modality_1, + sample_2_modality_1, + random_h5mu_path, +): + """ + Test how concat handles overlapping observation IDs. + This should raise. + """ + # introduce an overlapping observation + input_1_mudata = wrap_anndata_to_mudata(sample_1_modality_1) + old_obs_names = sample_2_modality_1.obs_names + new_obs_names = old_obs_names.where( + old_obs_names.isin([old_obs_names[0]]), sample_1_modality_1.obs.index[0] + ) + sample_2_modality_1.obs_names = new_obs_names + input_2_mudata = wrap_anndata_to_mudata(sample_2_modality_1) + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input_id", + "foo;bar", + "--input", + write_mudata_to_file(input_1_mudata), + "--input", + write_mudata_to_file(input_2_mudata), + "--output", + random_h5mu_path(), + "--other_axis_mode", + "move", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: Observations are not unique across samples." + in err.value.stdout.decode("utf-8") + ) + + +def test_concat_different_var_columns_per_sample( + run_component, sample_1_h5mu, sample_2_h5mu, random_h5mu_path, write_mudata_to_file +): + """ + Test what happens when concatenating samples with differing auxiliary + (like in .var) columns (present in 1 sample, absent in other). + When concatenating the samples, all columns should be present in the + resulting object, filling the values from samples with the missing + column with NA. + + Looking at Shared_feat here: + + mod1 mod2 + sample 1 present present + sample 2 x x + """ + output_path = random_h5mu_path() + # Before removing the 'Shared_feat' column from one of the samples, + # check if they are present in both + assert "Shared_feat" in sample_1_h5mu.var_keys() + assert "Shared_feat" in sample_2_h5mu.var_keys() + + sample_2_h5mu = remove_annotation_column(sample_2_h5mu, ["Shared_feat"], axis="var") + assert "Shared_feat" in sample_1_h5mu.var_keys() + assert "Shared_feat" not in sample_2_h5mu.var_keys() + + # 'Shared_feat' column is not missing from sample2, which is what this test is about + input_sample1_path = write_mudata_to_file(sample_1_h5mu) + input_sample2_path = write_mudata_to_file(sample_2_h5mu) + + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + input_sample1_path, + "--input", + input_sample2_path, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + + assert Path(output_path).is_file() + concatenated_data = md.read(output_path) + + data_sample1 = md.read(input_sample1_path) + data_sample2 = md.read(input_sample2_path) + + assert ( + concatenated_data.n_vars + == data_sample1.var.index.union(data_sample2.var.index).size + ) + + for mod_name in ("mod1", "mod2"): + # Check if all features are present + concatenated_mod = concatenated_data.mod[mod_name] + sample1_original_mod = data_sample1.mod[mod_name] + sample2_original_mod = data_sample2.mod[mod_name] + + original_var_keys = set( + sample1_original_mod.var_keys() + + sample2_original_mod.var_keys() + + list(sample1_original_mod.varm.keys()) + + list(sample2_original_mod.varm.keys()) + ) + + assert original_var_keys == set(concatenated_mod.varm.keys()) | set( + concatenated_mod.var.columns.tolist() + ) + + # Values from sample2 (which are also not in sample1) should have NA + non_shared_features = data_sample2.var_names.difference(data_sample1.var_names) + assert concatenated_data.var["Shared_feat"].loc[non_shared_features].isna().all() + + # Values from sample1 should not have NA, and should be equal to the original values + var_values = concatenated_data.var["Shared_feat"].loc[data_sample1.var_names] + data_sample1.var["Shared_feat"].equals(var_values) + + +def test_concat_different_columns_per_modality( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): + """ + Test what happens when concatenating samples that have auxiliary columns + that is missing in one modality compared to the other, but the the column + is missing from the same modalities in both samples. + + Looking at Shared_feat here: + + mod1 mod2 + sample 1 x present + sample 2 x present + """ + sample_2_h5mu = remove_annotation_column( + sample_2_h5mu, ["Shared_feat"], axis="var", modality_name="mod1" + ) + sample_1_h5mu = remove_annotation_column( + sample_1_h5mu, ["Shared_feat"], axis="var", modality_name="mod1" + ) + + input_sample1_path = write_mudata_to_file(sample_1_h5mu) + input_sample2_path = write_mudata_to_file(sample_2_h5mu) + + output_path = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + input_sample1_path, + "--input", + input_sample2_path, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + + assert Path(output_path).is_file() is True + concatenated_data = md.read(output_path) + + data_sample1 = md.read(str(input_sample1_path)) + data_sample2 = md.read(str(input_sample2_path)) + + # Check if all features are present + assert ( + concatenated_data.n_vars + == data_sample1.var.index.union(data_sample2.var.index).size + ) + + for mod_name in ("mod1", "mod2"): + concatenated_mod = concatenated_data.mod[mod_name] + data_sample1_mod = data_sample1.mod[mod_name] + data_sample2_mod = data_sample2.mod[mod_name] + original_var_keys = set( + data_sample1_mod.var_keys() + + data_sample2_mod.var_keys() + + list(data_sample2_mod.varm.keys()) + + list(data_sample1_mod.varm.keys()) + ) + + assert original_var_keys == set(concatenated_mod.varm.keys()) | set( + concatenated_mod.var.columns.tolist() + ) + + # Check if the shared column stays removed from modality + assert "Shared_feat" not in concatenated_data.mod["mod1"].var.columns + + # Values from modality 1 have NA + mod_1_features = data_sample1["mod1"].var_names.union( + data_sample2["mod1"].var_names + ) + assert concatenated_data.var.loc[mod_1_features, "mod2:Shared_feat"].isna().all() + + # Values from modalitu should not have NA, and should be equal to the original values + mod2_data = pd.concat( + [ + data_sample2["mod2"].var["Shared_feat"], + data_sample1["mod2"].var["Shared_feat"], + ] + ) + mod2_features = mod2_data.index + assert ( + concatenated_data.var.loc[mod2_features, "mod2:Shared_feat"] + .astype(str) + .equals(mod2_data) + ) + + +def test_concat_different_columns_per_modality_and_per_sample( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): + """ + Test what happens when concatenating samples that have auxiliary columns + that differ between the modalities and also between samples + + + Looking at 'Feat4' from sample 2 here: + mod1 mod2 + sample 1 x x + sample 2 x present + """ + + input_sample1_path = write_mudata_to_file(sample_1_h5mu) + input_sample2_path = write_mudata_to_file(sample_2_h5mu) + output_path = random_h5mu_path() + + run_component( + [ + "--input_id", + "mouse;human", + "--input", + input_sample1_path, + "--input", + input_sample2_path, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + + assert Path(output_path).is_file() + concatenated_data = md.read(output_path) + + data_sample1 = md.read(input_sample1_path) + data_sample2 = md.read(input_sample2_path) + + # Check if all features are present + assert ( + concatenated_data.n_vars + == data_sample1.var_names.union(data_sample2.var_names).size + ) + + # Check if all features are present + for mod_name in ("mod1", "mod2"): + concatenated_mod = concatenated_data.mod[mod_name] + data_sample1_mod = data_sample1.mod[mod_name] + data_sample2_mod = data_sample2.mod[mod_name] + original_var_keys = set( + data_sample1_mod.var_keys() + + data_sample2_mod.var_keys() + + list(data_sample2_mod.varm.keys()) + + list(data_sample1_mod.varm.keys()) + ) + + assert original_var_keys == set( + column_name.removeprefix("conflict_") + for column_name in concatenated_mod.varm.keys() + ) | set(concatenated_mod.var.columns.tolist()) + + assert "Shared_feat" in concatenated_data.mod["mod2"].var.columns + + # Values from modality 1 have NA + mod_1_features = data_sample1["mod1"].var_names.union( + data_sample2["mod1"].var_names + ) + assert concatenated_data.var.loc[mod_1_features, "mod2:Feat4"].isna().all() + + # Values from modality 2 should not have NA if they originate from sample2 + # These values should be equal to the original values + mod2_data = data_sample2["mod2"].var["Feat4"].rename("mod2:Feat4") + mod2_features = mod2_data.index + assert ( + concatenated_data.var.loc[mod2_features, "mod2:Feat4"] + .astype(str) + .equals(mod2_data) + ) + + # Values from modality2 should have NA if they originate from sample1 (and only from sample1) + non_shared_features = data_sample1.var_names.difference(data_sample2.var_names) + assert concatenated_data.var.loc[non_shared_features, "mod2:Feat4"].isna().all() + + +@pytest.mark.parametrize( + "test_value,test_value_dtype,expected", + [ + ("bar", "str", "bar"), + (True, pd.BooleanDtype(), True), + (1, pd.Int16Dtype(), 1), + (0.1, float, 0.1), + (0.1, np.float64, 0.1), + (np.nan, np.float64, pd.NA), + ], +) +def test_concat_remove_na( + run_component, + sample_1_h5mu, + sample_2_h5mu, + write_mudata_to_file, + random_h5mu_path, + test_value, + test_value_dtype, + expected, + change_column_contents, +): + """ + Test concatenation of samples where the column from one sample contains NA values + NA values should be removed from the concatenated result + + mod1 mod2 + sample 1 NA NA + sample 2 test_value NA + """ + change_column_contents( + sample_1_h5mu, "var", "Shared_feat", {"mod1": np.nan, "mod2": np.nan} + ) + change_column_contents( + sample_2_h5mu, "var", "Shared_feat", {"mod1": test_value, "mod2": np.nan} + ) + sample_2_h5mu.var["Shared_feat"] = sample_2_h5mu.var["Shared_feat"].astype( + test_value_dtype + ) + output_path = random_h5mu_path() + + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + + assert Path(output_path).is_file() + concatenated_data = md.read(output_path) + + # Values from modality 2 have NA + mod_2_features = sample_1_h5mu["mod2"].var_names.union( + sample_2_h5mu["mod2"].var_names + ) + assert concatenated_data.var.loc[mod_2_features, "Shared_feat"].isna().all() + + # Values from modality 1 should not have NA if they originate from sample 1 + # These values should be equal to the original values + assert sample_1_h5mu["mod1"].var["Shared_feat"].isna().all() + + # Values from modality 1 should hold a value if they originate from sample 2 + mod1_features = sample_2_h5mu["mod1"].var_names.difference(sample_1_h5mu.var_names) + if not pd.isna(expected): + assert ( + concatenated_data.var.loc[mod1_features, "Shared_feat"] == expected + ).all() + else: + assert concatenated_data.var.loc[mod1_features, "Shared_feat"].isna().all() + + # The 'Shared_feat' column for mod1 contains an overlapping feature. + # For sample 1, it is NA, for sample 2 is is filled with test value. + # The concat component should choose the test-value over NA + shared_features = sample_2_h5mu.var_names.intersection(sample_1_h5mu.var_names) + if not pd.isna(expected): + assert ( + concatenated_data.var.loc[shared_features, "Shared_feat"] == expected + ).all() + else: + assert concatenated_data.var.loc[shared_features, "Shared_feat"].isna().all() + + +def test_concat_invalid_h5_error_includes_path( + run_component, tmp_path, sample_1_h5mu, write_mudata_to_file +): + empty_file = tmp_path / "empty.h5mu" + empty_file.touch() + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input_id", + "mouse;empty", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + empty_file, + "--output", + "concat.h5mu", + "--other_axis_mode", + "move", + ] + ) + assert re.search( + rf"OSError: Failed to load .*{str(empty_file)}\. Is it a valid h5 file?", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.parametrize( + "test_value_1,value_1_dtype,test_value_2,value_2_dtype,expected", + [ + (1, float, "1", str, pd.CategoricalDtype(categories=["1.0", "1"])), + (1, np.float64, "1", str, pd.CategoricalDtype(categories=["1.0", "1"])), + (1, pd.Int16Dtype(), 2.0, pd.Int16Dtype(), pd.Int64Dtype()), + (True, bool, False, bool, pd.BooleanDtype()), + (True, pd.BooleanDtype(), False, bool, pd.BooleanDtype()), + ("foo", str, "bar", str, pd.CategoricalDtype(categories=["bar", "foo"])), + ], +) +def test_concat_dtypes_per_modality( + run_component, + write_mudata_to_file, + change_column_contents, + sample_1_h5mu, + sample_2_h5mu, + test_value_1, + value_1_dtype, + test_value_2, + value_2_dtype, + expected, + random_h5mu_path, +): + """ + Test joining column with different dtypes to make sure that they are writable. + The default path is to convert all non-na values to strings and wrap the column into a categorical dtype. + Here, we test on the level of a single modality only. Because the mod1 modality for both sample 1 and + sample 2 contain a column 'test_col' and there is an overlapping feature name (overlapping_var_mod1), + there is a conflict for this var column in mod 1 for this column. Upon concatenation, the column is moved + to .varm, but for mod1 only. The column is concatenated for mod2 as planned. Here we check if the results + for the test column in mod2 is still writable. + """ + change_column_contents( + sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1} + ) + sample_1_h5mu.var["test_col"] = sample_1_h5mu.var["test_col"].astype(value_1_dtype) + change_column_contents( + sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2} + ) + sample_2_h5mu.var["test_col"] = sample_2_h5mu.var["test_col"].astype(value_2_dtype) + + output_file = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_file, + "--other_axis_mode", + "move", + ] + ) + concatenated_data = md.read(output_file) + assert concatenated_data["mod2"].var["test_col"].dtype == expected + + +@pytest.mark.parametrize( + "test_value,value_dtype,expected", + [ + (1, float, pd.Int64Dtype()), + (1, np.float64, pd.Int64Dtype()), + (1, pd.Int16Dtype(), pd.Int16Dtype()), + (True, bool, pd.BooleanDtype()), + (True, pd.BooleanDtype(), pd.BooleanDtype()), + ("foo", str, pd.CategoricalDtype(categories=["foo"])), + ], +) +def test_concat_dtypes_per_modality_multidim( + run_component, + write_mudata_to_file, + sample_1_h5mu, + sample_2_h5mu, + test_value, + value_dtype, + expected, + random_h5mu_path, +): + """ + Test if the result of concatenation is still writable when the input already contain + data in .varm and this data is kept. Because we are joining observations, the dtype of this + data may change and the result might not be writable anymore + """ + + sample_1_h5mu["mod1"].varm["test_df"] = pd.DataFrame( + index=sample_1_h5mu["mod1"].var_names + ) + sample_1_h5mu["mod1"].varm["test_df"]["test_col"] = test_value + sample_1_h5mu["mod1"].varm["test_df"]["test_col"] = ( + sample_1_h5mu["mod1"].varm["test_df"]["test_col"].astype(value_dtype) + ) + + output_file = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_file, + "--other_axis_mode", + "move", + ] + ) + concatenated_data = md.read(output_file) + assert concatenated_data["mod1"].varm["test_df"]["test_col"].dtype == expected + + +@pytest.mark.parametrize( + "test_value_1,test_value_2,expected", + [(1, "1", pd.CategoricalDtype(categories=["1.0", "1"]))], +) +def test_concat_dtypes_global( + run_component, + write_mudata_to_file, + change_column_contents, + sample_1_h5mu, + sample_2_h5mu, + test_value_1, + test_value_2, + expected, + random_h5mu_path, +): + """ + Test joining column with different dtypes to make sure that they are writable. + The default path is to convert all non-na values to strings and wrap the column into a categorical dtype. + Here, we test on the level of a column that is added to a global annotation matrix. + """ + change_column_contents( + sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1} + ) + change_column_contents( + sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2} + ) + sample1_mod1_names = sample_2_h5mu["mod1"].var_names + # Here, we avoid a conflict between sample 1 and sample 2 by making sure there is no overlap in features + # between sample 1 and sample 2 (no shared var_names). If this change would not be done, a different + # value for sample 1 and sample 2 would be found by the concat component for the var feature + # 'overlapping_var_mod1' for modality 'mod1'. The concat component would move the column for mod1 to + # .varm because of this conflict, and in the global .var column of the concatenated object, only + # a 'mod2:test_col' column would be present. But here, we want to test the column that is populated by + # both 'mod1' and 'mod2' + assert "overlapping_var_mod1" in sample1_mod1_names + new_names = sample1_mod1_names.where( + ~sample1_mod1_names.isin(["overlapping_var_mod1"]), "non_overlapping" + ) + sample_2_h5mu["mod1"].var_names = new_names + sample_2_h5mu.update() + output_file = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_file, + "--other_axis_mode", + "move", + ] + ) + concatenated_data = md.read(output_file) + assert concatenated_data.var["test_col"].dtype == expected + + +def test_non_overlapping_modalities( + run_component, sample_2_h5mu, sample_3_h5mu, random_h5mu_path, write_mudata_to_file +): + """ + Test that the component does not fail when the modalities are not shared between samples. + """ + output_path = random_h5mu_path() + input_file_2 = write_mudata_to_file(sample_2_h5mu) + input_file_3 = write_mudata_to_file(sample_3_h5mu) + + run_component( + [ + "--input_id", + "sample2;sample3", + "--input", + input_file_2, + "--input", + input_file_3, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + output_data = md.read(output_path) + assert set(output_data.mod.keys()) == {"mod1", "mod2", "mod3"} + + +def test_resolve_annotation_conflict_missing_column( + run_component, + sample_1_h5mu, + sample_2_h5mu, + sample_3_h5mu, + write_mudata_to_file, + random_h5mu_path, +): + """ + Test using mode 'move' and resolving a conflict in metadata between the samples, + but the metadata column is missing in one of the samples. + """ + output_path = random_h5mu_path() + input_file_1 = write_mudata_to_file(sample_1_h5mu) + input_file_2 = write_mudata_to_file(sample_2_h5mu) + input_file_3 = write_mudata_to_file(sample_3_h5mu) + + run_component( + [ + "--input_id", + "sample1;sample2;sample3", + "--input", + input_file_1, + "--input", + input_file_2, + "--input", + input_file_3, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + + concatenated_data = md.read(output_path) + # 'Shared_feat' is defined for mod1 in sample 1 and 2 and there is a conflict + assert "conflict_Shared_feat" in concatenated_data["mod1"].varm + # 'Shared_feat' is defined for mod2 in sample 1 and 2 and there is no conflict + assert "Shared_feat" in concatenated_data["mod2"].var.columns + # 'Shared_feat' is not defined in any of the samples samples for modality 3 + assert "Shared_feat" not in concatenated_data["mod3"].var.columns + assert "Shared_feat" not in concatenated_data["mod3"].varm + + +def test_mode_move( + run_component, sample_1_h5mu, sample_2_h5mu, random_h5mu_path, write_mudata_to_file +): + """ + Test that in case of a conflict, the conflicting columns are move to the multidimensional annotation slot + (.varm and .obsm). The key of the datafame in the slot should start with 'conflict_' followed by the name + of the column and the columns of the dataframe should contain the sample names. + """ + output_path = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + assert output_path.is_file() + concatenated_data = md.read(output_path) + + # Check if observations from all of the samples are present + assert concatenated_data.n_obs == sample_1_h5mu.n_obs + sample_2_h5mu.n_obs + + # Check if all modalities are present + sample1_mods, sample2_mods = ( + set(sample_1_h5mu.mod.keys()), + set(sample_2_h5mu.mod.keys()), + ) + concatentated_mods = set(concatenated_data.mod.keys()) + assert (sample1_mods | sample2_mods) == concatentated_mods + + varm_check = { + "mod1": ({"conflict_Shared_feat": ("sample1", "sample2")}), + "mod2": {}, + } + + # Check if all features are present + for mod_name in ("mod1", "mod2"): + concatenated_mod = concatenated_data.mod[mod_name] + sample_1_mod = sample_1_h5mu.mod[mod_name] + sample_2_mod = sample_2_h5mu.mod[mod_name] + original_varm_keys = set( + list(sample_1_mod.varm.keys()) + list(sample_2_mod.varm.keys()) + ) + original_var_keys = ( + set(sample_1_mod.var_keys() + sample_2_mod.var_keys()) | original_varm_keys + ) + + assert original_var_keys == set( + column_name.removeprefix("conflict_") + for column_name in concatenated_mod.varm.keys() + ) | set(concatenated_mod.var.columns.tolist()) + + varm_expected = varm_check[mod_name] + assert set(concatenated_mod.varm.keys()) == set( + varm_expected.keys() | original_varm_keys + ) + for varm_key, expected_columns in varm_expected.items(): + assert tuple(concatenated_mod.varm[varm_key].columns) == expected_columns + if not varm_expected: + assert set(concatenated_mod.varm.keys()) == original_varm_keys + assert concatenated_mod.obsm == {} + + +# Execute this test multiple times, anndata.concat sometimes returns the observations in a different order +@pytest.mark.parametrize("_", range(10)) +def test_concat_var_obs_names_order( + run_component, + sample_1_h5mu, + sample_2_h5mu, + write_mudata_to_file, + random_h5mu_path, + _, +): + """ + Test that the var_names and obs_names are still linked to the correct count data. + """ + output_path = random_h5mu_path() + sample_1_h5mu["mod1"].obs["sample_id"] = "sample1" + sample_1_h5mu["mod2"].obs["sample_id"] = "sample1" + sample_2_h5mu["mod1"].obs["sample_id"] = "sample2" + sample_2_h5mu["mod2"].obs["sample_id"] = "sample2" + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) + assert output_path.is_file() + for sample_name, sample_h5mu in { + "sample1": sample_1_h5mu, + "sample2": sample_2_h5mu, + }.items(): + for mod_name in ["mod1", "mod2"]: + data_sample = sample_h5mu[mod_name].copy() + processed_data_ad = md.read_h5ad(output_path, mod=mod_name) + processed_data_ad = processed_data_ad[ + processed_data_ad.obs["sample_id"] == sample_name + ] + processed_data_ad = processed_data_ad[:, data_sample.var_names] + processed_data = pd.DataFrame( + processed_data_ad.X, + index=processed_data_ad.obs_names, + columns=processed_data_ad.var_names, + ) + data_sample = pd.DataFrame( + data_sample.X, + index=data_sample.obs_names, + columns=data_sample.var_names, + ).reindex_like(processed_data) + pd.testing.assert_frame_equal( + processed_data, data_sample, check_dtype=False + ) + + +def test_keep_uns( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): + sample_1_h5mu.uns["global_uns_sample1"] = "dolor" + sample_1_h5mu.uns["overlapping_global"] = "sed" + sample_2_h5mu.uns["global_uns_sample2"] = "amet" + sample_2_h5mu.uns["overlapping_global"] = "elit" + output_path = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + "--uns_merge_mode", + "make_unique", + ] + ) + assert output_path.is_file() + concatenated_data = md.read(output_path) + mod1 = concatenated_data.mod["mod1"] + mod2 = concatenated_data.mod["mod2"] + assert set(concatenated_data.uns.keys()) == set( + [ + "global_uns_sample1", + "global_uns_sample2", + "sample1_overlapping_global", + "sample2_overlapping_global", + ] + ) + assert set(mod1.uns.keys()) == set( + [ + "sample1_overlapping_uns_key", + "uns_unique_to_sample1", + "sample2_overlapping_uns_key", + "uns_unique_to_sample2", + ] + ) + assert set(mod2.uns.keys()) == set() + + +def test_subset_modalities( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): + """ + Test if outputting a subset of the modalities works + """ + output_path = random_h5mu_path() + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--modality", + "mod1", + "--other_axis_mode", + "move", + "--uns_merge_mode", + "make_unique", + ] + ) + assert output_path.is_file() + concatenated_data = md.read(output_path) + assert set(concatenated_data.mod.keys()) == set(["mod1"]) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"])) diff --git a/src/dataflow/merge/config.vsh.yml b/src/dataflow/merge/config.vsh.yml new file mode 100644 index 00000000..fbd17393 --- /dev/null +++ b/src/dataflow/merge/config.vsh.yml @@ -0,0 +1,54 @@ +name: merge +namespace: "dataflow" +scope: "public" +description: | + Combine one or more single-modality .h5mu files together into one .h5mu file. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + multiple: true + description: Paths to the single-modality .h5mu files that need to be combined + required: true + default: sample_paths + - name: "--output" + description: Path to the output file. + alternatives: ["-o"] + type: file + direction: output + default: "output.h5mu" + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: ../../../resources_test/merge_test_data/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu + - path: ../../../resources_test/merge_test_data/pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, highmem] \ No newline at end of file diff --git a/src/dataflow/merge/script.py b/src/dataflow/merge/script.py new file mode 100644 index 00000000..97665c1e --- /dev/null +++ b/src/dataflow/merge/script.py @@ -0,0 +1,84 @@ +from __future__ import annotations +import sys +import mudata as md +import pandas as pd +import numpy as np + + +### VIASH START +par = { + "input": [ + "./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu", + "./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu", + ], + "output": "foo.h5mu", + "output_compression": None, +} +meta = {"resources_dir": "src/utils"} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input files %s", ",".join(par["input"])) + input_samples = [md.read_h5mu(path) for path in par["input"]] + + logger.info("Merging into single object.") + sample_modalities = {} + for input_sample in input_samples: + for mod_name, mod_data in input_sample.mod.items(): + if mod_name in sample_modalities: + raise ValueError( + f"Modality '{mod_name}' was found in more than 1 sample." + ) + sample_modalities[mod_name] = mod_data + + merged = md.MuData(sample_modalities) + merged.update() + for df_attr in ("var", "obs"): + df = getattr(merged, df_attr) + df = df.replace({pd.NA: np.nan}, inplace=False) + + # MuData supports nullable booleans and ints + # ie. `IntegerArray` and `BooleanArray` + df = df.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # pd.convert_dtypes does not convert already existing nullable dtypes + # to their native numpy dtypes. + # At this point, integer and boolean columns use a nullable dtype; and string columns + # might use 'object' or pd.StringDtype. This is OK. + # However, for example floating number columns might still use a nullable dtype + # from before the call to convert_dtypes. So we need to explicitly cast. + col_dtypes = df.select_dtypes( + exclude=["integer", "string", "object", "boolean"] + ).dtypes.to_dict() + numpy_dtypes = { + col_name: np.dtype(col_dtype.kind) + for col_name, col_dtype in col_dtypes.items() + if pd.api.types.is_extension_array_dtype(col_dtype) + } + df = df.astype(numpy_dtypes) + + # Convert leftover 'object' columns to string + object_cols = df.select_dtypes(include="object").columns.values + for obj_col in object_cols: + df[obj_col] = df[obj_col].astype(str).astype("category") + + setattr(merged, df_attr, df) + + merged.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() diff --git a/src/dataflow/merge/test.py b/src/dataflow/merge/test.py new file mode 100644 index 00000000..2ba331da --- /dev/null +++ b/src/dataflow/merge/test.py @@ -0,0 +1,251 @@ +import sys +import pytest +import subprocess +from mudata import MuData, read_h5mu +import pandas as pd +import numpy as np +import re +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/dataflow/merge/merge", + "resources_dir": "./resources_test/merge_test_data/", + "config": "./src/dataflow/merge/config.vsh.yml", +} +## VIASH END + + +@pytest.fixture +def mudata_non_overlapping_observations(request, random_h5mu_path): + mudata_to_change_path = request.getfixturevalue(request.param) + # mudata_to_change_path = small_mudata_mod1_path + temp_h5mu = random_h5mu_path() + mudata_to_change = read_h5mu(mudata_to_change_path) + # Remove 1 observation + removed_observation_name = mudata_to_change.obs.index[-1] + mudata_to_change = mudata_to_change[: mudata_to_change.n_obs - 1] + mudata_to_change.write(temp_h5mu, compression="gzip") + return temp_h5mu, removed_observation_name + + +@pytest.fixture +def extra_var_column_value(): + return "bar" + + +@pytest.fixture +def extra_var_column_name(): + return "test" + + +@pytest.fixture +def mudata_with_extra_var_column( + random_h5mu_path, request, extra_var_column_value, extra_var_column_name +): + [sample1_path, sample2_path] = request.getfixturevalue(request.param) + result = [] + for sample_path, column_value in ( + (sample1_path, extra_var_column_value), + (sample2_path, np.nan), + ): + sample = read_h5mu(sample_path) + mod_names = list(sample.mod.keys()) + assert len(mod_names) == 1 + mod_name = mod_names[0] + sample.mod[mod_name].var[extra_var_column_name] = column_value + sample.var = sample.var.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + new_path = random_h5mu_path() + sample.write(new_path) + result.append(new_path) + return result + + +def test_merge( + run_component, random_h5mu_path, small_mudata_mod1_path, small_mudata_mod2_path +): + """ + Test a simple merge with fully overlapping observations + """ + output_path = random_h5mu_path() + # input_sample1_path, input_sample2_path = split_small_mudata_path + args = [ + "--input", + small_mudata_mod1_path, + "--input", + small_mudata_mod2_path, + "--output", + output_path, + "--output_compression", + "gzip", + ] + run_component(args) + + assert output_path.is_file() + concatenated_data = read_h5mu(output_path) + data_sample1 = read_h5mu(small_mudata_mod1_path) + data_sample2 = read_h5mu(small_mudata_mod2_path) + + expected_concatenated_data = MuData( + {"mod1": data_sample1.mod["mod1"], "mod2": data_sample2.mod["mod2"]} + ) + assert_annotation_objects_equal(concatenated_data, expected_concatenated_data) + + +@pytest.mark.parametrize( + "mudata_non_overlapping_observations", + ["small_mudata_mod1_path"], + indirect=["mudata_non_overlapping_observations"], +) +def test_merge_non_overlapping_observations( + run_component, + mudata_non_overlapping_observations, + random_h5mu_path, + small_mudata_mod2_path, +): + """ + Merge with differing observations in the samples + """ + edited_h5mu_path, removed_observation_name = mudata_non_overlapping_observations + output_path = random_h5mu_path() + # Remove 1 observation + run_component( + [ + "--input", + edited_h5mu_path, + "--input", + small_mudata_mod2_path, + "--output", + output_path, + ] + ) + + assert output_path.is_file() + concatenated_data = read_h5mu(output_path, backed=False) + data_sample1 = read_h5mu(edited_h5mu_path, backed=False) + data_sample2 = read_h5mu(small_mudata_mod2_path, backed=False) + + expected_concatenated_data = MuData( + {"mod1": data_sample1.mod["mod1"], "mod2": data_sample2.mod["mod2"]} + ) + + assert set(concatenated_data.obs_names) == ( + set(data_sample1.obs_names) | set(data_sample2.obs_names) + ) + assert concatenated_data[removed_observation_name:]["mod1"].n_obs == 0 + assert concatenated_data[removed_observation_name:]["mod2"].n_obs == 1 + + np.testing.assert_equal( + concatenated_data.copy()[removed_observation_name:]["mod2"].X.data, + data_sample2.copy()[removed_observation_name:]["mod2"].X.data, + ) + + assert_annotation_objects_equal(concatenated_data, expected_concatenated_data) + + +@pytest.mark.parametrize( + "extra_var_column_name,extra_var_column_value,expected", + [ + ("test", "bar", "bar"), + ("test", True, True), + ("test", 0.1, 0.1), + ("test", np.nan, pd.NA), + ], +) +@pytest.mark.parametrize( + "mudata_with_extra_var_column", + ["split_small_mudata_path"], + indirect=["mudata_with_extra_var_column"], +) +def test_boolean_and_na_types( + run_component, + mudata_with_extra_var_column, + extra_var_column_name, + expected, + random_h5mu_path, +): + """ + Test if merging booleans of NAs results in the .var .obs column being writeable + """ + input_sample1_path, input_sample2_path = mudata_with_extra_var_column + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_sample1_path, + "--input", + input_sample2_path, + "--output", + output_path, + ] + ) + assert output_path.is_file() + merged_data = read_h5mu(output_path, backed=False) + first_sample_mod = list(read_h5mu(input_sample1_path).mod)[0] + second_sample_mod = list(read_h5mu(input_sample2_path).mod)[0] + + expected_merged_data = MuData( + { + "mod1": read_h5mu(input_sample1_path).mod["mod1"], + "mod2": read_h5mu(input_sample2_path).mod["mod2"], + } + ) + + if not pd.isna(expected): + assert merged_data.var.loc["var1"][extra_var_column_name] == expected + assert ( + merged_data.mod[first_sample_mod].var.loc["var1"][extra_var_column_name] + == expected + ) + else: + assert pd.isna(merged_data.var.loc["var1"][extra_var_column_name]) + assert pd.isna( + merged_data.mod[first_sample_mod].var.loc["var1"][extra_var_column_name] + ) + assert pd.isna(merged_data.var.loc["var4"][extra_var_column_name]) + assert pd.isna( + merged_data.mod[second_sample_mod].var.loc["var4"][extra_var_column_name] + ) + + assert_annotation_objects_equal(merged_data, expected_merged_data) + + +def test_same_modalities_raises( + run_component, random_h5mu_path, split_small_mudata_path +): + """ + Raise when trying to merge modalities with the same name. + """ + input_sample1_path, input_sample2_path = split_small_mudata_path + input_sample2_edited_path = random_h5mu_path() + output_path = random_h5mu_path() + data_sample2 = read_h5mu(input_sample2_path) + data_sample2 = MuData({"mod1": data_sample2.mod["mod2"]}) + data_sample2.write(input_sample2_edited_path, compression="gzip") + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_sample1_path, + "--input", + input_sample2_edited_path, + "--output", + output_path, + ] + ) + assert re.search( + r"ValueError: Modality 'mod1' was found in more than 1 sample\.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dataflow/split_h5mu/config.vsh.yaml b/src/dataflow/split_h5mu/config.vsh.yaml new file mode 100644 index 00000000..0bf8dab7 --- /dev/null +++ b/src/dataflow/split_h5mu/config.vsh.yaml @@ -0,0 +1,73 @@ +name: split_h5mu +namespace: "dataflow" +scope: "public" +description: | + Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] +argument_groups: +- name: Input & specifications + arguments: + - name: "--input" + type: file + description: Path to a single .h5mu file. + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obs_feature" + type: string + required: true + description: The .obs column to split the mudata on. + example: "celltype" + - name: "--drop_obs_nan" + type: boolean_true + description: Whether to drop all .obs columns that contain only nan values after splitting. + - name: "--ensure_unique_filenames" + type: boolean_true + description: Append number suffixes to ensure unique filenames after sanitizing obs feature values. + +- name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + example: "/path/to/output" + description: Output directory containing multiple h5mu files. + - name: "--output_files" + type: file + required: true + direction: output + example: sample_files.csv + description: A csv containing the base filename and obs feature by which it was split. + __merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ lowcpu, highmem, highdisk] diff --git a/src/dataflow/split_h5mu/script.py b/src/dataflow/split_h5mu/script.py new file mode 100644 index 00000000..607c002c --- /dev/null +++ b/src/dataflow/split_h5mu/script.py @@ -0,0 +1,114 @@ +import sys +import mudata as mu +import pandas as pd +import re +import gc +from pathlib import Path +from collections import defaultdict + +### VIASH START +par = { + "input": "harmony_knn/integrated.pynndescent_knn.output", + "modality": "rna", + "obs_feature": "dataset", + "output": "reference_download/sample_split", + "drop_obs_nan": "true", + "output_compression": None, + "output_files": "reference_download/sample_files.csv", + "ensure_unique_filenames": True, +} +import anndata as ad + +df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] +) +var3 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) +obs3 = pd.DataFrame(["C C", "C_C"], index=df.index, columns=["Obs"]) +ad3 = ad.AnnData(df, obs=obs3, var=var3) +mdata = mu.MuData({"rna": ad3}) +mdata.write_h5mu("test_san.h5mu") +par["input"] = "test_san.h5mu" +par["obs_feature"] = "Obs" +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info(f"Reading {par['input']}") + input_file = Path(par["input"].strip()) + + mdata = mu.read_h5mu(input_file) + adata = mdata.mod[par["modality"]] + + logger.info(f"Reading unique features from {par['obs_feature']}") + obs_features = adata.obs[par["obs_feature"]].unique().tolist() + + # sanitize --obs_feature values + obs_features_s = [re.sub(r"[-\s]", "_", str(s).strip()) for s in obs_features] + obs_features_s = [re.sub(r"[^A-Za-z0-9_]", "", s) for s in obs_features_s] + + # ensure that names are unique, if not raise or append number as suffix + if not len(obs_features_s) == len(set(obs_features_s)): + if not par["ensure_unique_filenames"]: + raise ValueError( + f"File names are not unique after sanitizing the --obs_feature {par['obs_feature']} values" + ) + + logger.info("Ensuring unique names for par['obs_feature']") + counts = defaultdict(lambda: -1) + for i, feature in enumerate(obs_features_s): + counts[feature] += 1 + if (curr_counts := counts[feature]) > 0: + obs_features_s[i] += f"_{curr_counts}" + + # generate output dir + output_dir = Path(par["output"]) + if not output_dir.is_dir(): + output_dir.mkdir(parents=True) + + # split modality of mdata file base on obs_feature + logger.info(f"Splitting file based on {par['obs_feature']} values {obs_features}") + obs_files = [] + + for obs_name, file_name in zip(obs_features, obs_features_s): + logger.info( + f"Filtering modality '{par['modality']}' observations by .obs['{par['obs_feature']}'] == {obs_name}" + ) + mdata_obs = mdata.copy() + adata_obs = mdata_obs.mod[par["modality"]] + + # split the samples + adata_obs = adata_obs[adata_obs.obs[par["obs_feature"]] == obs_name] + mdata_obs_name = f"{input_file.stem}_{file_name}.h5mu" + obs_files.append(mdata_obs_name) + + # Dropping columns that only have nan values after splitting + if par["drop_obs_nan"]: + logger.info("Dropping all .obs columns with NaN values") + adata_obs.obs.dropna(axis=1, how="all", inplace=True) + + # replace mdata file with modality adata contianing split samples + logger.info( + f"Writing h5mu filtered for {par['obs_feature']} {obs_name} to file {output_dir / mdata_obs_name}" + ) + mdata_obs.mod[par["modality"]] = adata_obs + mdata_obs.write_h5mu( + output_dir / mdata_obs_name, compression=par["output_compression"] + ) + + # avoid keeping files in memory + del mdata_obs + del adata_obs + gc.collect() + + logger.info(f"Writing output_files CSV file to {par['output_files']}") + df = pd.DataFrame({"name": obs_features_s, "filename": obs_files}) + df.to_csv(par["output_files"], index=False) + + +if __name__ == "__main__": + main() diff --git a/src/dataflow/split_h5mu/test.py b/src/dataflow/split_h5mu/test.py new file mode 100644 index 00000000..6fdab5cd --- /dev/null +++ b/src/dataflow/split_h5mu/test.py @@ -0,0 +1,288 @@ +import sys +from textwrap import dedent +import pytest +import mudata as mu +import anndata as ad +import numpy as np +import pandas as pd +import subprocess +import re + + +@pytest.fixture +def input_modality_1(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame({"Obs": ["A", "B"], "Obs_nan": [np.nan, np.nan]}, index=df.index) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) + ad1 = ad.AnnData(df, obs=obs, var=var) + return ad1 + + +@pytest.fixture +def input_modality_2(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = ad.AnnData(df, obs=obs2, var=var2) + return ad2 + + +@pytest.fixture +def input_modality_3(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + var3 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs3 = pd.DataFrame(["C C", "C_C"], index=df.index, columns=["Obs"]) + ad3 = ad.AnnData(df, obs=obs3, var=var3) + return ad3 + + +@pytest.fixture +def input_h5mu(input_modality_1, input_modality_2): + tmp_mudata = mu.MuData({"mod1": input_modality_1, "mod2": input_modality_2}) + return tmp_mudata + + +@pytest.fixture +def input_h5mu_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +@pytest.fixture +def input_h5mu_non_unique_filenames(input_modality_3): + tmp_mudata = mu.MuData({"mod3": input_modality_3}) + return tmp_mudata + + +@pytest.fixture +def input_h5mu_path_non_unique_filenames( + write_mudata_to_file, input_h5mu_non_unique_filenames +): + return write_mudata_to_file(input_h5mu_non_unique_filenames) + + +def test_sample_split(run_component, random_path, input_h5mu, input_h5mu_path): + output_dir = random_path() + output_files = random_path(extension="csv") + args = [ + "--input", + input_h5mu_path, + "--output", + str(output_dir), + "--modality", + "mod1", + "--obs_feature", + "Obs", + "--output_files", + str(output_files), + ] + + run_component(args) + assert output_files.is_file() + assert output_dir.is_dir() + + # check output dir and file names + dir_content = [ + h5mu_file + for h5mu_file in output_dir.iterdir() + if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path + ] + s1_file = output_dir / f"{input_h5mu_path.stem}_A.h5mu" + s2_file = output_dir / f"{input_h5mu_path.stem}_B.h5mu" + assert set(dir_content) == set([s1_file, s2_file]) + + # check that number of modalities, variables and observations + s1 = mu.read_h5mu(s1_file) + s2 = mu.read_h5mu(s2_file) + assert s1.n_mod == 2 + assert s2.n_mod == 2 + + assert s1.n_obs == input_h5mu.n_obs, ( + "number of observations of split file does not match input file" + ) + assert s2.n_obs == input_h5mu.n_obs, ( + "number of observations of split file does not match input file" + ) + + assert s1.mod["mod1"].n_obs == 1, ( + "number of observations of split file s1 modality mod1 should equal 1" + ) + assert s1.mod["mod2"].n_obs == input_h5mu.n_obs, ( + "number of observations of split file s1 modality mod2 should equal input file" + ) + + assert len(s1.mod["mod1"].obs.keys()) == 2, ( + "number of observation keys split file s1 modality mod1 should equal 2" + ) + assert len(s1.mod["mod2"].obs.keys()) == 1, ( + "number of observation keys split file s1 modality mod2 should equal 1" + ) + + assert s2.mod["mod1"].n_obs == 1, ( + "number of observations of split file s2 modality mod1 should equal 1" + ) + assert s2.mod["mod2"].n_obs == input_h5mu.n_obs, ( + "number of observations of split file s2 modality mod2 should equal input file" + ) + + assert s1.n_vars == input_h5mu.n_vars, ( + "number of variables of split file s1 should equal input file" + ) + assert s2.n_vars == input_h5mu.n_vars, ( + "number of variables of split file s1 should equal input file" + ) + + assert s1.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars, ( + "number of variables of split file s1 modalitty mod1 should equal input file" + ) + assert s1.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars, ( + "number of variables of split file s1 modalitty mod2 should equal input file" + ) + + assert s2.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars, ( + "number of variables of split file s2 modalitty mod1 should equal input file" + ) + assert s2.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars, ( + "number of variables of split file s2 modalitty mod2 should equal input file" + ) + + # check correct sample splitting + assert np.all(s1.mod["mod1"].obs["Obs"] == "A"), ( + "observation of .obs Obs in s1 should equal A" + ) + assert np.all(s2.mod["mod1"].obs["Obs"] == "B"), ( + "observation of .obs Obs in s2 should equal B" + ) + + # Check contents of csv file + expected_csv_output = dedent( + f"""\ + name,filename + A,{s1_file.name} + B,{s2_file.name} + """ + ) + with open(output_files, "r") as open_csv_file: + result = open_csv_file.read() + assert result == expected_csv_output + + +def test_sample_split_dropna(run_component, random_path, input_h5mu, input_h5mu_path): + output_dir = random_path() + output_files = random_path(extension="csv") + args = [ + "--input", + input_h5mu_path, + "--output", + str(output_dir), + "--modality", + "mod1", + "--obs_feature", + "Obs", + "--drop_obs_nan", + "true", + "--output_files", + str(output_files), + ] + + run_component(args) + + # check output dir and file names + s1_file = output_dir / f"{input_h5mu_path.stem}_A.h5mu" + s2_file = output_dir / f"{input_h5mu_path.stem}_B.h5mu" + + # check that .obs columns with only nan values are dropped correctly + s1 = mu.read_h5mu(s1_file) + s2 = mu.read_h5mu(s2_file) + + assert s1.n_obs == input_h5mu.n_obs, ( + "number of observations of split file does not match input file" + ) + assert s2.n_obs == input_h5mu.n_obs, ( + "number of observations of split file does not match input file" + ) + + assert s1.mod["mod1"].n_obs == 1, ( + "number of observations of split file s1 modality mod1 should equal 1" + ) + assert s1.mod["mod2"].n_obs == input_h5mu.n_obs, ( + "number of observations of split file s1 modality mod2 should equal input file" + ) + + assert len(s1.mod["mod1"].obs.keys()) == 1, ( + "number of observation keys split file s1 modality mod1 should equal 1" + ) + assert len(s1.mod["mod2"].obs.keys()) == 1, ( + "number of observation keys split file s1 modality mod2 should equal 1" + ) + + +def test_sanitizing(run_component, random_path, input_h5mu_path_non_unique_filenames): + output_dir = random_path() + output_files = random_path(extension="csv") + + args = [ + "--input", + input_h5mu_path_non_unique_filenames, + "--output", + str(output_dir), + "--modality", + "mod3", + "--obs_feature", + "Obs", + "--drop_obs_nan", + "true", + "--output_files", + str(output_files), + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: File names are not unique after sanitizing the --obs_feature Obs values", + err.value.stdout.decode("utf-8"), + ) + + args_san = [ + "--input", + input_h5mu_path_non_unique_filenames, + "--output", + str(output_dir), + "--modality", + "mod3", + "--obs_feature", + "Obs", + "--drop_obs_nan", + "true", + "--output_files", + str(output_files), + "--ensure_unique_filenames", + "true", + ] + + run_component(args_san) + + # check output dir and file names + dir_content = [ + h5mu_file + for h5mu_file in output_dir.iterdir() + if h5mu_file.suffix == ".h5mu" + and h5mu_file != input_h5mu_path_non_unique_filenames + ] + s1_file = output_dir / f"{input_h5mu_path_non_unique_filenames.stem}_C_C.h5mu" + s2_file = output_dir / f"{input_h5mu_path_non_unique_filenames.stem}_C_C_1.h5mu" + + assert s1_file.is_file(), f"{s1_file} does not exist" + assert s2_file.is_file(), f"{s2_file} does not exist" + assert set(dir_content) == set([s1_file, s2_file]), ( + "Output files do not match file names in csv" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dataflow/split_modalities/config.vsh.yaml b/src/dataflow/split_modalities/config.vsh.yaml new file mode 100644 index 00000000..03a7f000 --- /dev/null +++ b/src/dataflow/split_modalities/config.vsh.yaml @@ -0,0 +1,56 @@ +name: split_modalities +namespace: "dataflow" +scope: "public" +description: | + Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to a single .h5mu file. + required: true + default: sample_path + - name: "--output" + alternatives: ["-o"] + type: file + required: true + direction: output + example: "/path/to/output" + description: Output directory containing multiple h5mu files. + - name: "--output_types" + type: file + required: true + direction: output + example: types.csv + description: A csv containing the base filename and modality type per output file. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] diff --git a/src/dataflow/split_modalities/script.py b/src/dataflow/split_modalities/script.py new file mode 100644 index 00000000..198baf68 --- /dev/null +++ b/src/dataflow/split_modalities/script.py @@ -0,0 +1,65 @@ +from __future__ import annotations +import sys +import mudata as md +from pathlib import Path +import pandas as pd + +### VIASH START +par = { + "input": "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "foo/", + "output_types": "foo_types.csv", + "output_compression": "gzip", +} +meta = {"resources_dir": "."} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main() -> None: + output_dir = Path(par["output"]) + logger.info("Creating output directory '%s' if it does not exist", output_dir) + if not output_dir.is_dir(): + logger.info("Creating %s", output_dir) + output_dir.mkdir(parents=True) + + logger.info("Reading input file '%s'", par["input"]) + input_file = Path(par["input"].strip()) + sample = md.read_h5mu(input_file) + + logger.info("Creating output types CSV.") + modalities = list(sample.mod.keys()) + + logger.info("Found the following modalities:\n%s", "\n".join(modalities)) + names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu" for mod_name in modalities} + output_files = list(names.values()) + logger.info( + "Will be creating the following output .h5mu files:\n%s", + "\n".join(output_files), + ) + df = pd.DataFrame({"name": modalities, "filename": output_files}) + logger.info("Writing output_types CSV file to '%s'.", par["output_types"]) + df.to_csv(par["output_types"], index=False) + + logger.info("Splitting input file into unimodal output files.") + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + new_sample = md.MuData({mod_name: mod}) + logger.info( + "Writing to '%s', with compression '%s'", + names[mod_name], + par["output_compression"], + ) + new_sample.write_h5mu( + output_dir / names[mod_name], compression=par["output_compression"] + ) + logger.info("Done writing output file.") + logger.info("Finished") + + +if __name__ == "__main__": + main() diff --git a/src/dataflow/split_modalities/test.py b/src/dataflow/split_modalities/test.py new file mode 100644 index 00000000..69c6901a --- /dev/null +++ b/src/dataflow/split_modalities/test.py @@ -0,0 +1,123 @@ +import sys +import pytest +import mudata as md +import anndata as ad +import pandas as pd +import re +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from textwrap import dedent + +## VIASH START +meta = { + "name": "./target/native/dataflow/split_modalities/split_modalities", + "resources_dir": "./resources_test/", + "config": "./src/dataflow/split_modalities/config.vsh.yaml", + "executable": "./target/docker/dataflow/split_modalities/split_modalities", +} +## VIASH END + + +@pytest.fixture +def input_modality_1(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) + ad1 = ad.AnnData(df, obs=obs, var=var) + return ad1 + + +@pytest.fixture +def input_modality_2(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = ad.AnnData(df, obs=obs2, var=var2) + return ad2 + + +@pytest.fixture +def input_h5mu(input_modality_1, input_modality_2): + tmp_mudata = md.MuData({"mod1": input_modality_1, "mod2": input_modality_2}) + return tmp_mudata + + +@pytest.fixture +def input_h5mu_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +@pytest.mark.parametrize("compression", ["gzip", None]) +def test_split( + run_component, + random_path, + input_h5mu, + input_h5mu_path, + input_modality_1, + input_modality_2, + compression, +): + output_dir = random_path() + output_types = random_path(extension="csv") + args = [ + "--input", + input_h5mu_path, + "--output", + str(output_dir), + "--output_types", + str(output_types), + ] + if compression: + args += ["--output_compression", compression] + run_component(args) + assert output_types.is_file() + assert output_dir.is_dir() + + # check output dir + dir_content = [ + h5mu_file + for h5mu_file in output_dir.iterdir() + if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path + ] + mod1_file = output_dir / f"{input_h5mu_path.stem}_mod1.h5mu" + mod2_file = output_dir / f"{input_h5mu_path.stem}_mod2.h5mu" + assert set(dir_content) == set([mod1_file, mod2_file]) + mod1 = md.read_h5mu(mod1_file) + mod2 = md.read_h5mu(mod2_file) + assert mod1.n_mod == 1 + assert mod2.n_mod == 1 + + assert_annotation_objects_equal(mod1.mod["mod1"], input_modality_1) + assert_annotation_objects_equal(mod2.mod["mod2"], input_modality_2) + + assert mod1.n_obs == input_h5mu.n_obs + assert mod2.n_obs == input_h5mu.n_obs + + # When a var_key is only present for one modality, it is prefixed by the name of the + # modality followed by a colon and the name of the key (in the global .var). + replace_regex = r"(^mod1:|^mod2:)" + expected_var_keys = { + re.sub(replace_regex, "", col_name) for col_name in input_h5mu.var_keys() + } + assert set(mod1.var_keys()) | set(mod2.var_keys()) == expected_var_keys + + assert set(mod1.var_keys()) == set(input_h5mu.mod["mod1"].var.columns) + assert set(mod2.var_keys()) == set(input_h5mu.mod["mod2"].var.columns) + + expected_csv_output = dedent( + f"""\ + name,filename + mod1,{mod1_file.name} + mod2,{mod2_file.name} + """ + ) + with open(output_types, "r") as open_csv_file: + result = open_csv_file.read() + assert result == expected_csv_output + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/demux/bcl2fastq/config.vsh.yaml b/src/demux/bcl2fastq/config.vsh.yaml new file mode 100644 index 00000000..d260d9fd --- /dev/null +++ b/src/demux/bcl2fastq/config.vsh.yaml @@ -0,0 +1,59 @@ +name: bcl2fastq +namespace: demux +scope: "public" +description: | + Convert bcl files to fastq files using bcl2fastq. +authors: + - __merge__: /src/authors/toni_verbeiren.yaml + roles: [ author, maintainer ] +arguments: + - name: "--input" + alternatives: [ "-i", "--runfolder_dir" ] + type: file + required: true + description: Input run directory + example: bcl_dir + - name: "--sample_sheet" + alternatives: [ "-s" ] + type: file + required: true + description: Pointer to the sample sheet + example: SampleSheet.csv + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + required: true + description: Output directory containig fastq files + example: fastq_dir + - name: "--reports" + type: file + direction: output + required: false + description: Reports directory + example: reports_dir + - name: "--ignore_missing" + description: | + Interpret missing *.bcl files as no call (N), interpret missing control files as not-set + control bits and fill in with zeros when *.stats files are missing. + type: boolean_true +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/cellranger_tiny_bcl/bcl + +engines: +- type: docker + image: ghcr.io/data-intuitive/bcl2fastq:2.20 + setup: + - type: apt + packages: + - procps +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/demux/bcl2fastq/script.sh b/src/demux/bcl2fastq/script.sh new file mode 100644 index 00000000..e06d142f --- /dev/null +++ b/src/demux/bcl2fastq/script.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -exo pipefail + +extra_params=() + +# Handle reports stored separate +if [ ! -z "$par_reports" ]; then + extra_params+=("--reports-dir" "$par_reports") +fi + +# Handle the boolean flag +if [ "$par_ignore_missing" == "true" ]; then + extra_params+=("--ignore-missing-control" "--ignore-missing-bcl" "--ignore-missing-filter") +fi + +# Run the actual command +bcl2fastq \ + --runfolder-dir "$par_input" \ + --sample-sheet "$par_sample_sheet" \ + --output-dir "$par_output" \ + "${extra_params[@]}" diff --git a/src/demux/bcl2fastq/test.sh b/src/demux/bcl2fastq/test.sh new file mode 100644 index 00000000..b85559e7 --- /dev/null +++ b/src/demux/bcl2fastq/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# We are using the tiny bcl dataset provided by Illumina: +# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq +# Unfortunately, +# 1. the sample sheet delivered with it does not work with bcl2fastq (v1 of the format) +# 2. 2 filter files are missing from the run directory that bcl2fastq requires to run +# +# We worked around this by ignoring all missing entries with --ignore_missing + +echo ">>> Running executable" +"$meta_executable" \ + --input "$meta_resources_dir/bcl" \ + --sample_sheet "$meta_resources_dir/bcl/sample_sheet.csv" \ + --output "fastq" \ + --reports "reports" \ + --ignore_missing + +echo ">>> Checking whether the output dir exists" +[[ ! -d fastq ]] && echo "Output dir could not be found!" && exit 1 + +echo ">>> Checking whether output fastq files are created" +[[ ! -f fastq/Undetermined_S0_L001_R1_001.fastq.gz ]] && echo "Output fastq files could not be found!" && exit 1 + +echo ">>> Checking whether reports files are created" +[[ ! -d reports/html ]] && echo "Output reports files could not be found!" && exit 1 + + + +echo ">>> Running executable without separate reports" +"$meta_executable" \ + --input "$meta_resources_dir/bcl" \ + --sample_sheet "$meta_resources_dir/bcl/sample_sheet.csv" \ + --output "fastq1" \ + --ignore_missing + +echo ">>> Checking whether the output dir exists" +[[ ! -d fastq1 ]] && echo "Output dir could not be found!" && exit 1 + +echo ">>> Checking whether output fastq files are created" +[[ ! -f fastq1/Undetermined_S0_L001_R1_001.fastq.gz ]] && echo "Output fastq files could not be found!" && exit 1 + +echo ">>> Checking whether reports files are created" +[[ ! -d fastq1/Reports/html ]] && echo "Output reports files could not be found!" && exit 1 + + + +# print final message +echo ">>> Test finished successfully" \ No newline at end of file diff --git a/src/demux/bcl_convert/config.vsh.yaml b/src/demux/bcl_convert/config.vsh.yaml new file mode 100644 index 00000000..6c571a6b --- /dev/null +++ b/src/demux/bcl_convert/config.vsh.yaml @@ -0,0 +1,85 @@ +name: bcl_convert +namespace: demux +scope: "public" +description: | + Convert bcl files to fastq files using bcl-convert. + Information about upgrading from bcl2fastq via + https://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html + and https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html +authors: + - __merge__: /src/authors/toni_verbeiren.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/marijke_van_moerbeke.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] +arguments: + - name: "--input" + alternatives: [ "-i" ] + type: file + required: true + description: Input run directory + example: bcl_dir + - name: "--sample_sheet" + alternatives: [ "-s" ] + type: file + required: true + description: Pointer to the sample sheet + example: bcl_dir + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + required: true + description: Output directory containig fastq files + example: fastq_dir + - name: "--reports" + type: file + direction: output + required: false + description: Reports directory + example: reports_dir + - name: "--test_mode" + type: boolean + default: false + description: Should bcl-convert be run in test mode (using --first-tile-only)? + - name: "--strict_mode" + type: boolean + default: false + description: Abort if any files are missing. + - name: "--tiles" + type: string + required: false + description: Process only a subset of tiles by a regular expression. + - name: "--exclude_tiles" + type: string + required: false + description: "Exclude set of tiles by a regular expression" + - name: --no_lane_splitting + type: boolean + required: false + description: Wheter to avoid splitting FASTQ file by lane. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/cellranger_tiny_bcl/bcl2 + +engines: +- type: docker + image: ghcr.io/data-intuitive/bclconvert:4.2 + setup: + - type: apt + packages: + - procps + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/demux/bcl_convert/script.sh b/src/demux/bcl_convert/script.sh new file mode 100644 index 00000000..5067a469 --- /dev/null +++ b/src/demux/bcl_convert/script.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input=work/fc/34b01dbb67178188ce8571b7c5459e/bcl2 +par_output=work/fc/34b01dbb67178188ce8571b7c5459e/foo +par_sample_sheet=work/fc/34b01dbb67178188ce8571b7c5459e/sample_sheet.csv +par_test_mode=false +## VIASH END + +[ -d "$par_output" ] || mkdir -p "$par_output" + +bcl-convert \ + --force \ + --bcl-input-directory "$par_input" \ + --output-directory "$par_output" \ + --sample-sheet "$par_sample_sheet" \ + --first-tile-only "$par_test_mode" \ + --strict-mode "$par_strict_mode" \ + ${par_no_lane_splitting:+--no-lane-splitting "$par_no_lane_splitting"} \ + ${par_tiles:+--tiles $par_tiles} \ + ${par_exclude_tiles:+--exclude-tiles $par_exclude_tiles} + + +if [ ! -z "$par_reports" ]; then + echo "Moving reports to its own location" + mv "$par_output"/Reports "$par_reports" +else + echo "Leaving reports alone" +fi \ No newline at end of file diff --git a/src/demux/bcl_convert/test.sh b/src/demux/bcl_convert/test.sh new file mode 100644 index 00000000..1c912f0e --- /dev/null +++ b/src/demux/bcl_convert/test.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# bcl-convert requires a v2 sample sheet +# bcl-convert is a bit more strict concerning filter files being present or not. +# We make a copy and make the necessary adaptations. +# See workflows/resources_test_scripts/cellranger_tiny_bcl.sh for more information + +# create tempdir +MY_TEMP="${VIASH_TEMP:-/tmp}" +TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +echo ">>> Running executable" +$meta_executable \ + --input "$meta_resources_dir/bcl2" \ + --sample_sheet "$meta_resources_dir/bcl2/sample_sheet.csv" \ + --output "$TMPDIR/output1" \ + --test_mode true +echo ">>> Checking whether the output dir exists" +[[ ! -d "$TMPDIR/output1" ]] && echo "Output dir could not be found!" && exit 1 + +echo ">>> Checking whether output fastq files are created" +[[ ! -f "$TMPDIR/output1/Undetermined_S0_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 +[[ ! -f "$TMPDIR/output1/s1_S1_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 + +echo ">>> Test strict mode" +cp -r "$meta_resources_dir/bcl2" "$TMPDIR/strict_input" +rm "$TMPDIR/strict_input/Data/Intensities/L001/s_1_1101.clocs" + +$meta_executable \ + --input "$TMPDIR/strict_input" \ + --sample_sheet "$meta_resources_dir/bcl2/sample_sheet.csv" \ + --output "$TMPDIR/output2" \ + --strict_mode false + +echo ">>> Checking whether the output dir exists" +[[ ! -d "$TMPDIR/output2" ]] && echo "Output dir could not be found!" && exit 1 + +echo ">>> Checking whether output fastq files are created" +[[ ! -f "$TMPDIR/output2/Undetermined_S0_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 +[[ ! -f "$TMPDIR/output2/s1_S1_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 + +echo ">>> Test no lane splitting argument" +awk '!/NoLaneSplitting/' "$meta_resources_dir/bcl2/sample_sheet.csv" > "$TMPDIR/sample_sheet_without_LaneSplitting.csv" + +$meta_executable \ + --input "$meta_resources_dir/bcl2" \ + --sample_sheet "$TMPDIR/sample_sheet_without_LaneSplitting.csv" \ + --output "$TMPDIR/output3" \ + --test_mode true \ + --no_lane_splitting true + +echo ">>> Checking whether the output dir exists" +[[ ! -d "$TMPDIR/output3" ]] && echo "Output dir could not be found!" && exit 1 + +echo ">>> Checking whether output fastq files with lane splitting are created" + +[[ ! -f "$TMPDIR/output3/Undetermined_S0_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 +[[ ! -f "$TMPDIR/output3/s1_S1_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 + +$meta_executable \ + --input "$meta_resources_dir/bcl2" \ + --sample_sheet "$TMPDIR/sample_sheet_without_LaneSplitting.csv" \ + --output "$TMPDIR/output4" \ + --test_mode true \ + --no_lane_splitting false + +echo ">>> Checking whether output fastq files without lane splitting are created" + +[[ ! -f "$TMPDIR/output4/Undetermined_S0_L001_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 +[[ ! -f "$TMPDIR/output4/s1_S1_L001_R1_001.fastq.gz" ]] && echo "Output fastq files could not be found!" && exit 1 + +# print final message +echo ">>> Test finished successfully" + +# do not remove this +# as otherwise your test might exit with a different exit code +exit 0 diff --git a/src/demux/cellranger_atac_mkfastq/config.vsh.yaml b/src/demux/cellranger_atac_mkfastq/config.vsh.yaml new file mode 100644 index 00000000..c89ed537 --- /dev/null +++ b/src/demux/cellranger_atac_mkfastq/config.vsh.yaml @@ -0,0 +1,81 @@ +name: cellranger_atac_mkfastq +namespace: demux +scope: "public" +description: Demultiplex raw sequencing data for ATAC experiments +usage: | + cellranger_atac_mkfastq \ + --input /path/to/bcl \ + --csv simple_layout_sample_sheet.csv \ + --output /path/to/output +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] +arguments: + - type: file + name: --input + description: Path of Illumina BCL run folder. + example: /path/to/bcl + required: true + - type: file + name: --csv + description: The path to the simple layout sample sheet. + example: SampleSheet.csv + required: true + - type: string + multiple: true + name: --lanes + description: bcl2fastq option. Semicolon-delimited series of lanes to demultiplex. Use this if you have a sample sheet for an entire flow cell but only want to generate a few lanes for further 10x Genomics analysis. + multiple_sep: "," + example: 1,3 + - type: string + multiple: true + name: --use_bases_mask + description: bcl2fastq option. Use to clip extra bases off a read if you ran extra cycles for QC. + multiple_sep: "," + example: y50n,I6n,Y50n + - type: boolean_true + name: --delete_undetermined + description: bcl2fastq option. Delete the Undetermined FASTQs generated by bcl2fastq. Useful if you are demultiplexing a small number of samples from a large flow cell. + - type: integer + name: --barcode_mismatches + description: bcl2fastq option. Use this option to change the number of allowed mismatches per index adapter (0, 1, 2). + default: 1 + min: 0 + - type: file + name: --output + direction: output + description: The folder to store the demux results + example: "/path/to/output" + default: fastqs + required: true + - type: file + name: "--reports" + direction: output + required: false + description: Reports directory + example: reports_dir +resources: + - type: bash_script + path: script.sh + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_atac_tiny_bcl +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger_atac:2.1 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update \ + && apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/demux/cellranger_atac_mkfastq/script.sh b/src/demux/cellranger_atac_mkfastq/script.sh new file mode 100644 index 00000000..335f08e0 --- /dev/null +++ b/src/demux/cellranger_atac_mkfastq/script.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='resources_test/cellranger_atac_tiny_bcl/bcl' +par_csv='resources_test/cellranger_atac_tiny_bcl/bcl/sample_sheet.csv' +par_output=foo + +par_input=`realpath $par_input` +par_sample_sheet=`realpath $par_csv` +par_output=`realpath $par_output` +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="$tmpdir/bcl" + mkdir -p "$input_dir" + tar -xzf "$par_input" -C "$input_dir" --strip-components=1 +else + input_dir="$par_input" +fi + + +if [ ! -z "$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=`python -c "print(int('$meta_memory_gb') - 2)"` +fi + + +echo "Running cellranger-atac mkfastq" + +id=myoutput + +IFS="," +cellranger-atac mkfastq \ + --id "$id" \ + --csv "$par_csv" \ + --run "$par_input" \ + --disable-ui \ + --output-dir "$par_output" \ + ${meta_cpus:+--localcores=$meta_cpus} \ + ${memory_gb:+--localmem=$memory_gb} \ + ${par_lanes:+--lanes=${par_lanes[*]}} \ + ${par_use_bases_mask:+--use-bases-mask=${par_use_bases_mask[*]}} \ + ${par_delete_undetermined:+--delete-undetermined} \ + ${par_barcode_mismatches:+--barcode-mismatches=$par_barcode_mismatches} +unset IFS + +# Move reports to their own output location +if [ ! -z "$par_reports" ]; then + echo "Moving reports its own location" + mv "$par_output"/Reports "$par_reports" +else + echo "Leaving reports alone" +fi diff --git a/src/demux/cellranger_atac_mkfastq/test.py b/src/demux/cellranger_atac_mkfastq/test.py new file mode 100644 index 00000000..a854be12 --- /dev/null +++ b/src/demux/cellranger_atac_mkfastq/test.py @@ -0,0 +1,36 @@ +import sys +from pathlib import Path +import pytest + +## VIASH START +meta = {"name": "cellranger_mkfastq", "resources_dir": "resources_test"} +## VIASH END + +input_dir = Path(meta["resources_dir"]) / "cellranger_atac_tiny_bcl/bcl" +sample_sheet = Path(meta["resources_dir"]) / "cellranger_atac_tiny_bcl/bcl/layout.csv" + + +def test_run(run_component, tmp_path): + output = tmp_path / "output" + + print("Input dir exists: ", input_dir.is_dir()) + print("Input dir content: ", list(input_dir.glob("*"))) + print("Sample sheet exists: ", sample_sheet.is_file()) + + cmd_pars = [ + "--input", + str(input_dir), + "--csv", + str(sample_sheet), + "--output", + str(output), + ] + + run_component(cmd_pars) + + expected_dir: Path = output / "HJN3KBCX2" / "test_sample" + assert expected_dir.is_dir(), "No output was created." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/demux/cellranger_mkfastq/config.vsh.yaml b/src/demux/cellranger_mkfastq/config.vsh.yaml new file mode 100644 index 00000000..fd1eb572 --- /dev/null +++ b/src/demux/cellranger_mkfastq/config.vsh.yaml @@ -0,0 +1,66 @@ +name: cellranger_mkfastq +namespace: demux +scope: "public" +description: Demultiplex raw sequencing data +usage: | + cellranger_mkfastq \ + --input /path/to/bcl \ + --sample_sheet SampleSheet.csv \ + --output /path/to/output +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/samuel_d_souza.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + - type: file + name: --input + description: Path to the (untarred) BCL files. Expects 'RunParameters.xml' at './'. + example: /path/to/bcl + required: true + - type: file + name: --sample_sheet + description: The path to the sample sheet. + example: SampleSheet.csv + required: true + - type: file + name: --output + direction: output + description: The folder to store the demux results + example: "/path/to/output" + default: fastqs + required: true + - name: "--reports" + type: file + direction: output + required: false + description: Reports directory + example: reports_dir +resources: + - type: bash_script + path: script.sh + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_bcl + +engines: +- type: docker + image: ghcr.io/data-intuitive/cellranger:9.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/demux/cellranger_mkfastq/script.sh b/src/demux/cellranger_mkfastq/script.sh new file mode 100644 index 00000000..26d72e4e --- /dev/null +++ b/src/demux/cellranger_mkfastq/script.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='resources_test/cellranger_tiny_bcl/bcl' +par_sample_sheet='resources_test/cellranger_tiny_bcl/bcl/sample_sheet.csv' +par_output=foo +par_input=`realpath $par_input` +par_sample_sheet=`realpath $par_sample_sheet` +par_output=`realpath $par_output` +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="$tmpdir/bcl" + mkdir -p "$input_dir" + tar -xzf "$par_input" -C "$input_dir" --strip-components=1 +else + input_dir="$par_input" +fi + + +# add additional params +extra_params=( ) + +if [ ! -z "$meta_cpus" ]; then + extra_params+=( "--localcores=$meta_cpus" ) +fi +if [ ! -z "$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=`python -c "print(int('$meta_memory_gb') - 2)"` + extra_params+=( "--localmem=$memory_gb" ) +fi + + +echo "Running cellranger demux" + +id=myoutput + +cellranger mkfastq \ + --id "$id" \ + --csv "$par_sample_sheet" \ + --run "$par_input" \ + "${extra_params[@]}" \ + --disable-ui \ + --output-dir "$par_output" + +# Move reports to their own output location +if [ ! -z "$par_reports" ]; then + echo "Moving reports its own location" + mv "$par_output"/Reports "$par_reports" +else + echo "Leaving reports alone" +fi diff --git a/src/demux/cellranger_mkfastq/test.py b/src/demux/cellranger_mkfastq/test.py new file mode 100644 index 00000000..44870f6a --- /dev/null +++ b/src/demux/cellranger_mkfastq/test.py @@ -0,0 +1,32 @@ +import sys +from pathlib import Path +import pytest + +## VIASH START +meta = {"name": "cellranger_mkfastq", "resources_dir": "resources_test"} +## VIASH END + +input = meta["resources_dir"] + "/cellranger_tiny_bcl/bcl" +sample_sheet = meta["resources_dir"] + "/cellranger_tiny_bcl/bcl/sample_sheet.csv" + + +def test_run(run_component, tmp_path): + output = tmp_path / "output" + + cmd_pars = [ + "--input", + input, + "--sample_sheet", + sample_sheet, + "--output", + str(output), + ] + + run_component(cmd_pars) + + expected_dir: Path = output / "H35KCBCXY" / "test_sample" + assert expected_dir.is_dir(), "No output was created." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml new file mode 100644 index 00000000..aa90b260 --- /dev/null +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -0,0 +1,111 @@ +name: differential_expression +namespace: "create_pseudobulk" +scope: "public" +description: | + Generation of pseudobulk samples from single-cell transcriptomics data, + by aggregating raw gene expression counts from individual cells to create + bulk-like expression profiles suitable for differential expression analysis + with methods designed for bulk differential expression analysis. + Note that this componentonly considers factors as explanatory variables, + and excludes covariates from the analysis. + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file. + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used. This layer must contain raw counts." + - name: "--obs_label" + type: string + required: true + description: ".obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters." + - name: "--obs_groups" + type: string + multiple: true + description: ".obs fields containing the experimental condition(s) to create pseudobulk samples for." + + - name: Arguments + arguments: + - name: "--aggregation_method" + type: string + choices: ["sum", "mean"] + default: "sum" + description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean." + - name: "--min_obs_per_sample" + type: integer + min: 1 + default: 30 + description: "Minimum number of cells per pseudobulk sample." + - name: --random_state + type: integer + description: | + The random seed for sampling. + min: 0 + default: 0 + required: false + - name: "--obs_cell_count" + type: string + default: "n_cells" + description: "The .obs field to store the number of observations per pseudobulk sample." + + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file containing the aggregated pseudobulk samples. + direction: output + required: true + - name: "--output_compression" + type: string + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +engines: +- type: docker + image: python:3.12 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowcpu, lowmem] diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py new file mode 100644 index 00000000..a9af50f6 --- /dev/null +++ b/src/differential_expression/create_pseudobulk/script.py @@ -0,0 +1,107 @@ +import numpy as np +import mudata as mu +import pandas as pd +import sys +import scanpy as sc +import scipy.sparse as sp + +## VIASH START +par = { + "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "modality": "rna", + "input_layer": None, + "obs_label": "cell_type", + "obs_groups": ["treatment", "donor_id", "disease"], + "obs_cell_count": "n_cells", + "min_obs_per_sample": 5, + "random_state": 0, + "output": "test.h5mu", + "output_compression": "gzip", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def is_normalized(layer): + if sp.issparse(layer): + row_sums = np.array(layer.sum(axis=1)).flatten() + else: + row_sums = layer.sum(axis=1) + + return np.allclose(row_sums, 1) + + +def count_obs(adata, pb_adata, obs_cols): + counts = [] + for i in range(pb_adata.n_obs): + values = pb_adata.obs.iloc[i][obs_cols] + + mask = pd.Series([True] * adata.n_obs) + for col in obs_cols: + mask &= (adata.obs[col] == values[col]).values + + count = mask.sum() + counts.append(count) + + return counts + + +def main(): + # Read in data + logger.info(f"Reading input data {par['input']}...") + adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy() + + # Make sure .X contains raw counts + if par["input_layer"]: + adata.X = adata.layers[par["input_layer"]] + if is_normalized(adata.X): + raise ValueError("Input layer must contain raw counts.") + + # Sanitize pseudobulk aggregation fields + pseudobulk_cols = [par["obs_label"]] + if par["obs_groups"]: + pseudobulk_cols += par["obs_groups"] + + for col in pseudobulk_cols: + if col not in adata.obs.columns: + raise ValueError(f"Required column '{col}' not found in .obs.") + adata.obs[col] = ( + adata.obs[col] + .astype(str) + .str.replace(" ", "_") + .str.replace("+", "") + .astype("category") + ) + + # Aggregate pseudobulk data per cell group + logger.info("Creating pseudobulk samples...") + adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0) + adata_pb.X = adata_pb.layers["sum"] + del adata_pb.layers["sum"] + + adata_pb.obs_names = [ + "_".join(values) + for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols]) + ] + + # Filter pseudobulk samples based on minimum observation count + logger.info("Filtering pseudobulk samples based on minimum observation count...") + adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) + adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] + + logger.info( + f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" + ) + + logger.info("Writing output data...") + mdata_pb = mu.MuData({"rna": adata_pb}) + mdata_pb.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py new file mode 100644 index 00000000..f40963ee --- /dev/null +++ b/src/differential_expression/create_pseudobulk/test.py @@ -0,0 +1,113 @@ +import os +import mudata as mu +import sys +import pytest + +## VIASH START +meta = {"resources_dir": "resources_test/"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + +input_path = meta["resources_dir"] + "/TS_Blood_filtered.h5mu" + + +def test_simple_execution(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "treatment", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 8, "Expected a total of 8 pseudobulk samples in the output" + + +def test_multiple_factors(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "disease", + "--obs_groups", + "donor_id", + "--obs_groups", + "treatment", + "--min_obs_per_sample", + "5", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 16, ( + "Expected a total of 16 pseudobulk samples in the output" + ) + + +def test_filtering(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "treatment", + "--min_obs_per_sample", + "50", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 4, "Expected a total of 8 pseudobulk samples in the output" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/densmap/config.vsh.yaml b/src/dimred/densmap/config.vsh.yaml new file mode 100644 index 00000000..1282052c --- /dev/null +++ b/src/dimred/densmap/config.vsh.yaml @@ -0,0 +1,183 @@ +name: densmap +namespace: "dimred" +scope: "public" +description: | + A modification of UMAP that adds an extra cost term in order to preserve information + about the relative local density of the data. It is performed on the same inputs as UMAP. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--uns_neighbors" + type: string + default: "neighbors" + description: The `.uns` neighbors slot as output by the `find_neighbors` component. + + - name: "--obsm_pca" + type: string + description: | + The slot in `.obsm` where the PCA results are stored. + required: true + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--obsm_output" + type: string + description: The .obsm key to use for storing the densMAP results.. + default: "X_densmap" + __merge__: [., /src/base/h5_compression_argument.yaml] + + + - name: Arguments UMAP + arguments: + - name: "--min_dist" + type: double + description: | + The effective minimum distance between embedded points. Smaller values will result + in a more clustered/clumped embedding where nearby points on the manifold are drawn + closer together, while larger values will result on a more even dispersal of points. + The value should be set relative to the spread value, which determines the scale at + which embedded points will be spread out. + default: 0.5 + min: 0.0 + max: 10.0 + + - name: "--spread" + type: double + description: | + The effective scale of embedded points. In combination with `min_dist` this + determines how clustered/clumped the embedded points are. + default: 1.0 + min: 0.0 + max: 10.0 + + - name: "--num_components" + type: integer + description: The number of dimensions of the embedding. + default: 2 + min: 1 + + - name: "--max_iter" + type: integer + description: | + The number of iterations (epochs) of the optimization. Called `n_epochs` + in the original UMAP. Default is set to 500 if + neighbors['connectivities'].shape[0] <= 10000, else 200. + default: 0 + min: 0 + max: 1000 + + - name: "--alpha" + type: double + description: The initial learning rate for the embedding optimization. + default: 1.0 + + - name: "--gamma" + type: double + description: | + Weighting applied to negative samples in low dimensional embedding optimization. + Values higher than one will result in greater weight being given to negative samples. + default: 1.0 + + - name: "--negative_sample_rate" + type: integer + description: | + The number of negative samples to select per positive sample + in the optimization process. Increasing this value will result + in greater repulsive force being applied, greater optimization + cost, but slightly more accuracy. + default: 5 + + - name: "--init_pos" + type: string + description: | + How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are: + + * Any key from `.obsm` + * `'paga'`: positions from `paga()` + * `'spectral'`: use a spectral embedding of the graph + * `'random'`: assign initial embedding positions at random. + + default: spectral + choices: [ 'paga', 'spectral', 'random' ] + + - name: Arguments densMAP + arguments: + - name: "--lambda" + type: double + description: | + Controls the regularization weight of the density correlation term in densMAP. + Higher values prioritize density preservation over the UMAP objective, and vice versa + for values closer to zero. Setting this parameter to zero is equivalent to running + the original UMAP algorithm. + default: 2.0 + min: 0.01 + max: 10.0 + + - name: "--fraction" + type: double + description: | + Controls the fraction of epochs (between 0 and 1) where the density-augmented objective + is used in densMAP. The first (1 - dens_frac) fraction of epochs optimize the original + UMAP objective before introducing the density correlation term. + default: 0.3 + + - name: "--var_shift" + type: double + description: | + A small constant added to the variance of local radii in the embedding when calculating + the density correlation objective to prevent numerical instability from dividing by a + small number. + default: 0.1 + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - umap-learn + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [/src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highcpu, midmem, middisk] diff --git a/src/dimred/densmap/script.py b/src/dimred/densmap/script.py new file mode 100644 index 00000000..88a36a22 --- /dev/null +++ b/src/dimred/densmap/script.py @@ -0,0 +1,94 @@ +from umap import UMAP +import mudata as mu +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "output": "output.h5mu", + "obsm_output": "X_densmap", + "lambda": 2.0, + "fraction": 0.3, + "var_shift": 0.1, +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +try: + data = mu.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' not found in the input data." + ) from e + +logger.info("Computing densMAP for modality '%s'", par["modality"]) + +neigh_key = par["uns_neighbors"] + +if neigh_key not in data.uns: + raise ValueError( + f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first." + ) + +temp_uns = {neigh_key: data.uns[neigh_key]} + +if "use_rep" not in temp_uns[neigh_key]["params"]: + raise ValueError( + f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first." + ) + + +X_densmap = UMAP( + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + n_epochs=par["max_iter"], + learning_rate=par["alpha"], + repulsion_strength=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init=par["init_pos"], + metric=data.uns["neighbors"].get("metric", "euclidean"), + metric_kwds=data.uns["neighbors"].get("metric_kwds", {}), + densmap=True, + dens_lambda=par["lambda"], + dens_frac=par["fraction"], + dens_var_shift=par["var_shift"], +).fit_transform(data.obsm[par["obsm_pca"]]) + +logger.info( + f"Writing densMAP embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = X_densmap + +logger.info(f"Writing densMAP metadata to .mod[{par['modality']}].uns['densmap']") +data.uns["densmap"] = { + "params": { + "min_dist": par["min_dist"], + "spread": par["spread"], + "n_components": par["num_components"], + "n_epochs": par["max_iter"], + "learning_rate": par["alpha"], + "repulsion_strength": par["gamma"], + "negative_sample_rate": par["negative_sample_rate"], + "init": par["init_pos"], + "metric": data.uns["neighbors"].get("metric", "euclidean"), + "metric_kwds": data.uns["neighbors"].get("metric_kwds", {}), + "dens_lambda": par["lambda"], + "dens_frac": par["fraction"], + "dens_var_shift": par["var_shift"], + } +} + +logger.info("Writing to %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") diff --git a/src/dimred/densmap/test.py b/src/dimred/densmap/test.py new file mode 100644 index 00000000..eb215320 --- /dev/null +++ b/src/dimred/densmap/test.py @@ -0,0 +1,109 @@ +import sys +import pytest +import subprocess +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +import re + +## VIASH START +meta = { + "executable": "./target/docker/dimred/densmap/densmap", + "resources_dir": "./resources_test/", + "config": "./src/dimred/densmap/config.vsh.yaml", +} +## VIASH END + +input_path = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_densmap(run_component, random_h5mu_path): + output_path = random_h5mu_path() + args = [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--obsm_pca", + "X_pca", + "--output_compression", + "gzip", + ] + run_component(args) + + assert output_path.is_file(), "No output was created." + output_mudata = read_h5mu(output_path) + input_mudata = read_h5mu(input_path) + + # check whether densmap was found and remove for comparison + assert "X_densmap" in output_mudata.mod["rna"].obsm, ( + "Check whether output was found in .obsm" + ) + assert "densmap" in output_mudata.mod["rna"].uns, ( + "Check whether output was found in .uns" + ) + output_mudata.mod["rna"].obsm.pop("X_densmap") + output_mudata.mod["rna"].uns.pop("densmap") + assert_annotation_objects_equal(output_mudata, input_mudata) + + +def test_densmap_custom_obsm_output(run_component, random_h5mu_path): + output_path = random_h5mu_path() + args = [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--obsm_pca", + "X_pca", + "--output_compression", + "gzip", + "--obsm_output", + "X_custom_densmap", + ] + run_component(args) + + assert output_path.is_file(), "No output was created." + output_mudata = read_h5mu(output_path) + input_mudata = read_h5mu(input_path) + + # check whether tsne was found and remove for comparison + assert "X_custom_densmap" in output_mudata.mod["rna"].obsm, ( + "Check whether output was found in .obsm" + ) + assert "densmap" in output_mudata.mod["rna"].uns, ( + "Check whether output was found in .uns" + ) + output_mudata.mod["rna"].obsm.pop("X_custom_densmap") + output_mudata.mod["rna"].uns.pop("densmap") + assert_annotation_objects_equal(output_mudata, input_mudata) + + +def test_densmap_no_neighbors_raise(run_component, random_h5mu_path): + output_path = random_h5mu_path() + args = [ + "--input", + input_path, + "--output", + output_path, + "--obsm_pca", + "X_pca", + "--modality", + "prot", + "--output_compression", + "gzip", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: 'neighbors' was not found in .mod\['prot'\].uns.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/lsi/config.vsh.yaml b/src/dimred/lsi/config.vsh.yaml new file mode 100644 index 00000000..5824aaa9 --- /dev/null +++ b/src/dimred/lsi/config.vsh.yaml @@ -0,0 +1,120 @@ +name: lsi +namespace: "dimred" +scope: "public" +description: | + Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values. Uses the implementation of scipy. +authors: + - __merge__: /src/authors/sarah_ouologuem.yaml + roles: [ contributor ] + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + type: string + default: "atac" + description: On which modality to run LSI on. + required: false + + - name: "--layer" + type: string + description: Use specified layer for expression values. If not specified, uses adata.X. + required: false + + - name: "--var_input" + type: string + description: Column name in .var matrix that will be used to select which genes to run the LSI on. If not specified, uses all features. + required: false + + - name: LSI options + arguments: + - name: "--num_components" + type: integer + default: 50 + description: Number of components to compute. + required: false + min: 2 + + - name: "--scale_embeddings" + type: boolean + default: true + description: Scale embeddings to zero mean and unit variance. + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--obsm_output" + type: string + default: "X_lsi" + description: In which .obsm slot to store the resulting embedding. + required: false + + - name: "--varm_output" + type: string + default: "lsi" + description: In which .varm slot to store the resulting loadings matrix. + required: false + + - name: "--uns_output" + type: string + default: "lsi" + description: In which .uns slot to store the stdev. + required: false + + - name: "--overwrite" + type: boolean_true + description: Allow overwriting .obsm, .varm and .uns slots. + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: ../../utils/subset_vars.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: ../../utils/subset_vars.py + - path: ../../../resources_test/concat_test_data + + +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - pkg-config # Otherwise h5py installation fails, which is required for scanpy + - libhdf5-dev + - gcc + - type: python + __merge__: [../../../src/base/requirements/anndata_mudata.yaml, .] + packages: + - muon~=0.1.6 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: + - highcpu + - highmem + - middisk diff --git a/src/dimred/lsi/script.py b/src/dimred/lsi/script.py new file mode 100644 index 00000000..13fa171d --- /dev/null +++ b/src/dimred/lsi/script.py @@ -0,0 +1,114 @@ +import muon as mu +import mudata as md +from anndata import AnnData +import numpy as np +import sys + + +## VIASH START +par = { + "num_components": 50, # number of components to calculate with SVD + "scale_embeddings": True, # scale embeddings to zero mean and unit variance + "modality": "atac", # on which modality the LSI should be run + "layer": None, # on which layer to run the LSI, if None, will run it on anndata.X + "var_input": None, # column in anndata.var of the highly variable features + "overwrite": True, + "obsm_output": "X_lsi", + "varm_output": "LSI", + "uns_output": "lsi", + "output": "output.h5mu", + "output_compression": "gzip", +} +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from subset_vars import subset_vars + +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +# 1.read in adata +logger.info("Reading %s.", par["input"]) +try: + adata = md.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' was not found in mudata {par['input']}." + ) from e + +# 3. Specify layer +if par["layer"] and par["layer"] not in adata.layers: + raise ValueError( + f"Layer '{par['layer']}' was not found in modality '{par['modality']}'." + ) +layer = adata.X if not par["layer"] else adata.layers[par["layer"]] +adata_input_layer = AnnData(layer, var=adata.var) + + +if not par["layer"]: + logger.info("Using modality '%s' and adata.X for LSI computation", par["modality"]) +else: + logger.info( + "Using modality '%s' and layer '%s' for LSI computation", + par["modality"], + par["layer"], + ) + + +# 4. Subset on highly variable features if applicable +if par["var_input"]: + adata_input_layer = subset_vars(adata_input_layer, par["var_input"]) + + +# 5. Run LSI +logger.info( + "Computing %s LSI components on %s features", + par["num_components"], + adata_input_layer.X.shape[1], +) +mu.atac.tl.lsi( + adata_input_layer, + scale_embeddings=par["scale_embeddings"], + n_comps=par["num_components"], +) + + +# 6. Store output in object +check_exist_dict = { + "obsm_output": ("obsm"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(adata, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(adata, field)[par[parameter_name]] + +adata.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_lsi"] +adata.uns[par["uns_output"]] = adata_input_layer.uns["lsi"] +if par["var_input"]: + adata.varm[par["varm_output"]] = np.zeros( + shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]) + ) + adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = ( + adata_input_layer.varm["LSI"] + ) +else: + adata.varm[par["varm_output"]] = adata_input_layer.varm["LSI"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) + +logger.info("Finished") diff --git a/src/dimred/lsi/test.py b/src/dimred/lsi/test.py new file mode 100644 index 00000000..510d49a7 --- /dev/null +++ b/src/dimred/lsi/test.py @@ -0,0 +1,246 @@ +import sys +import pytest +import subprocess +import mudata as mu +import numpy as np + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/docker/dimred/lsi/lsi", + "config": "./src/dimred/lsi/config.vsh.yaml", +} +## VIASH END + +input_path = f"{meta['resources_dir']}/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + + +""" +Tests: +1. general test +2. test HVF +3. test modality +4. test layer +5. test overwrite +""" + + +@pytest.fixture +def atac_mudata(tmp_path): + mdata = mu.read_h5mu(input_path) + mdata.mod["atac"].layers["counts"] = mdata.mod["atac"].X + mdata.mod["atac"].var["highly_variable"] = np.random.choice( + [True, False], size=mdata.mod["atac"].n_vars + ) + print(mdata) + + mdata.write(tmp_path / "atac_mudata.h5mu") + + return tmp_path / "atac_mudata.h5mu" + + +# 1.general test +def test_lsi(run_component, tmp_path): + output_path = tmp_path / "output_lsi.h5mu" + + cmd_args = [ + "--input", + input_path, + "--output", + str(output_path), + "--obsm_output", + "X_test", + "--num_components", + "30", + ] + run_component(cmd_args) + + assert output_path.is_file() + data = mu.read_h5mu(output_path) + assert "X_test" in data.mod["atac"].obsm + assert data.mod["atac"].obsm["X_test"].shape == (data.mod["atac"].n_obs, 30) + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm + + +# 2.test HVF +def test_select_highly_variable_column(run_component, random_h5mu_path, atac_mudata): + output_path = random_h5mu_path() + + # run component + cmd_args = [ + "--input", + str(atac_mudata), + "--output", + str(output_path), + "--var_input", + "highly_variable", + ] + run_component(cmd_args) + + assert output_path.is_file() + data = mu.read_h5mu(output_path) + assert "X_lsi" in data.mod["atac"].obsm + assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 50) + assert "highly_variable" in data.mod["atac"].var.columns + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm + assert data.mod["atac"].varm["lsi"].shape == (data.mod["atac"].n_vars, 50) + + +def test_highly_variable_column_does_not_exist_raises(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + cmd_args = [ + "--input", + input_path, + "--output", + "output_lsi.h5mu", + "--var_input", + "does_not_exist", + ] + run_component(cmd_args) + + assert ( + "ValueError: Requested to use .var column 'does_not_exist' as a selection of genes, but the column is not available." + in err.value.stdout.decode("utf-8") + ) + + +# 3.test modality +def test_modality_does_not_exist_raises(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + cmd_args = [ + "--input", + input_path, + "--output", + "output_lsi.h5mu", + "--modality", + "does_not_exist", + ] + run_component(cmd_args) + + assert ( + "ValueError: Modality 'does_not_exist' was not found in mudata " + + input_path + + "." + in err.value.stdout.decode("utf-8") + ) + + +# 4.test layer +def test_selecting_input_layer(run_component, atac_mudata, tmp_path): + output_path = tmp_path / "output_lsi.h5mu" + + # run component + cmd_args = [ + "--input", + str(atac_mudata), + "--output", + str(output_path), + "--num_components", + "20", + "--layer", + "counts", + ] + run_component(cmd_args) + + assert output_path.is_file() + data = mu.read_h5mu(output_path) + assert "counts" in data.mod["atac"].layers + assert "X_lsi" in data.mod["atac"].obsm + assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 20) + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm + + +def test_raise_if_input_layer_is_missing(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + cmd_args = [ + "--input", + input_path, + "--output", + "output.h5mu", + "--layer", + "does_not_exist", + ] + run_component(cmd_args) + + assert ( + "ValueError: Layer 'does_not_exist' was not found in modality 'atac'." + in err.value.stdout.decode("utf-8") + ) + + +# 5.test overwrite + + +def test_output_field_already_present_raises(run_component, tmp_path): + output_path = tmp_path / "output_lsi.h5mu" + + # create slots + input_data = mu.read_h5mu(input_path) + input_data.mod["atac"].varm["lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_vars, 50) + ) + input_data.mod["atac"].obsm["X_lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_obs, 50) + ) + input_data.mod["atac"].uns["lsi"] = "test" + tmp_file = tmp_path / "input_data_adjusted.h5mu" + input_data.write_h5mu(tmp_file) + + with pytest.raises(subprocess.CalledProcessError) as err: + cmd_args = [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + run_component(cmd_args) + + assert ( + "ValueError: Requested to create field X_lsi in .obsm for " + "modality atac, but field already exists." in err.value.stdout.decode("utf-8") + ) + + +def test_output_field_already_present_overwrite(run_component, tmp_path): + output_path = tmp_path / "output_lsi.h5mu" + + # create slots + input_data = mu.read_h5mu(input_path) + input_data.mod["atac"].varm["lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_vars, 50) + ) + input_data.mod["atac"].obsm["X_lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_obs, 50) + ) + input_data.mod["atac"].uns["lsi"] = "test" + tmp_file = tmp_path / "input_data_adjusted.h5mu" + input_data.write_h5mu(tmp_file) + + cmd_args = [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--output_compression", + "gzip", + "--overwrite", + "--num_components", + "30", + ] + run_component(cmd_args) + + assert output_path.is_file() + data = mu.read_h5mu(output_path) + assert "X_lsi" in data.mod["atac"].obsm + assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 30) + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/pca/config.vsh.yaml b/src/dimred/pca/config.vsh.yaml new file mode 100644 index 00000000..70452271 --- /dev/null +++ b/src/dimred/pca/config.vsh.yaml @@ -0,0 +1,102 @@ +name: pca +namespace: "dimred" +scope: "public" +description: | + Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11]. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] +arguments: + # inputs + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--layer" + type: string + description: Use specified layer for expression values instead of the .X object from the modality. + required: false + + - name: "--var_input" + type: string + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + example: filter_with_hvg + + # outputs + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--obsm_output" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting embedding." + + - name: "--varm_output" + type: string + default: "pca_loadings" + description: "In which .varm slot to store the resulting loadings matrix." + + - name: "--uns_output" + type: string + default: "pca_variance" + description: "In which .uns slot to store the resulting variance objects." + + # arguments + - name: "--num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. + + - name: "--overwrite" + type: boolean_true + description: "Allow overwriting .obsm, .varm and .uns slots." + +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: + - highcpu + - highmem + - middisk diff --git a/src/dimred/pca/script.py b/src/dimred/pca/script.py new file mode 100644 index 00000000..f43dce22 --- /dev/null +++ b/src/dimred/pca/script.py @@ -0,0 +1,88 @@ +import scanpy as sc +import mudata as mu +import sys +from anndata import AnnData + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "output.h5mu", + "modality": "rna", + "output_key": "pca", + "num_components": 25, + "layer": None, + "obsm_output": "X_pca", + "var_input": "filter_with_hvg", + "varm_output": "varm_output", + "uns_output": "pca_variance", + "overwrite": True, +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing PCA components for modality '%s'", par["modality"]) +if par["layer"] and par["layer"] not in data.layers: + raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.") +layer = data.X if not par["layer"] else data.layers[par["layer"]] +adata_input_layer = AnnData(layer) +adata_input_layer.var.index = data.var.index + +use_highly_variable = False +if par["var_input"]: + if par["var_input"] not in data.var.columns: + raise ValueError( + f"Requested to use .var column {par['var_input']} " + "as a selection of genes to run the PCA on, " + f"but the column is not available for modality {par['modality']}" + ) + use_highly_variable = True + adata_input_layer.var["highly_variable"] = data.var[par["var_input"]] + +# run pca +output_adata = sc.tl.pca( + adata_input_layer, + n_comps=par["num_components"], + copy=True, + use_highly_variable=use_highly_variable, +) + +# store output in specific objects + +check_exist_dict = { + "obsm_output": ("obs"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(data, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(data, field)[par[parameter_name]] + +data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"] +data.varm[par["varm_output"]] = output_adata.varm["PCs"] +data.uns[par["uns_output"]] = { + "variance": output_adata.uns["pca"]["variance"], + "variance_ratio": output_adata.uns["pca"]["variance_ratio"], +} + + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") diff --git a/src/dimred/pca/test.py b/src/dimred/pca/test.py new file mode 100644 index 00000000..a6989c99 --- /dev/null +++ b/src/dimred/pca/test.py @@ -0,0 +1,234 @@ +import sys +import pytest +import subprocess +import mudata as mu +import numpy as np + +## VIASH START +meta = { + "name": "foo", + "resources_dir": "resources_test/", + "executable": "./target/executable/dimred/pca/pca", + "config": "./src/dimred/pca/config.vsh.yaml", +} +## VIASH END + +input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_pca(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--overwrite", + ] + ) + assert output_path.is_file() + data = mu.read_h5mu(output_path) + + # check whether pca was found + assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) + assert "highly_variable" not in data.mod["rna"].var.columns + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_foo" in data.mod["rna"].obsm + # GH298 + assert not np.array_equal( + data.mod["rna"].uns["pca_variance"]["variance"], + data.mod["rna"].uns["pca_variance"]["variance_ratio"], + ) + + +def test_no_overwrite_but_field_also_not_present(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # create input data + input_data = mu.read_h5mu(input_path) + input_data.mod["rna"].uns.pop("pca_variance") + input_data.mod["rna"].varm.pop("pca_loadings") + input_data.mod["rna"].obsm.pop("X_pca") + tmp_file = tmp_path / "input_data_adjusted.h5mu" + input_data.write(tmp_file) + + # run component + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + data = mu.read_h5mu(output_path) + + # check whether pca was found + assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) + assert "highly_variable" not in data.mod["rna"].var.columns + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_foo" in data.mod["rna"].obsm + + +def test_selecting_input_layer(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # generate input data + input_data = mu.read_h5mu(input_path) + input_data.mod["rna"].layers["test"] = input_data.mod["rna"].X + tmp_file = tmp_path / "input_data_adjusted.h5mu" + input_data.write_h5mu(tmp_file) + + # run component + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "test_foo", + "--num_components", + "26", + "--layer", + "test", + "--overwrite", + ] + ) + assert output_path.is_file() + + # check whether pca was found + data = mu.read_h5mu(output_path) + assert "test_foo" in data.mod["rna"].obsm + assert data.mod["rna"].obsm["test_foo"].shape == (data.n_obs, 26) + assert "highly_variable" not in data.mod["rna"].var.columns + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_pca" in data.mod["rna"].obsm + + +def test_highly_variable_column_does_not_exist_raises(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--var_input", + "does_not_exist", + ] + ) + assert ( + "ValueError: Requested to use .var column does_not_exist as " + "a selection of genes to run the PCA on, but the column is " + "not available for modality rna" in err.value.stdout.decode("utf-8") + ) + + +def test_select_highly_variable_column(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # create input data + input_data = mu.read_h5mu(input_path) + input_data.mod["rna"].var["filter_with_hvg"] = True + tmp_file = tmp_path / "input_data_adjusted.h5mu" + input_data.write_h5mu(tmp_file) + + # run component + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--var_input", + "filter_with_hvg", + "--overwrite", + ] + ) + assert output_path.is_file() + + # check whether pca was found + data = mu.read_h5mu(output_path) + assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) + assert "highly_variable" not in data.mod["rna"].var.columns + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_pca" in data.mod["rna"].obsm + + +def test_raise_if_input_layer_is_missing(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--layer", + "does_not_exist", + "--var_input", + "filter_with_hvg", + ] + ) + assert ( + "ValueError: does_not_exist was not found in modality rna." + in err.value.stdout.decode("utf-8") + ) + + +def test_output_field_already_present_raises(run_component): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + ] + ) + assert ( + "ValueError: Requested to create field pca_loadings in .varm for " + "modality rna, but field already exists." in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/tsne/config.vsh.yaml b/src/dimred/tsne/config.vsh.yaml new file mode 100644 index 00000000..57cbbf4d --- /dev/null +++ b/src/dimred/tsne/config.vsh.yaml @@ -0,0 +1,113 @@ +name: tsne +namespace: "dimred" +scope: "public" +description: | + t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique used to visualize high-dimensional data in a low-dimensional space, revealing patterns and clusters by preserving local data similarities. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: true + + - name: "--use_rep" + type: string + description: The `.obsm` slot to use as input for the tSNE computation. + required: true + example: "X_pca" + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--obsm_output" + type: string + description: The .obsm key to use for storing the tSNE results. + default: "X_tsne" + + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + + - name: "--n_pcs" + type: integer + description: The number of principal components to use for the tSNE computation. + default: 50 + required: false + + - name: "--perplexity" + type: double + description: The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. Different values can result in significantly different results. + default: 30.0 + + - name: "--min_dist" + type: double + description: The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out. + default: 0.5 + + - name: "--metric" + type: string + description: Distance metric to calculate neighbors on. + default: euclidean + + - name: "--early_exaggeration" + type: double + description: Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high. + default: 12.0 + + - name: "--learning_rate" + type: double + description: The learning rate for t-SNE optimization. Typical values range between 10.0 and 1000.0. + default: 1000.0 + + - name: "--random_state" + type: integer + description: The random seed to use for the tSNE computation. + required: false + default: 0 + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [highcpu, midmem, middisk] diff --git a/src/dimred/tsne/script.py b/src/dimred/tsne/script.py new file mode 100644 index 00000000..c03360d1 --- /dev/null +++ b/src/dimred/tsne/script.py @@ -0,0 +1,69 @@ +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "use_rep": "X_pca", + "output": "output.h5mu", + "output_compression": "gzip", + "obsm_output": "X_tsne", + "n_pcs": 50, + "perplexity": 30, + "min_dist": 0.5, + "metric": "euclidean", + "early_exaggeration": 12, + "learning_rate": 1000, + "random_state": 0, +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing tSNE for modality '%s'", par["modality"]) +if par["use_rep"] not in data.obsm.keys(): + raise ValueError( + f"'{par['use_rep']}' was not found in .mod['{par['modality']}'].obsm. No precomputed PCA provided. Please run PCA first." + ) +temp_obsm = {par["use_rep"]: data.obsm[par["use_rep"]]} + +temp_adata = ad.AnnData(obsm=temp_obsm, shape=data.shape) + +sc.tl.tsne( + adata=temp_adata, + n_pcs=par["n_pcs"], + use_rep=par["use_rep"], + perplexity=par["perplexity"], + metric=par["metric"], + early_exaggeration=par["early_exaggeration"], + learning_rate=par["learning_rate"], + random_state=par["random_state"], + n_jobs=meta["cpus"], +) + +logger.info( + f"Writing tSNE embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_tsne"] + +logger.info(f"Writing tSNE metadata to .mod[{par['modality']}].uns['tsne']") +data.uns["tsne"] = temp_adata.uns["tsne"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") diff --git a/src/dimred/tsne/test.py b/src/dimred/tsne/test.py new file mode 100644 index 00000000..bd30b396 --- /dev/null +++ b/src/dimred/tsne/test.py @@ -0,0 +1,148 @@ +import sys +import pytest +import subprocess +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +import re + +## VIASH START +meta = { + "executable": "./target/executable/dimred/tsne/tsne", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "./src/dimred/tsne/config.vsh.yaml", +} +## VIASH END + +input_path = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture +def mudata_no_obsm_pca(write_mudata_to_file): + input_mudata = read_h5mu(input_path) + input_mudata.mod["rna"].obsm.pop("X_pca") + return write_mudata_to_file(input_mudata) + + +def test_tsne(run_component, random_h5mu_path): + output_path = random_h5mu_path() + args = [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", + ] + run_component(args) + + assert output_path.is_file(), "No output was created." + output_mudata = read_h5mu(output_path) + input_mudata = read_h5mu(input_path) + + # check whether tsne was found and remove for comparison + assert "X_tsne" in output_mudata.mod["rna"].obsm, ( + "Check whether output was found in .obsm" + ) + assert "tsne" in output_mudata.mod["rna"].uns, ( + "Check whether output was found in .uns" + ) + output_mudata.mod["rna"].obsm.pop("X_tsne") + output_mudata.mod["rna"].uns.pop("tsne") + assert_annotation_objects_equal(output_mudata, input_mudata) + + +def test_tsne_custom_rep_obsm_output(run_component, random_h5mu_path): + input_mudata_custom = read_h5mu(input_path) + input_mudata_custom.mod["rna"].obsm["X_custom_pca"] = input_mudata_custom.mod[ + "rna" + ].obsm["X_pca"] + input_mudata_custom_path = random_h5mu_path() + input_mudata_custom.write_h5mu(input_mudata_custom_path) + + output_path = random_h5mu_path() + args = [ + "--input", + input_mudata_custom_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_custom_pca", + "--obsm_output", + "X_custom_tsne", + "--output_compression", + "gzip", + ] + run_component(args) + + assert output_path.is_file(), "No output was created." + output_mudata = read_h5mu(output_path) + # check whether tsne was found and remove for comparison + assert "X_custom_tsne" in output_mudata.mod["rna"].obsm, ( + "Check whether output was found in .obsm" + ) + assert "tsne" in output_mudata.mod["rna"].uns, ( + "Check whether output was found in .uns" + ) + output_mudata.mod["rna"].obsm.pop("X_custom_tsne") + output_mudata.mod["rna"].uns.pop("tsne") + assert_annotation_objects_equal(output_mudata, input_mudata_custom) + + +def test_tsne_no_pca_in_input_raise( + run_component, random_h5mu_path, mudata_no_obsm_pca +): + output_path = random_h5mu_path() + args = [ + "--input", + mudata_no_obsm_pca, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: 'X_pca' was not found in \.mod\['rna'\]\.obsm\. No precomputed PCA provided\. Please run PCA first\.", + err.value.stdout.decode("utf-8"), + ) + + +def test_tsne_too_many_pcs_raise(run_component, random_h5mu_path): + output_path = random_h5mu_path() + args = [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", + "--n_pcs", + "100", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: X_pca does not have enough Dimensions\. Provide a Representation with equal or more dimensions than`n_pcs` or lower `n_pcs`", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/umap/config.vsh.yaml b/src/dimred/umap/config.vsh.yaml new file mode 100644 index 00000000..1b4b82a9 --- /dev/null +++ b/src/dimred/umap/config.vsh.yaml @@ -0,0 +1,120 @@ +name: umap +namespace: "dimred" +scope: "public" +description: | + UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--uns_neighbors" + type: string + default: "neighbors" + description: The `.uns` neighbors slot as output by the `find_neighbors` component. + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--obsm_output" + type: string + description: The pre/postfix under which to store the UMAP results. + default: "umap" + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--min_dist" + type: double + description: The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out. + default: 0.5 + + - name: "--spread" + type: double + description: The effective scale of embedded points. In combination with `min_dist` this determines how clustered/clumped the embedded points are. + default: 1.0 + + - name: "--num_components" + type: integer + description: The number of dimensions of the embedding. + default: 2 + + - name: "--max_iter" + type: integer + description: The number of iterations (epochs) of the optimization. Called `n_epochs` in the original UMAP. Default is set to 500 if neighbors['connectivities'].shape[0] <= 10000, else 200. + + - name: "--alpha" + type: double + description: The initial learning rate for the embedding optimization. + default: 1.0 + + - name: "--gamma" + type: double + description: Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. + default: 1.0 + + - name: "--negative_sample_rate" + type: integer + description: The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding. + default: 5 + + - name: "--init_pos" + type: string + description: | + How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are: + + * Any key from `.obsm` + * `'paga'`: positions from `paga()` + * `'spectral'`: use a spectral embedding of the graph + * `'random'`: assign initial embedding positions at random. + + default: spectral +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [highcpu, midmem, middisk] diff --git a/src/dimred/umap/script.py b/src/dimred/umap/script.py new file mode 100644 index 00000000..d5054496 --- /dev/null +++ b/src/dimred/umap/script.py @@ -0,0 +1,76 @@ +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "output": "output.h5mu", + "obsm_output": "X_umap", + "min_dist": 0.5, + "spread": 1.0, + "num_components": 2, + "max_iter": None, + "alpha": 1.0, + "gamma": 1.0, + "negative_sample_rate": 5, + "init_pos": "spectral", + "uns_neighbors": "neighbors", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing UMAP for modality '%s'", par["modality"]) + +if par["uns_neighbors"] not in data.uns: + raise ValueError( + f"'{par['uns_neighbors']}' was not found in .mod['{par['modality']}'].uns." + ) + +# create temporary AnnData +# ... because sc.tl.umap doesn't allow to choose +# the obsm output slot +# ... also we can see scanpy is a data format dependency hell +neigh_key = par["uns_neighbors"] +temp_uns = {neigh_key: data.uns[neigh_key]} +conn_key = temp_uns[neigh_key]["connectivities_key"] +dist_key = temp_uns[neigh_key]["distances_key"] +temp_obsp = { + conn_key: data.obsp[conn_key], + dist_key: data.obsp[dist_key], +} +pca_key = temp_uns[neigh_key]["params"]["use_rep"] +temp_obsm = {pca_key: data.obsm[pca_key]} + +temp_adata = ad.AnnData(obsm=temp_obsm, obsp=temp_obsp, uns=temp_uns, shape=data.shape) + +sc.tl.umap( + temp_adata, + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + maxiter=par["max_iter"], + alpha=par["alpha"], + gamma=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init_pos=par["init_pos"], + neighbors_key=neigh_key, +) + +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_umap"] + +logger.info("Writing to %s, compression %s", par["output"], par["output_compression"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +logger.info("Finished") diff --git a/src/dimred/umap/test.py b/src/dimred/umap/test.py new file mode 100644 index 00000000..ed83c305 --- /dev/null +++ b/src/dimred/umap/test.py @@ -0,0 +1,66 @@ +import sys +import pytest +import subprocess +import mudata as mu + +## VIASH START +meta = {"name": "foo", "resources_dir": "/resources_test/pbmc_1k_protein_v3/"} +## VIASH END + +input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_umap(run_component, tmp_path): + output = tmp_path / "output.h5mu" + run_component( + [ + "--input", + input, + "--output", + str(output), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--output_compression", + "gzip", + ] + ) + + assert output.is_file(), "No output was created." + data = mu.read_h5mu(output) + + # check whether umap was found + assert "X_foo" in data.mod["rna"].obsm, "Check whether output was found in .obsm" + assert data.mod["rna"].obsm["X_foo"].shape == ( + data.n_obs, + 26, + ), "Check whether output has correct shape" + + +def test_raise_if_uns_neighbor_is_missing(run_component, tmp_path): + output = tmp_path / "output.h5mu" + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input, + "--output", + str(output), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--uns_neighbors", + "does_not_exist", + ] + ) + assert not output.is_file(), "No output should be created." + assert ( + "ValueError: 'does_not_exist' was not found in .mod['rna'].uns." + in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/download/download_file/config.vsh.yaml b/src/download/download_file/config.vsh.yaml new file mode 100644 index 00000000..fbcf8ad8 --- /dev/null +++ b/src/download/download_file/config.vsh.yaml @@ -0,0 +1,43 @@ +name: "download_file" +namespace: "download" +scope: "public" +status: "deprecated" +description: | + Download a file. +usage: | + download_file \ + --input https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \ + --output output_rna.h5 +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + type: string + description: "URL to a file to download." + required: true + example: "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + - name: "--output" + type: file + example: pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 + required: true + direction: output + description: "Path where to store output." + - name: "--verbose" + alternatives: ["-v"] + type: boolean_true + description: Increase verbosity +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: run_test.sh + +engines: +- type: docker + image: bash:5.1.16 + +runners: +- type: executable +- type: nextflow diff --git a/src/download/download_file/run_test.sh b/src/download/download_file/run_test.sh new file mode 100755 index 00000000..2cf5f944 --- /dev/null +++ b/src/download/download_file/run_test.sh @@ -0,0 +1,14 @@ + + +$meta_executable \ + --input https://raw.githubusercontent.com/scala/scala/2.13.x/NOTICE \ + --output NOTICE + +[ ! -f NOTICE ] && echo Output file could not be found && exit 1 + +if ! grep -q 'Licensed under the Apache License' NOTICE; then + echo Could not find content + exit 1 +fi + +echo Test succeeded! \ No newline at end of file diff --git a/src/download/download_file/script.sh b/src/download/download_file/script.sh new file mode 100644 index 00000000..3b561440 --- /dev/null +++ b/src/download/download_file/script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5' +par_output='pbmc_1k_protein_v3_raw_feature_bc_matrix.h5' +par_verbose='false' +## VIASH END + +extra_params=() + +if [ "$par_verbose" != "true" ]; then + extra_params+=("--quiet") +fi + +wget "$par_input" -O "$par_output" "${extra_params[@]}" diff --git a/src/download/sync_test_resources/config.vsh.yaml b/src/download/sync_test_resources/config.vsh.yaml new file mode 100644 index 00000000..8c88ad31 --- /dev/null +++ b/src/download/sync_test_resources/config.vsh.yaml @@ -0,0 +1,60 @@ +name: "sync_test_resources" +namespace: "download" +scope: "test" +description: Sync test resources to the local filesystem +usage: | + sync_test_resources + sync_test_resources --input _viash.yaml --output . +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: "Path to the _viash.yaml project configuration file." + default: _viash.yaml + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + default: . + direction: output + description: "Path to the directory where the resources will be synced to." + - name: Arguments + arguments: + - name: "--quiet" + type: boolean_true + description: "Displays the operations that would be performed using the specified command without actually running them." + - name: "--dryrun" + type: boolean_true + description: "Does not display the operations performed from the specified command." + - name: "--delete" + type: boolean_true + description: "Files that exist in the destination but not in the source are deleted during sync." + - name: "--exclude" + type: "string" + multiple: true + description: Exclude all files or objects from the command that matches the specified pattern. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: + - type: docker + image: "amazon/aws-cli:2.17.11" + setup: + - type: yum + packages: [wget] + - type: docker + run : | + wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \ + chmod +x /usr/bin/yq +runners: + - type: executable + - type: nextflow diff --git a/src/download/sync_test_resources/script.sh b/src/download/sync_test_resources/script.sh new file mode 100644 index 00000000..29afb29b --- /dev/null +++ b/src/download/sync_test_resources/script.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +## VIASH START +par_input='_viash.yaml' +par_output='.' +## VIASH END + +extra_params=( ) + +if [ "$par_quiet" == "true" ]; then + extra_params+=( "--quiet" ) +fi +if [ "$par_dryrun" == "true" ]; then + extra_params+=( "--dryrun" ) +fi +if [ "$par_delete" == "true" ]; then + extra_params+=( "--delete" ) +fi + +if [ ! -z ${par_exclude+x} ]; then + IFS=";" + for var in $par_exclude; do + unset IFS + extra_params+=( "--exclude" "$var" ) + done +fi + +function sync_s3() { + local s3_path="$1" + local dest_path="$2" + AWS_EC2_METADATA_DISABLED=true \ + aws s3 sync \ + "$s3_path" \ + "$dest_path" \ + --no-sign-request \ + "${extra_params[@]}" +} + +yq e \ + '.info.test_resources[] | "{type: " + (.type // "s3") + ", path: " + .path + ", dest: " + .dest + "}"' \ + "${par_input}" | \ + while read -r line; do + type=$(echo "$line" | yq e '.type') + path=$(echo "$line" | yq e '.path') + dest=$(echo "$line" | yq e '.dest') + + echo "Syncing '$path' to '$dest'..." + + if [ "$type" == "s3" ]; then + sync_s3 "$path" "$par_output/$dest" + fi + done diff --git a/src/download/sync_test_resources/test.sh b/src/download/sync_test_resources/test.sh new file mode 100755 index 00000000..19cf2fdf --- /dev/null +++ b/src/download/sync_test_resources/test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +cat > _viash2.yaml << EOM +info: + test_resources: + - type: s3 + path: s3://openpipelines-data/pbmc_1k_protein_v3 + dest: bar +EOM + +echo ">> Run aws s3 sync" +"$meta_executable" \ + --input _viash2.yaml \ + --output foo \ + --exclude '*.h5' \ + --exclude '*.h5mu' \ + --exclude '*.mtx.gz' \ + --quiet + +echo ">> Check whether the right files were copied" +[ ! -f foo/bar/pbmc_1k_protein_v3_metrics_summary.csv ] && echo csv should have been copied && exit 1 +[ ! -f foo/bar/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz ] && echo barcodes.tsv.gz should have been copied && exit 1 +[ ! -f foo/bar/pbmc_1k_protein_v3_filtered_feature_bc_matrix/features.tsv.gz ] && echo features.tsv.gz should have been copied && exit 1 +[ -f foo/bar/pbmc_1k_protein_v3_filtered_feature_bc_matrix/matrix.mtx.gz ] && echo matrix.mtx.gz should have been excluded && exit 1 +[ -f foo/bar/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 ] && echo h5 should have been excluded && exit 1 +[ -f foo/bar/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu ] && echo h5mu should have been excluded && exit 1 + +echo ">> Check whether content was found" +if ! grep -q 'Estimated Number of Cells' foo/bar/pbmc_1k_protein_v3_metrics_summary.csv; then + echo Could not find content in csv + exit 1 +fi + +echo ">> Test succeeded!" diff --git a/src/feature_annotation/align_query_reference/config.vsh.yaml b/src/feature_annotation/align_query_reference/config.vsh.yaml new file mode 100644 index 00000000..03bb5d73 --- /dev/null +++ b/src/feature_annotation/align_query_reference/config.vsh.yaml @@ -0,0 +1,209 @@ +name: align_query_reference +namespace: feature_annotation +scope: "public" +description: | + Alignment of a query and reference dataset by: + * Alignment of layers + * Harmonization of .obs field names for batch and cell type labels + * Harmonization of .var field name for gene names + * Sanitation of gene names + * Cross-checking of genes + * Assignment of an id to the query and reference datasets + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer ] + +argument_groups: + - name: Inputs + description: Input dataset (query) arguments + arguments: + - name: "--input" + type: file + description: The input (query) data to be labeled. Should be a .h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: Which modality to process. Note that the query and reference modalities should be the same. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: The layer in the input (query) data containing raw counts if .X is not to be used. + - name: "--input_layer_lognormalized" + type: string + required: false + description: The layer in the input (query) data containing log normalized counts if .X is not to be used. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The name of the .var column in the input (query) data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--input_obs_batch" + type: string + required: true + example: sample_id + description: | + The name of the .obs column in the input (query) data containing batch information. + - name: "--input_obs_label" + type: string + required: false + example: cell_type + description: | + The name of the .obs column in the input (query) data containing cell type labels. If not provided, the --unkown_celltype_label will be assigned to all observations. + - name: "--input_id" + type: string + required: false + default: "query" + description: | + Meta id value to be assigned to the --output_obs_id .obs field of the aligned input (query) data. + + - name: Reference + description: Arguments related to the reference dataset. + arguments: + - name: "--reference" + type: file + description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided." + example: reference.h5mu + direction: input + required: false + - name: "--reference_layer" + type: string + description: The layer in the reference data containing raw counts if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_layer_lognormalized" + type: string + description: The layer in the reference data containing log normalized counts if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset. + required: false + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The name of the .var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--reference_obs_batch" + type: string + required: true + example: sample_id + description: | + The name of the .obs column in the reference data containing batch information. + - name: "--reference_obs_label" + type: string + required: false + example: cell_type + description: | + The name of the .obs column in the reference data containing cell type labels. If not provided, the --unkown_celltype_label will be assigned to all observations. + - name: "--reference_id" + type: string + required: false + default: "reference" + description: | + Meta id value to be assigned to the --output_obs_id .obs field of the aligned reference data. + + - name: Outputs + description: Output arguments. + arguments: + - name: "--output_query" + type: file + description: Aligned query data. + direction: output + example: output_query.h5mu + - name: "--output_reference" + type: file + description: Aligned reference data. + direction: output + example: output_reference.h5mu + - name: "--output_layer" + type: string + default: "_counts" + description: Name of the aligned layer containing raw counts in the output query and reference datasets. + - name: "--output_layer_lognormalized" + type: string + default: "_log_normalized" + description: Name of the aligned layer containing log normalized counts in the output query and reference datasets. + - name: "--output_var_gene_names" + type: string + default: "_gene_names" + description: Name of the .var column in the output query and reference datasets containing the gene names. + - name: "--output_obs_batch" + type: string + default: "_sample_id" + description: Name of the .obs column in the output query and reference datasets containing the batch information. + - name: "--output_obs_label" + type: string + default: "_cell_type" + description: Name of the .obs column in the output query and reference datasets containing the cell type labels. + - name: "--output_obs_id" + type: string + default: "_dataset" + description: Name of the .obs column in the output query and reference datasets containing the dataset id. + - name: "--output_var_index" + type: string + default: "_ori_var_index" + description: Name of the .var column to which the .var index of the --input and --reference datasets is stored. Only relevant if "--preserve_var_index" is False. + - name: "--output_var_common_genes" + type: string + default: "_common_vars" + description: Name of the .var column in the output query and reference datasets containing the boolean array indicating the common variables. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + description: Arguments related to the alignment of the input and reference datasets. + arguments: + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + - name: "--align_layers_raw_counts" + type: boolean + default: true + description: Whether to align the query and reference layers containing raw counts. + - name: "--align_layers_lognormalized_counts" + type: boolean_true + description: Whether to align the query and reference layers containing log normalized counts. + - name: "--unkown_celltype_label" + type: string + default: "Unknown" + description: | + The label to assign to cells with an unknown cell type. + - name: "--overwrite_existing_key" + type: boolean_true + description: If set to true and the layer, obs or var key already exists in the query/reference file, the key will be overwritten. + - name: "--preserve_var_index" + type: boolean_true + description: | + If set to true, the .var index of the --input and --reference datasets will be preserved. + If set to false (default behavior), the original .var index will be stored in the --output_var_index .var column and the .var index will be replaced with the sanitized & aligned gene names. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/cross_check_genes.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, midmem] diff --git a/src/feature_annotation/align_query_reference/script.py b/src/feature_annotation/align_query_reference/script.py new file mode 100644 index 00000000..d81f334f --- /dev/null +++ b/src/feature_annotation/align_query_reference/script.py @@ -0,0 +1,294 @@ +import sys +import mudata as mu + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_layer": None, + "input_var_gene_names": None, + "input_obs_batch": "sample_id", + "input_obs_label": None, + "input_id": "query", + "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "reference_layer": None, + "reference_var_gene_names": "ensemblid", + "reference_obs_batch": "donor_id", + "reference_obs_label": "cell_ontology_class", + "reference_id": "reference", + "output_query": "pbmc_1k_protein_v3_mms_ligned.h5mu", + "output_reference": "TS_Blood_filtered_aligned.h5mu", + "output_layer": "_counts", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "input_reference_gene_overlap": 100, + "unkown_celltype_label": "Unknown", + "overwrite_existing_key": False, + "output_compression": None, + "preserve_var_index": False, + "output_var_index": "_ori_var_index", + "output_var_common_genes": "_common_vars", + "align_layers_raw_counts": True, + "align_layers_lognormalized_counts": False, +} + +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from cross_check_genes import cross_check_genes + +logger = setup_logger() + + +def copy_layer(adata, input_layer, output_layer, overwrite=False): + if output_layer not in adata.layers.keys() and input_layer: + logger.info(f"Copying layer from `{input_layer}` to `{output_layer}`") + adata.layers[output_layer] = adata.layers[input_layer].copy() + + elif output_layer not in adata.layers.keys() and not input_layer: + logger.info(f"Copying layer from .X to `{output_layer}`") + adata.layers[output_layer] = adata.X.copy() + + else: + if not overwrite: + raise ValueError( + f"Layer `{output_layer}` already exists. Data can not be copied." + ) + logger.warning(f"Layer `{output_layer}` already exists. Overwriting data.") + adata.layers[output_layer] = ( + adata.layers[input_layer].copy() if input_layer else adata.X.copy() + ) + + return adata + + +def copy_obs(adata, input_obs_key, output_obs_key, overwrite=False, fill_value=None): + if output_obs_key not in adata.obs.keys() and input_obs_key: + logger.info(f"Copying .obs field from `{input_obs_key}` to `{output_obs_key}`") + adata.obs[output_obs_key] = adata.obs[input_obs_key].copy() + + elif output_obs_key not in adata.obs.keys() and not input_obs_key: + logger.info(f"Assigning fill value `{fill_value}` to .obs {output_obs_key}") + adata.obs[output_obs_key] = fill_value + + else: + if not overwrite: + raise ValueError( + f".obs key `{output_obs_key}` already exists. Data can not be copied." + ) + + logger.warning(f".obs key `{output_obs_key}` already exists. Overwriting data.") + adata.obs[output_obs_key] = ( + adata.obs[input_obs_key].copy() if input_obs_key else fill_value + ) + + return adata + + +def copy_and_sanitize_var_gene_names( + adata, + input_var_key, + output_var_key, + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field="ori_var_index", +): + if output_var_key not in adata.var.keys() and input_var_key: + logger.info(f"Copying .var field from `{input_var_key}` to `{output_var_key}`") + adata.var[output_var_key] = adata.var[input_var_key].str.replace( + "\\.[0-9]+$", "", regex=True + ) + + elif output_var_key not in adata.var.keys() and not input_var_key: + logger.info(f"Copying .var index to `{output_var_key}`") + adata.var[output_var_key] = adata.var.index.str.replace( + "\\.[0-9]+$", "", regex=True + ) + + else: + if not overwrite: + raise ValueError( + f".var key `{output_var_key}` already exists. Data can not be copied." + ) + + logger.warning(f".var key `{output_var_key}` already exists. Overwriting data.") + adata.var[output_var_key] = ( + adata.var[input_var_key].str.replace("\\.[0-9]+$", "", regex=True) + if input_var_key + else adata.var.index.str.replace("\\.[0-9]+$", "", regex=True) + ) + + if not preserve_index: + logger.info("Replacing .var index with sanitized gene names...") + adata.var[var_index_field] = adata.var.index + adata.var.index = list(adata.var[output_var_key]) + + return adata + + +def main(): + logger.info("Reading query and reference data") + input_adata = mu.read_h5ad(par["input"], mod=par["modality"]) + input_modality = input_adata.copy() + + reference_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + reference_modality = reference_adata.copy() + + # Aligning layers + logger.info("### Aligning layers") + + if par["align_layers_lognormalized_counts"] and par["align_layers_raw_counts"]: + if par["input_layer"] == par["input_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the query data can not be identical." + ) + + if par["reference_layer"] == par["reference_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the reference data can not be identical." + ) + + if par["align_layers_raw_counts"]: + logger.info("## Copying query layer raw counts...") + input_modality = copy_layer( + input_modality, + par["input_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layer raw counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + + if par["align_layers_lognormalized_counts"]: + logger.info("## Copying query layer lognormalized counts...") + input_modality = copy_layer( + input_modality, + par["input_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layerlognormalized counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + + # Aligning batch labels + logger.info("### Aligning batch labels") + logger.info("## Copying query .obs batch field...") + input_modality = copy_obs( + input_modality, + par["input_obs_batch"], + par["output_obs_batch"], + par["overwrite_existing_key"], + fill_value=None, + ) + logger.info("## Copying reference .obs batch field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_batch"], + par["output_obs_batch"], + overwrite=par["overwrite_existing_key"], + fill_value=None, + ) + + # Aligning cell type labels + logger.info("### Aligning cell type labels") + logger.info("## Copying query .obs cell type label field...") + input_modality = copy_obs( + input_modality, + par["input_obs_label"], + par["output_obs_label"], + par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + logger.info("## Copying reference .obs cell type label field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_label"], + par["output_obs_label"], + overwrite=par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + + # Aligning and sanitizing gene names + logger.info("### Aligning and sanitizing gene names") + logger.info("## Copying query .var gene names field...") + input_modality = copy_and_sanitize_var_gene_names( + input_modality, + par["input_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + logger.info("## Copying reference .var gene names field...") + reference_modality = copy_and_sanitize_var_gene_names( + reference_modality, + par["reference_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + + # Cross check genes + logger.info("### Cross checking genes") + common_vars = cross_check_genes( + input_modality.var[par["output_var_gene_names"]], + reference_modality.var[par["output_var_gene_names"]], + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + # Add common vars to the output + input_modality.var[par["output_var_common_genes"]] = input_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + reference_modality.var[par["output_var_common_genes"]] = reference_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + + # Adding an id to the query and reference datasets + logger.info("### Adding an id to the query and reference datasets") + logger.info( + f"## Adding `{par['input_id']}` to the .obs `{par['output_obs_id']}` field of the query dataset..." + ) + input_modality.obs[par["output_obs_id"]] = par["input_id"] + logger.info( + f"## Adding `{par['reference_id']}` to the .obs `{par['output_obs_id']}` field of the reference dataset..." + ) + reference_modality.obs[par["output_obs_id"]] = par["reference_id"] + + # Writing output data + logger.info("Writing output data") + write_h5ad_to_h5mu_with_compression( + par["output_query"], + par["input"], + par["modality"], + input_modality, + par["output_compression"], + ) + + write_h5ad_to_h5mu_with_compression( + par["output_reference"], + par["reference"], + par["modality"], + reference_modality, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() diff --git a/src/feature_annotation/align_query_reference/test.py b/src/feature_annotation/align_query_reference/test.py new file mode 100644 index 00000000..6213117d --- /dev/null +++ b/src/feature_annotation/align_query_reference/test.py @@ -0,0 +1,541 @@ +import sys +import os +import pytest +import subprocess +import re +import numpy as np +import mudata as mu + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" +reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" + + +@pytest.fixture +def copy_layer(random_h5mu_path): + def wrapper(input_mudata_file, output_layer_name): + input_mudata = mu.read_h5mu(input_mudata_file) + input_adata = input_mudata.mod["rna"] + input_adata.layers[output_layer_name] = input_adata.X.copy() + output_mudata_file = random_h5mu_path() + input_mudata.write_h5mu(output_mudata_file) + return output_mudata_file + + return wrapper + + +@pytest.fixture +def add_var(random_h5mu_path): + def wrapper(input_mudata_file, var_field): + input_mudata = mu.read_h5mu(input_mudata_file) + input_adata = input_mudata.mod["rna"] + input_adata.var[var_field] = "fill_value" + output_mudata_file = random_h5mu_path() + input_mudata.write_h5mu(output_mudata_file) + return output_mudata_file + + return wrapper + + +@pytest.fixture +def add_obs(random_h5mu_path): + def wrapper(input_mudata_file, obs_field): + input_mudata = mu.read_h5mu(input_mudata_file) + input_adata = input_mudata.mod["rna"] + input_adata.obs[obs_field] = "fill_value" + output_mudata_file = random_h5mu_path() + input_mudata.write_h5mu(output_mudata_file) + return output_mudata_file + + return wrapper + + +def test_simple_execution(run_component, random_h5mu_path): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + ] + ) + + assert os.path.exists(output_query_file), "Output query file does not exist" + assert os.path.exists(output_reference_file), "Output reference file does not exist" + + query_adata = mu.read_h5mu(output_query_file).mod["rna"] + reference_adata = mu.read_h5mu(output_reference_file).mod["rna"] + + expected_layers = ["_counts"] + expected_var = ["_gene_names", "_ori_var_index", "_common_vars"] + expected_obs = ["_sample_id", "_cell_type", "_dataset"] + + # Evaluate presence of obs, var, layers + assert all(key in list(query_adata.obs) for key in expected_obs), ( + f"Query obs columns should be: {expected_obs}, found: {query_adata.obs.keys()}." + ) + + assert all(key in list(query_adata.var) for key in expected_var), ( + f"Query var columns should be: {expected_var}, found: {query_adata.var.keys()}." + ) + assert all(key in list(query_adata.layers) for key in expected_layers), ( + f"Query layers should be: {expected_layers}, found: {query_adata.layers.keys()}." + ) + + assert all(key in list(reference_adata.obs) for key in expected_obs), ( + f"Reference obs columns should be: {expected_obs}, found: {reference_adata.obs.keys()}." + ) + + assert all(key in list(reference_adata.var) for key in expected_var), ( + f"Reference var columns should be: {expected_var}, found: {reference_adata.var.keys()}." + ) + assert all(key in list(reference_adata.layers) for key in expected_layers), ( + f"Reference layers should be: {expected_layers}, found: {reference_adata.layers.keys()}." + ) + + # Evaluate values in obs, var, layers + assert np.all(query_adata.var["_gene_names"] == query_adata.var.index), ( + "Query .var _gene_names should be equal to query .var index" + ) + assert np.all(query_adata.obs["_sample_id"] == query_adata.obs["sample_id"]), ( + "Query .obs _sample_id should be equal to query .obs sample_id" + ) + assert np.all(query_adata.obs["_cell_type"] == "Unknown"), ( + "Query .obs _cell_type should have value Unkown" + ) + assert np.all(query_adata.obs["_dataset"] == "query"), ( + "Query .obs _dataset should have value query" + ) + + assert np.all(reference_adata.var["_gene_names"] == reference_adata.var.index), ( + "Reference .var _gene_names should be equal to query .var index" + ) + + assert np.all(reference_adata.var.index != reference_adata.var["ensemblid"]), ( + "Reference .var index should not be equal to reference .var ensemblid" + ) + + assert np.all( + reference_adata.var.index + == [re.sub("\\.[0-9]+$", "", s) for s in reference_adata.var["ensemblid"]] + ), "Reference .var index should be equal to sanitized reference .var ensemblid " + + assert np.any( + reference_adata.var["_ori_var_index"] != reference_adata.var["ensemblid"] + ), ( + "Reference .var _ori_var_index should not be all equal to reference .var ensemblid" + ) + + assert np.all(reference_adata.var["_ori_var_index"] != reference_adata.var.index), ( + "Reference .var _ori_var_index should be equal to reference .var index" + ) + + assert np.all( + reference_adata.obs["_sample_id"] == reference_adata.obs["donor_id"] + ), "Reference .obs _sample_id should be equal to query .obs sample_id" + assert np.all( + reference_adata.obs["_cell_type"] == reference_adata.obs["cell_ontology_class"] + ), "Reference .obs _cell_type should be equal to reference .obs cell_ontology_class" + assert np.all(reference_adata.obs["_dataset"] == "reference"), ( + "Reference .obs _dataset should have value reference" + ) + + +def test_align_multiple_layers(run_component, random_h5mu_path): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--align_layers_lognormalized_counts", + "True", + "--input_layer_lognormalized", + "log_normalized", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_layer_lognormalized", + "log_normalized", + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + ] + ) + + query_adata = mu.read_h5mu(output_query_file).mod["rna"] + reference_adata = mu.read_h5mu(output_reference_file).mod["rna"] + + expected_layers = ["_counts", "_log_normalized"] + + assert all(key in list(query_adata.layers) for key in expected_layers), ( + f"Query layers should be: {expected_layers}, found: {query_adata.layers.keys()}." + ) + assert all(key in list(reference_adata.layers) for key in expected_layers), ( + f"Reference layers should be: {expected_layers}, found: {reference_adata.layers.keys()}." + ) + + args_identical_query_layers = [ + "--input", + input_file, + "--modality", + "rna", + "--align_layers_lognormalized_counts", + "True", + "--reference_layer_lognormalized", + "--log_normalized", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_identical_query_layers) + assert re.search( + r"Layer names for raw and lognormalized counts in the query data can not be identical.", + err.value.stdout.decode("utf-8"), + ) + args_identical_query_layers = [ + "--input", + input_file, + "--modality", + "rna", + "--align_layers_lognormalized_counts", + "True", + "--input_layer_lognormalized", + "--log_normalized", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_identical_query_layers) + assert re.search( + r"Layer names for raw and lognormalized counts in the reference data can not be identical.", + err.value.stdout.decode("utf-8"), + ) + + +def test_copy_layer(run_component, random_h5mu_path, copy_layer): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + input_file_with_layer = copy_layer(input_file, "copied_counts") + + run_component( + [ + "--input", + input_file_with_layer, + "--input_layer", + "copied_counts", + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + ] + ) + + query_adata = mu.read_h5mu(output_query_file).mod["rna"] + + assert "_counts" in query_adata.layers.keys(), ( + f"_counts layer should be present, found: {query_adata.layers.keys()}." + ) + + args_existing_layer = [ + "--input", + input_file_with_layer, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_layer", + "copied_counts", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_existing_layer) + assert re.search( + r"ValueError: Layer `copied_counts` already exists. Data can not be copied.", + err.value.stdout.decode("utf-8"), + ) + + disable_raise_args = [ + "--input", + input_file_with_layer, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_layer", + "copied_counts", + "--overwrite_existing_key", + "True", + ] + + run_component(disable_raise_args) + + +def test_overwrite_obs(run_component, random_h5mu_path, add_obs): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + + input_file_with_obs = add_obs(input_file, "Obs") + + args = [ + "--input", + input_file_with_obs, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_obs_batch", + "Obs", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: .obs key `Obs` already exists. Data can not be copied.", + err.value.stdout.decode("utf-8"), + ) + + disable_raise_args = [ + "--input", + input_file, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_obs_batch", + "Obs--overwrite_existing_key", + "True", + ] + + run_component(disable_raise_args) + + +def test_overwrite_var(run_component, random_h5mu_path, add_var): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + + input_file_with_obs = add_var(input_file, "Var") + + args = [ + "--input", + input_file_with_obs, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_var_gene_names", + "Var", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: .var key `Var` already exists. Data can not be copied.", + err.value.stdout.decode("utf-8"), + ) + + disable_raise_args = [ + "--input", + input_file, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--output_var_gene_names", + "Var", + "--overwrite_existing_key", + "True", + ] + + run_component(disable_raise_args) + + +def test_preserve_var_index(run_component, random_h5mu_path): + output_query_file = random_h5mu_path() + output_reference_file = random_h5mu_path() + # input_file_transformed = copy_layer(input_file, "copied_counts") + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_obs_batch", + "sample_id", + "--reference", + reference_file, + "--reference_obs_batch", + "donor_id", + "--reference_obs_label", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_query", + output_query_file, + "--output_reference", + output_reference_file, + "--preserve_var_index", + ] + ) + + ori_query_adata = mu.read_h5mu(input_file).mod["rna"] + ori_reference_adata = mu.read_h5mu(reference_file).mod["rna"] + query_adata = mu.read_h5mu(output_query_file).mod["rna"] + reference_adata = mu.read_h5mu(output_reference_file).mod["rna"] + + assert "_ori_var_index" not in query_adata.var.keys() + assert "_ori_var_index" not in reference_adata.var.keys() + + assert np.all(ori_query_adata.var.index == query_adata.var.index), ( + "Query .var index should be preserved" + ) + + assert np.all(ori_reference_adata.var.index == reference_adata.var.index), ( + "Reference .var index should be preserved" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml b/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml new file mode 100644 index 00000000..9efad017 --- /dev/null +++ b/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml @@ -0,0 +1,139 @@ +name: highly_variable_features_scanpy +namespace: feature_annotation +scope: "public" +description: | + Annotate highly variable features [Satija15] [Zheng17] [Stuart19]. + + Expects logarithmized data, except when flavor='seurat_v3' in which count data is expected. + + Depending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19]. + + For the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected. + + For [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, contributor ] +arguments: + # input + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--layer" + type: string + description: use adata.layers[layer] for expression values instead of adata.X. + required: false + - name: "--var_input" + type: string + description: | + If specified, use boolean array in adata.var[var_input] to calculate hvg on subset of vars. + required: false + # output + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + + - name: "--var_name_filter" + type: string + default: "filter_with_hvg" + description: In which .var slot to store a boolean array corresponding to which observations should be filtered out. + + - name: "--varm_name" + type: string + default: "hvg" + description: In which .varm slot to store additional metadata. + + # arguments + - name: "--flavor" + type: string + default: "seurat" + choices: ["seurat", "cell_ranger", "seurat_v3"] + description: | + Choose the flavor for identifying highly variable features. For the dispersion based methods + in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features. + + - name: "--n_top_features" + type: integer + description: Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'. + + - name: "--min_mean" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 0.0125 + + - name: "--max_mean" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 3 + + - name: "--min_disp" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 0.5 + + - name: "--max_disp" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. Default is +inf. + # default: "+inf" + + - name: "--span" + type: double + description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. + default: 0.3 + + - name: "--n_bins" + type: integer + description: Number of bins for binning the mean feature expression. Normalization is done with respect to each bin. If just a single feature falls into a bin, the normalized dispersion is artificially set to 1. + default: 20 + + - name: "--obs_batch_key" + type: string + description: | + If specified, highly-variable features are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific features and acts as a lightweight batch correction method. + For all flavors, features are first sorted by how many batches they are a HVG. For dispersion-based flavors + ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across + batches) rank based on within-batch normalized variance. + +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/subset_vars.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - scikit-misc + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] diff --git a/src/feature_annotation/highly_variable_features_scanpy/script.py b/src/feature_annotation/highly_variable_features_scanpy/script.py new file mode 100644 index 00000000..6e5c36cd --- /dev/null +++ b/src/feature_annotation/highly_variable_features_scanpy/script.py @@ -0,0 +1,181 @@ +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys +import re + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "var_name_filter": "filter_with_hvg", + "do_subset": False, + "flavor": "seurat", + "n_top_features": None, + "min_mean": 0.0125, + "max_mean": 3.0, + "min_disp": 0.5, + "span": 0.3, + "n_bins": 20, + "varm_name": "hvg", + "obs_batch_key": "batch", + "layer": "log_transformed", +} + +meta = {"resources_dir": "."} + +mu_in = mu.read_h5mu( + "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) +rna_in = mu_in.mod["rna"] +assert "filter_with_hvg" not in rna_in.var.columns +log_transformed = sc.pp.log1p(rna_in, copy=True) +rna_in.layers["log_transformed"] = log_transformed.X +rna_in.uns["log1p"] = log_transformed.uns["log1p"] +temp_h5mu = "lognormed.h5mu" +rna_in.obs["batch"] = "A" +column_index = rna_in.obs.columns.get_indexer(["batch"]) +rna_in.obs.iloc[slice(rna_in.n_obs // 2, None), column_index] = "B" +rna_in.var["common_vars"] = False +rna_in.var["common_vars"].iloc[:10000] = True +mu_in.write_h5mu(temp_h5mu) +par["input"] = temp_h5mu +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() +modality_name = par["modality"] +data = mu.read_h5ad(par["input"], mod=modality_name) +assert data.var_names.is_unique, "Expected var_names of input modality to be unique." + +logger.info("Processing modality '%s'", modality_name) + +if par["layer"] and par["layer"] not in data.layers: + raise ValueError( + f"Layer '{par['layer']}' not found in layers for modality '{modality_name}'. " + f"Found layers are: {','.join(data.layers)}" + ) + +# input layer argument does not work when batch_key is specified because +# it still uses .X to filter out genes with 0 counts, even if .X might not exist. +# So create a custom anndata as input that always uses .X +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] +obs = pd.DataFrame(index=data.obs_names.copy()) +var = pd.DataFrame(index=data.var_names.copy()) +if par["obs_batch_key"]: + obs = data.obs.loc[:, par["obs_batch_key"]].to_frame() +input_anndata = ad.AnnData(X=input_layer.copy(), obs=obs, var=var) +if "log1p" in data.uns: + input_anndata.uns["log1p"] = data.uns["log1p"] + +# Workaround for issue +# https://github.com/scverse/scanpy/issues/2239 +# https://github.com/scverse/scanpy/issues/2181 +if par["flavor"] != "seurat_v3": + # This component requires log normalized data when flavor is not seurat_v3 + # We assume that the data is correctly normalized but scanpy will look at + # .uns to check the transformations performed on the data. + # To prevent scanpy from automatically tranforming the counts when they are + # already transformed, we set the appropriate values to .uns. + if "log1p" not in input_anndata.uns: + logger.warning( + "When flavor is not set to 'seurat_v3', " + "the input data for this component must be log-transformed. " + "However, the 'log1p' dictionairy in .uns has not been set. " + "This is fine if you did not log transform your data with scanpy." + "Otherwise, please check if you are providing log transformed " + "data using --layer." + ) + input_anndata.uns["log1p"] = {"base": None} + elif "log1p" in input_anndata.uns and "base" not in input_anndata.uns["log1p"]: + input_anndata.uns["log1p"]["base"] = None + +# Enable calculating the HVG only on a subset of vars +# e.g for cell type annotation, only calculate HVG on variables that are common between query and reference +if par["var_input"]: + input_anndata.var[par["var_input"]] = data.var[par["var_input"]] + input_anndata = subset_vars(input_anndata, par["var_input"]) + +logger.info("\tUnfiltered data: %s", data) + +logger.info("\tComputing hvg") +# construct arguments +hvg_args = { + "adata": input_anndata, + "n_top_genes": par["n_top_features"], + "min_mean": par["min_mean"], + "max_mean": par["max_mean"], + "min_disp": par["min_disp"], + "span": par["span"], + "n_bins": par["n_bins"], + "flavor": par["flavor"], + "subset": False, + "inplace": False, + "layer": None, # Always uses .X because the input layer was already handled +} + +optional_parameters = { + "max_disp": "max_disp", + "obs_batch_key": "batch_key", + "n_top_genes": "n_top_features", +} +# only add parameter if it's passed +for par_name, dest_name in optional_parameters.items(): + if par.get(par_name): + hvg_args[dest_name] = par[par_name] + +# scanpy does not do this check, although it is stated in the documentation +if par["flavor"] == "seurat_v3" and not par["n_top_features"]: + raise ValueError( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'." + ) + +# call function +try: + out = sc.pp.highly_variable_genes(**hvg_args) + if par["var_input"] is not None: + out.index = input_anndata.var.index + out = out.reindex(index=data.var.index, method=None) + out.highly_variable = out.highly_variable.fillna(False) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + elif par["obs_batch_key"] is not None: + out = out.reindex(index=data.var.index, method=None) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + +except ValueError as err: + if str(err) == "cannot specify integer `bins` when input data contains infinity": + err.args = ( + "Cannot specify integer `bins` when input data contains infinity. " + "Perhaps input data has not been log normalized?", + ) + if re.search("Bin edges must be unique:", str(err)): + raise RuntimeError( + "Scanpy failed to calculate hvg. The error " + "returned by scanpy (see above) could be the " + "result from trying to use this component on unfiltered data." + ) from err + raise err + +out.index = data.var.index +logger.info("\tStoring output into .var") +if par.get("var_name_filter", None) is not None: + data.var[par["var_name_filter"]] = out["highly_variable"] + +if par.get("varm_name", None) is not None and "mean_bin" in out: + # drop mean_bin as mudata/anndata doesn't support tuples + data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) + +logger.info("Writing h5mu to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], modality_name, data, par["output_compression"] +) diff --git a/src/feature_annotation/highly_variable_features_scanpy/test.py b/src/feature_annotation/highly_variable_features_scanpy/test.py new file mode 100644 index 00000000..e2b4cc0f --- /dev/null +++ b/src/feature_annotation/highly_variable_features_scanpy/test.py @@ -0,0 +1,317 @@ +import os +import subprocess +import scanpy as sc +import mudata as mu +import sys +import pytest +import re +import pandas as pd + +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "resources_dir": "resources_test/", + "config": "./src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml", + "executable": "./target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_data(input_path): + mu_in = mu.read_h5mu(input_path) + return mu_in + + +@pytest.fixture +def lognormed_test_data(input_data): + rna_in = input_data.mod["rna"] + assert "filter_with_hvg" not in rna_in.var.columns + log_transformed = sc.pp.log1p(rna_in, copy=True) + rna_in.layers["log_transformed"] = log_transformed.X + rna_in.uns["log1p"] = log_transformed.uns["log1p"] + return input_data + + +@pytest.fixture +def lognormed_test_data_path(tmp_path, lognormed_test_data): + temp_h5mu = tmp_path / "lognormed.h5mu" + lognormed_test_data.write_h5mu(temp_h5mu) + return temp_h5mu + + +@pytest.fixture +def lognormed_batch_test_data_path(tmp_path, lognormed_test_data): + temp_h5mu = tmp_path / "lognormed_batch.h5mu" + rna_mod = lognormed_test_data.mod["rna"] + rna_mod.obs["batch"] = "A" + column_index = rna_mod.obs.columns.get_indexer(["batch"]) + rna_mod.obs.iloc[slice(rna_mod.n_obs // 2, None), column_index] = "B" + lognormed_test_data.write_h5mu(temp_h5mu) + return temp_h5mu + + +@pytest.fixture() +def filter_data_path(tmp_path, input_data): + temp_h5mu = tmp_path / "filtered.h5mu" + rna_in = input_data.mod["rna"] + sc.pp.filter_genes(rna_in, min_counts=20) + input_data.write_h5mu(temp_h5mu) + return temp_h5mu + + +@pytest.fixture() +def common_vars_data_path(tmp_path, lognormed_test_data): + temp_h5mu = tmp_path / "lognormed_var_input.h5mu" + rna_in = lognormed_test_data.mod["rna"] + rna_in.var["common_vars"] = False + rna_in.var["common_vars"].iloc[:10000] = True + rna_in.var["common_vars"] = rna_in.var["common_vars"].astype("boolean") + lognormed_test_data.write_h5mu(temp_h5mu) + return temp_h5mu + + +def test_filter_with_hvg(run_component, lognormed_test_data_path): + run_component( + [ + "--flavor", + "seurat", + "--input", + lognormed_test_data_path, + "--output", + "output.h5mu", + "--layer", + "log_transformed", + "--output_compression", + "gzip", + ] + ) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + # Put the output data back into its original shape + # so that we can compare it to the input + data.mod["rna"].var = data.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + data.var = data.var.drop(columns=["rna:filter_with_hvg"], errors="raise") + del data["rna"].varm["hvg"] + assert_annotation_objects_equal(lognormed_test_data_path, data) + + +def test_filter_with_hvg_var_input(run_component, common_vars_data_path): + run_component( + [ + "--flavor", + "seurat", + "--input", + common_vars_data_path, + "--output", + "output.h5mu", + "--layer", + "log_transformed", + "--output_compression", + "gzip", + ] + ) + + run_component( + [ + "--flavor", + "seurat", + "--input", + common_vars_data_path, + "--output", + "common_vars.h5mu", + "--layer", + "log_transformed", + "--output_compression", + "gzip", + "--var_input", + "common_vars", + ] + ) + + mdata = mu.read_h5mu("output.h5mu") + common_vars = mu.read_h5mu("common_vars.h5mu") + + # Assert detected HVG are different + hvg = mdata.mod["rna"][:, mdata.mod["rna"].var["filter_with_hvg"]].var_names + common_vars_hvg = common_vars.mod["rna"][ + :, common_vars.mod["rna"].var["filter_with_hvg"] + ].var_names + + assert len(hvg) != len(common_vars_hvg), ( + "Number of HVG should be different when var_input is defined" + ) + + # Assert original data is unchanged + # Put the output data back into its original shape + # so that we can compare it to the input + mdata.mod["rna"].var = mdata.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + mdata.var = mdata.var.drop(columns=["rna:filter_with_hvg"], errors="raise") + del mdata["rna"].varm["hvg"] + + common_vars.mod["rna"].var = common_vars.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + common_vars.var = common_vars.var.drop( + columns=["rna:filter_with_hvg"], errors="raise" + ) + del common_vars["rna"].varm["hvg"] + + assert_annotation_objects_equal(mdata, common_vars) + + +def test_filter_with_hvg_batch_with_batch( + run_component, lognormed_batch_test_data_path +): + """ + Make sure that selecting a layer works together with obs_batch_key. + https://github.com/scverse/scanpy/issues/2396 + """ + run_component( + [ + "--flavor", + "seurat", + "--input", + lognormed_batch_test_data_path, + "--output", + "output.h5mu", + "--obs_batch_key", + "batch", + "--layer", + "log_transformed", + ] + ) + assert os.path.exists("output.h5mu") + output_data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in output_data.mod["rna"].var.columns + + # Check the contents of the output to check if the correct layer was selected + input_mudata = mu.read_h5mu(lognormed_batch_test_data_path) + input_data = input_mudata.mod["rna"].copy() + input_data.X = input_data.layers["log_transformed"].copy() + del input_data.layers["log_transformed"] + input_data.uns["log1p"]["base"] = None + expected_output = sc.pp.highly_variable_genes( + input_data, batch_key="batch", inplace=False, subset=False + ) + expected_output = expected_output.reindex(index=input_mudata.mod["rna"].var.index) + pd.testing.assert_series_equal( + expected_output["highly_variable"], + output_data.mod["rna"].var["filter_with_hvg"], + check_names=False, + ) + # Put the output data back into its original shape + # so that we can compare it to the input + output_data.mod["rna"].var = output_data.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + output_data.var = output_data.var.drop( + columns=["rna:filter_with_hvg"], errors="raise" + ) + assert_annotation_objects_equal(lognormed_batch_test_data_path, output_data) + + +def test_filter_with_hvg_seurat_v3_requires_n_top_features(run_component, input_path): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--flavor", + "seurat_v3", # Uses raw data. + "--output", + "output.h5mu", + ] + ) + assert re.search( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.", + err.value.stdout.decode("utf-8"), + ) + + +def test_filter_with_hvg_seurat_v3(run_component, input_path): + run_component( + [ + "--input", + input_path, + "--flavor", + "seurat_v3", # Uses raw data. + "--output", + "output.h5mu", + "--n_top_features", + "50", + ] + ) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + # Put the output data back into its original shape + # so that we can compare it to the input + data.mod["rna"].var = data.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + data.var = data.var.drop(columns=["rna:filter_with_hvg"], errors="raise") + assert_annotation_objects_equal(input_path, data) + + +def test_filter_with_hvg_cell_ranger(run_component, filter_data_path): + run_component( + [ + "--input", + filter_data_path, + "--flavor", + "cell_ranger", # Must use filtered data. + "--output", + "output.h5mu", + ] + ) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + # Put the output data back into its original shape + # so that we can compare it to the input + data.mod["rna"].var = data.mod["rna"].var.drop( + columns=["filter_with_hvg"], errors="raise" + ) + data.var = data.var.drop(columns=["rna:filter_with_hvg"], errors="raise") + del data["rna"].varm["hvg"] + assert_annotation_objects_equal(filter_data_path, data) + + +def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message( + run_component, input_path +): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--flavor", + "cell_ranger", # Must use filtered data, but in this test we use unfiltered data + "--output", + "output.h5mu", + ] + ) + assert re.search( + r"Scanpy failed to calculate hvg. The error " + r"returned by scanpy \(see above\) could be the " + r"result from trying to use this component on unfiltered data.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml b/src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml new file mode 100644 index 00000000..22cb0e8c --- /dev/null +++ b/src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml @@ -0,0 +1,164 @@ +name: score_genes_cell_cycle_scanpy +namespace: feature_annotation +scope: "public" +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +description: | + Calculates the score associated to S phase and G2M phase and annotates the cell cycle phase for each cell, as implemented by scanpy. + The score is the average expression of a set of genes subtracted with the average expression of a reference set of genes. +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Input h5mu file + required: true + example: input_file.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + example: log_normalized + required: false + description: | + The layer of the adata object containing normalized expression values. + If not provided, the X attribute of the adata object will be used. + - name: "--var_gene_names" + type: string + description: | + The name of the column in the var attribute of the adata object that contains the gene names (symbols). + If not provided, the index of the var attribute will be used. + required: false + example: "gene_names" + - name: Gene list inputs + description: | + The gene list inputs can be provided as a list of gene symbols or as a file containing a list of gene symbols. The gene list file should be formatted as a single column with gene symbols. + + Make sure that the gene list inputs are consistent with the gene names in the adata object as provided by the --var_gene_names argument. + arguments: + - name: --s_genes + type: string + description: | + List of gene symbols for scoring s phase genes. + required: false + multiple: true + example: [gene1, gene2, gene3] + - name: --s_genes_file + type: file + description: | + Path to a .txt file containing the gene list of s phase genes to be scored. + The gene list file should be formatted as a single column with gene symbols. + required: false + example: s_gene_list.txt + - name: --g2m_genes + type: string + description: | + List of gene symbols for scoring g2m phase genes. + required: false + multiple: true + example: [gene1, gene2, gene3] + - name: --g2m_genes_file + type: file + description: | + Path to a .txt file containing the gene list of g2m phase genes to be scored. + The gene list file should be formatted as a single column with gene symbols. + required: false + example: g2m_gene_list.txt + - name: --gene_pool + type: string + description: | + List of gene symbols for sampling the reference set. Default is all genes. + required: false + multiple: true + example: [gene1, gene2, gene3] + - name: --gene_pool_file + type: file + description: | + File with genes for sampling the reference set. Default is all genes. + The gene pool file should be formatted as a single column with gene symbols. + required: false + example: gene_pool.txt + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + Output h5mu file + required: true + example: output_file.h5mu + - name: "--obs_phase" + type: string + description: | + The name of the column in the obs attribute of the adata object that will store the cell cycle phase annotation. + required: false + default: "phase" + - name: "--obs_s_score" + type: string + description: | + The name of the column in the obs attribute of the adata object that will store the s phase score. + required: false + default: "S_score" + - name: "--obs_g2m_score" + type: string + description: | + The name of the column in the obs attribute of the adata object that will store the g2m phase score. + required: false + default: "G2M_score" + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: --n_bins + type: integer + min: 0 + default: 25 + required: false + description: | + Number of expression level bins for sampling. + - name: --random_state + type: integer + description: | + The random seed for sampling. + default: 0 + required: false + - name: --allow_missing_genes + type: boolean + description: | + If true, missing genes in the gene list will be ignored. + default: false + required: false + +resources: + - path: script.py + type: python_script + - path: ../score_genes_scanpy/helper.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.11 + setup: + - type: python + __merge__: [ /src/base/requirements/scanpy.yaml, /src/base/requirements/anndata_mudata.yaml ] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: + - type: nextflow + - type: executable \ No newline at end of file diff --git a/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py b/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py new file mode 100644 index 00000000..7a2d2af7 --- /dev/null +++ b/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py @@ -0,0 +1,97 @@ +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_layer": "log_normalized", + "var_gene_names": "gene_symbol", + "s_genes": ["MCM5", "PCNA", "TYMS"], + "s_genes_file": None, + "g2m_genes": ["UBE2C", "BIRC5", "TPX2"], + "g2m_genes_file": None, + "gene_pool": [], + "gene_pool_file": None, + "output": "output.h5mu", + "obs_phase": "phase", + "obs_s_score": "S_score", + "obs_g2m_score": "G2M_score", + "n_bins": 25, + "random_state": 0, + "output_compression": "gzip", + "allow_missing_genes": False, +} +meta = {"resources_dir": "src/feature_annotation/score_genes_scanpy"} +## VIASH END + +# import helper functions +sys.path.append(meta["resources_dir"]) +from helper import read_gene_list + +# read data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] + +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) +gene_names = pd.Series(input_adata.var_names, index=gene_names_index) + +# check if var index is unique +# input.var[par["var_gene_names"]] is mapped to var index, but may not contain unique values +if not input_adata.var.index.is_unique: + raise ValueError("var index is not unique") + +# read gene lists +s_genes = read_gene_list(par, gene_names.index, "s_genes", "s_genes_file") +g2m_genes = read_gene_list(par, gene_names.index, "g2m_genes", "g2m_genes_file") +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) + +# find matching index names for given genes +g2m_index = gene_names.loc[g2m_genes].tolist() +s_index = gene_names.loc[s_genes].tolist() +gene_pool_index = gene_names.loc[gene_pool].tolist() if gene_pool else None + +# create input data for scanpy +if par["input_layer"]: + X_data = input_adata.layers[par["input_layer"]].copy() +else: + X_data = input_adata.X.copy() +adata_scanpy = ad.AnnData( + X=X_data, + obs=pd.DataFrame(index=input_adata.obs.index), + var=pd.DataFrame(index=input_adata.var.index), +) + +# run score_genes_cell_cycle +sc.tl.score_genes_cell_cycle( + adata_scanpy, + s_genes=s_index, + g2m_genes=g2m_index, + gene_pool=gene_pool_index, + n_bins=par["n_bins"], + random_state=par["random_state"], +) + +# copy results to mudata +output_slot_mapping = { + par["obs_s_score"]: "S_score", + par["obs_g2m_score"]: "G2M_score", + par["obs_phase"]: "phase", +} +assert all(adata_scanpy.obs.index == input_adata.obs.index), ( + "index mismatch between input adata and scanpy output adata" +) +for dest, orig in output_slot_mapping.items(): + input_adata.obs[dest] = adata_scanpy.obs[orig] + +# write output to mudata +mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py b/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py new file mode 100644 index 00000000..13be1b4d --- /dev/null +++ b/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py @@ -0,0 +1,183 @@ +import pytest +import sys +import mudata as mu +import subprocess +import re + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +def test_cell_scoring_cell_cycle(run_component, tmp_path): + output_file = tmp_path / "output.h5mu" + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes", + "MCM5", + "--s_genes", + "PCNA", + "--s_genes", + "TYMS", + "--g2m_genes", + "UBE2C", + "--g2m_genes", + "BIRC5", + "--g2m_genes", + "TPX2", + "--output", + output_file, + "--obs_phase", + "my_phase", + "--obs_s_score", + "my_s_score", + "--obs_g2m_score", + "my_g2m_score", + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = [ + "my_phase", + "my_s_score", + "my_g2m_score", + ] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns mdata.mod['rna'].obs['{col}']" + ) + + +def test_cell_scoring_cell_cycle_with_alternative_args(run_component, tmp_path): + output_file = tmp_path / "output_new.h5mu" + g2m_gene_file = tmp_path / "g2m_genes.txt" + s_gene_file = tmp_path / "s_genes.txt" + + with open(g2m_gene_file, "w") as f: + f.write("UBE2C\nBIRC5\nTPX2\n") + with open(s_gene_file, "w") as f: + f.write("MCM5\nPCNA\nTYMS\n") + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes_file", + s_gene_file, + "--g2m_genes_file", + g2m_gene_file, + "--output", + output_file, + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = [ + "phase", + "S_score", + "G2M_score", + ] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns mdata.mod['rna'].obs['{col}']" + ) + + +def test_cell_scoring_cell_cycle_with_mixed_args(run_component, tmp_path): + output_file = tmp_path / "output_new.h5mu" + g2m_gene_file = tmp_path / "g2m_genes.txt" + s_gene_file = tmp_path / "s_genes.txt" + + with open(g2m_gene_file, "w") as f: + f.write("UBE2C\nBIRC5\nTPX2\n") + with open(s_gene_file, "w") as f: + f.write("MCM5\nPCNA\nTYMS\n") + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes_file", + s_gene_file, + "--s_genes", + "FEN1", + "--g2m_genes_file", + g2m_gene_file, + "--g2m_genes", + "TOP2A", + "--output", + output_file, + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = [ + "phase", + "S_score", + "G2M_score", + ] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns mdata.mod['rna'].obs['{col}']" + ) + + +def test_fail(run_component, tmp_path): + output_file = tmp_path / "output_newest.h5mu" + + with pytest.raises(subprocess.CalledProcessError) as e_info: + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes", + "a_gene_name_that_does_not_exist", + "--g2m_genes", + "MCM5", + "--output", + output_file, + ] + ) + + assert e_info.value.returncode != 0 + expected_error = r"The follow genes are missing from the input dataset: {\'a_gene_name_that_does_not_exist\'}" + assert re.search(expected_error, e_info.value.stdout.decode("utf-8")) is not None, ( + f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" + ) + + assert not output_file.exists(), f"output file should not exist: {output_file}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/score_genes_scanpy/config.vsh.yaml b/src/feature_annotation/score_genes_scanpy/config.vsh.yaml new file mode 100644 index 00000000..50867404 --- /dev/null +++ b/src/feature_annotation/score_genes_scanpy/config.vsh.yaml @@ -0,0 +1,140 @@ +name: score_genes_scanpy +namespace: feature_annotation +scope: "public" +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +description: | + Calculates the score of a set of genes for each cell, as implemented by scanpy. + The score is the average expression of a set of genes subtracted with the average expression of a reference set of genes. +argument_groups: + +- name: Inputs + arguments: + - name: --input + type: file + description: Input h5mu file + required: true + example: input_file.h5mu + - name: --gene_list + type: string + description: | + List of gene symbols to be scored. + required: false + multiple: true + example: [gene1, gene2, gene3] + - name: --gene_list_file + type: file + description: | + Path to a .txt file containing the gene list to be scored. + The gene list file should be formatted as a single column with gene symbols. + required: false + example: gene_list.txt + - name: --gene_pool + type: string + description: | + List of gene symbols for sampling the reference set. Default is all genes. + required: false + multiple: true + example: [gene1, gene2, gene3] + - name: --gene_pool_file + type: file + description: | + File with genes for sampling the reference set. Default is all genes. + The gene pool file should be formatted as a single column with gene symbols. + required: false + example: gene_pool.txt + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + example: log_normalized + required: false + description: | + The layer of the adata object containing normalized expression values. + If not provided, the X attribute of the adata object will be used. + - name: "--var_gene_names" + required: false + example: "gene_symbol" + type: string + description: | + .var column name to be used to detect mitochondrial genes instead of .var_names (default if not set). + - name: "--allow_missing_genes" + type: boolean + description: | + Whether to run score_genes when some genes in the gene_list or gene_list_file are not present in the gene_pool + required: false + +- name: Outputs + arguments: + - name: --output + type: file + direction: output + description: | + Output h5mu file + required: true + example: output_file.h5mu + - name: --obs_score + type: string + default: score + required: false + description: | + Name of the score field to be added in .obs. + __merge__: [., /src/base/h5_compression_argument.yaml] + + +- name: Arguments + arguments: + - name: --ctrl_size + type: integer + min: 0 + default: 50 + required: false + description: | + Number of reference genes to be sampled from each bin. + If len(gene_list) is not too low, you can set ctrl_size=len(gene_list). + - name: --n_bins + type: integer + min: 0 + default: 25 + required: false + description: | + Number of expression level bins for sampling. + - name: --random_state + type: integer + description: | + The random seed for sampling. + default: 0 + required: false + +resources: + - path: script.py + type: python_script + - path: helper.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: +- type: docker + image: python:3.11 + setup: + - type: python + __merge__: [ /src/base/requirements/scanpy.yaml, /src/base/requirements/anndata_mudata.yaml ] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable \ No newline at end of file diff --git a/src/feature_annotation/score_genes_scanpy/helper.py b/src/feature_annotation/score_genes_scanpy/helper.py new file mode 100644 index 00000000..fe559658 --- /dev/null +++ b/src/feature_annotation/score_genes_scanpy/helper.py @@ -0,0 +1,42 @@ +from typing import List, Dict, Any, Optional + + +def read_gene_list( + par: Dict[str, Any], + gene_names: List[str], + list_key: str, + file_key: str, + required: bool = True, +) -> Optional[List[str]]: + """ + Reads a gene list from the parameters and returns it as a list of strings. + """ + + # check whether one or the other was provided, if required + if required and not par[list_key] and not par[file_key]: + raise ValueError(f"Either --{list_key} or --{file_key} must be set") + + # read gene list from parameters + list_of_genes = par[list_key] if par[list_key] else [] + + # read gene list from file + if par[file_key]: + with open(par[file_key]) as file: + file_genes = [x.strip() for x in file] + list_of_genes.extend(file_genes) + + # check for missing genes + if not par["allow_missing_genes"] and list_of_genes: + missing = set(list_of_genes).difference(gene_names) + if missing: + raise ValueError( + f"The follow genes are missing from the input dataset: {missing}" + ) + + # return gene list + if list_of_genes: + return list_of_genes + elif required: + raise ValueError(f"No genes detected in --{list_key} or --{file_key}") + else: + return None diff --git a/src/feature_annotation/score_genes_scanpy/script.py b/src/feature_annotation/score_genes_scanpy/script.py new file mode 100644 index 00000000..633bd905 --- /dev/null +++ b/src/feature_annotation/score_genes_scanpy/script.py @@ -0,0 +1,85 @@ +import scanpy as sc +import anndata as ad +import pandas as pd +import mudata as mu +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_layer": "log_normalized", + "gene_list_file": None, + "gene_list": ["MCM5", "PCNA", "TYMS"], + "gene_pool_file": None, + "gene_pool": None, + "var_gene_names": "gene_symbol", + "output": "output.h5mu", + "ctrl_size": 50, + "n_bins": 25, + "obs_score": "score", + "random_state": 0, + "output_compression": "gzip", + "allow_missing_genes": False, +} +meta = {"resources_dir": "src/feature_annotation/score_genes_scanpy"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import read_gene_list + +# read data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] + +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) +gene_names = pd.Series(input_adata.var_names, index=gene_names_index) + +# check if var index is unique +# input.var[par["var_gene_names"]] is mapped to var index, but may not contain unique values +if not input_adata.var.index.is_unique: + raise ValueError("var index is not unique") + +# read gene list +gene_list = read_gene_list(par, gene_names.index, "gene_list", "gene_list_file") +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) + +# find matching index names for given genes +gene_list_index = gene_names.loc[gene_list].tolist() +gene_pool_index = gene_names.loc[gene_pool].tolist() if gene_pool else None + +# create input data for scanpy +if par["input_layer"]: + layer_data = input_adata.layers[par["input_layer"]].copy() +else: + layer_data = input_adata.X.copy() +adata_scanpy = ad.AnnData( + X=layer_data, + obs=pd.DataFrame(index=input_adata.obs.index), + var=pd.DataFrame(index=input_adata.var.index), +) + +# run score_genes +sc.tl.score_genes( + adata_scanpy, + gene_list=gene_list_index, + gene_pool=gene_pool_index, + ctrl_size=par["ctrl_size"], + n_bins=par["n_bins"], + random_state=par["random_state"], +) + +# copy results to mudata +assert all(adata_scanpy.obs.index == input_adata.obs.index), ( + "index mismatch between input adata and scanpy output adata" +) +input_adata.obs[par["obs_score"]] = adata_scanpy.obs["score"] + +# write output to mudata +mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/feature_annotation/score_genes_scanpy/test.py b/src/feature_annotation/score_genes_scanpy/test.py new file mode 100644 index 00000000..c481e1ee --- /dev/null +++ b/src/feature_annotation/score_genes_scanpy/test.py @@ -0,0 +1,152 @@ +import pytest +import sys +import mudata as mu +import subprocess +import re + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture +def gene_list_file(tmp_path): + result = tmp_path / "s_genes.txt" + gene_list = ["UBE2C", "BIRC5", "TPX2"] + with result.open("w") as open_gene_list_file: + open_gene_list_file.write("\n".join(gene_list)) + return result + + +def test_cell_scoring(run_component, tmp_path): + output_file = tmp_path / "output.h5mu" + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list", + "UBE2C", + "--gene_list", + "BIRC5", + "--gene_list", + "TPX2", + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = ["cell_cycle_score"] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns .mod['rna'].obs['{col}']" + ) + + +def test_cell_scoring_with_alternative_args(run_component, tmp_path, gene_list_file): + output_file = tmp_path / "output_new.h5mu" + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list_file", + gene_list_file, + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = ["cell_cycle_score"] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns mdata.mod['rna'].obs['{col}']" + ) + + +def test_cell_scoring_with_mixed_args(run_component, tmp_path, gene_list_file): + output_file = tmp_path / "output_new.h5mu" + + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list_file", + gene_list_file, + "--gene_list", + "TOP2A", + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) + + output = mu.read(output_file) + + # check output + expected_rna_obs_cols = ["cell_cycle_score"] + for col in expected_rna_obs_cols: + assert col in output.mod["rna"].obs.columns, ( + f"could not find columns mdata.mod['rna'].obs['{col}']" + ) + + +def test_fail(run_component, tmp_path): + output_file = tmp_path / "output_newest.h5mu" + + with pytest.raises(subprocess.CalledProcessError) as e_info: + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list", + "a_gene_name_that_does_not_exist", + "--output", + output_file, + ] + ) + + assert e_info.value.returncode != 0 + expected_error = r"The follow genes are missing from the input dataset: {\'a_gene_name_that_does_not_exist\'}" + assert re.search(expected_error, e_info.value.stdout.decode("utf-8")) is not None, ( + f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" + ) + + assert not output_file.exists(), f"output file should not exist: {output_file}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/files/make_params/config.vsh.yaml b/src/files/make_params/config.vsh.yaml new file mode 100644 index 00000000..5755a880 --- /dev/null +++ b/src/files/make_params/config.vsh.yaml @@ -0,0 +1,62 @@ +name: make_params +namespace: "files" +scope: "public" +description: "Looks for files in a directory and turn it in a params file." +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, author ] +arguments: + - name: "--base_dir" + type: file + description: Base directory to search recursively + direction: input + required: true + example: "/path/to/dir" + - name: "--pattern" + type: string + description: An optional regular expression. Only file names which match the regular expression will be matched. + required: true + example: "*.fastq.gz" + - name: "--n_dirname_drop" + type: integer + description: For every matched file, the parent directory will be traversed N times. + default: 0 + - name: "--n_basename_id" + type: integer + description: The unique identifiers will consist of at least N dirnames. + default: 0 + - name: "--id_name" + type: string + description: The name for storing the identifier field in the yaml. + default: "id" + - name: "--path_name" + type: string + description: The name for storing the path field in the yaml. + default: "path" + - name: "--group_name" + type: string + description: Top level name for the group of entries. + example: param_list + - name: "--output" + type: file + description: Output YAML file. + direction: output + required: true + example: "params.yaml" +resources: + - type: r_script + path: script.R +test_resources: + - type: bash_script + path: test_make_params.sh + - path: ../../../src +engines: + - type: docker + image: ghcr.io/data-intuitive/randpy:r4.0 +runners: + - type: executable + - type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/files/make_params/script.R b/src/files/make_params/script.R new file mode 100644 index 00000000..b5e589bc --- /dev/null +++ b/src/files/make_params/script.R @@ -0,0 +1,73 @@ +library(dplyr) +library(purrr) + +## VIASH START +par <- list( + base_dir = "src", + pattern = "*.vsh.yaml", + n_dirname_drop = 1, + n_basename_id = 1, + output = "output.yaml", + id_name = "id", + path_name = "path", + group_name = "param_list" +) +## VIASH END + +cat("> Listing files of base dir ", par$base_dir, "\n", sep = "") +paths <- list.files( + normalizePath(par$base_dir), + pattern = par$pattern, + recursive = TRUE, + full.names = TRUE +) + +cat("> Traversing up ", par$n_dirname_apply, " times\n", sep = "") +for (i in seq_len(par$n_dirname_drop)) { + paths <- dirname(paths) %>% unique() +} + +# removing /viash_automount in case we're inside a docker container +paths <- gsub("^/viash_automount", "", paths) + +cat("> Checking whether basenames are unique\n") +i <- par$n_basename_id +maxi <- strsplit(paths, "/") %>% + map_int(length) %>% + max() + +regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)$") +ids <- gsub("/", "_", gsub(regex, "\\1", paths)) + +cat("> Printing first five rows\n") +print(tibble(id = ids, path = paths) %>% head(5)) +cat("\n") + +while (i < maxi && any(duplicated(ids))) { + i <- i + 1 + cat( + "Duplicated ids detected, combining with ", i, + " dirnames in an attempt to get unique ids.\n" + ) + regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)$") + ids <- gsub("/", "_", gsub(regex, "\\1", paths)) + + cat("> Printing first five rows\n") + print(tibble(id = ids, path = paths) %>% head(5)) + cat("\n") +} + +cat("> Transforming into list of items\n") +par_list <- map2( + ids, paths, + function(id, input) { + setNames(list(id, input), c(par$id_name, par$path_name)) + } +) + +if (!is.null(par$group_name)) { + par_list <- setNames(list(par_list), par$group_name) +} + +cat("> Writing as YAML\n") +yaml::write_yaml(par_list, par$output) diff --git a/src/files/make_params/test_make_params.sh b/src/files/make_params/test_make_params.sh new file mode 100644 index 00000000..f3a188c6 --- /dev/null +++ b/src/files/make_params/test_make_params.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +# function clean_up { +# rm -rf "$tmpdir" +# } +# trap clean_up EXIT + +echo "> Running $meta_executable." +$meta_executable \ + --base_dir "./src" \ + --pattern "*.vsh.yaml" \ + --n_dirname_drop 1 \ + --n_basename_id 1 \ + --output "$tmpdir/output.yaml" \ + --path_name "path" \ + --group_name "param_list" \ + --id_name "id" + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/output.yaml" ]] && echo "Output file could not be found!" && exit 1 + +if ! grep -qw param_list "$tmpdir/output.yaml"; then + echo "Yaml key 'param_list" not found && exit 1 +fi + + +# this component always be present because this test is executed. +if ! grep -qw "id: src_files_make_params" "$tmpdir/output.yaml"; then + echo "Yaml key 'id: src_files_make_params" not found && exit 1 +fi +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/filter/README.md b/src/filter/README.md new file mode 100644 index 00000000..18d6c387 --- /dev/null +++ b/src/filter/README.md @@ -0,0 +1,48 @@ +# Filter components + +Filter a h5mu file. + +Required input arguments: + +```yaml + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + type: string + multiple: true + default: "rna" + required: false +``` + +Required output arguments: + +```yaml + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + + - name: "--do_subset" + type: boolean_true + description: Whether to subset before storing the output. +``` + +Optional output arguments: + +```yaml + - name: "--obs_name_filter" + type: string + default: "filter_" + description: In which .obs slot to store a boolean array corresponding to which observations should be removed. + + - name: "--var_name_filter" + type: string + default: "filter_" + description: In which .var slot to store a boolean array corresponding to which variables should be removed. +``` \ No newline at end of file diff --git a/src/filter/delimit_fraction/config.vsh.yaml b/src/filter/delimit_fraction/config.vsh.yaml new file mode 100644 index 00000000..f914582a --- /dev/null +++ b/src/filter/delimit_fraction/config.vsh.yaml @@ -0,0 +1,90 @@ +name: delimit_fraction +namespace: "filter" +scope: "public" +description: | + Turns a column containing values between 0 and 1 into a boolean column based on thresholds. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--obs_fraction_column" + type: string + required: true + example: "fraction_mitochondrial" + description: | + Name of column from .var dataframe selecting + a column that contains floating point values between 0 and 1. + + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + + - name: "--obs_name_filter" + type: string + required: true + description: In which .obs slot to store a boolean array corresponding to which observations should be removed. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--min_fraction" + default: 0 + type: double + min: 0 + max: 1 + description: Min fraction for an observation to be retained (True in output). + + - name: "--max_fraction" + default: 1 + type: double + min: 0 + max: 1 + description: Max fraction for an observation to be retained (True in output). + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/delimit_fraction/script.py b/src/filter/delimit_fraction/script.py new file mode 100644 index 00000000..c9869288 --- /dev/null +++ b/src/filter/delimit_fraction/script.py @@ -0,0 +1,94 @@ +import mudata as mu +import numpy as np +import sys +from operator import le, ge +from pandas.api.types import is_float_dtype + + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "var_name_filter": "filter_with_counts", + "min_fraction": 0, + "max_fraction": 1, + "output_compression": "gzip", +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +assert data.var_names.is_unique, ( + "The var_names of the input modality must be be unique." +) + +logger.info("\tUnfiltered data: %s", data) +logger.info("\tComputing aggregations.") + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +try: + fraction = data.obs[par["obs_fraction_column"]] +except KeyError: + raise ValueError(f"Could not find column '{par['obs_fraction_column']}'") +if not is_float_dtype(fraction): + raise ValueError( + f"Column '{par['obs_fraction_column']}' does not contain float datatype." + ) +if fraction.max() > 1: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values > 1.") +if fraction.min() < 0: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values < 0.") + + +# Filter cells +filters = ( + ( + "min_fraction", + fraction, + ge, + "\tRemoving %s cells with <%s percentage mitochondrial reads.", + ), + ( + "max_fraction", + fraction, + le, + "\tRemoving %s cells with >%s percentage mitochondrial reads.", + ), +) + +keep_cells = np.repeat(True, data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +data.obs[par["obs_name_filter"]] = keep_cells + +logger.info("\tFiltered data: %s", data) +logger.info("Writing output data to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") diff --git a/src/filter/delimit_fraction/test.py b/src/filter/delimit_fraction/test.py new file mode 100644 index 00000000..bab6ccf0 --- /dev/null +++ b/src/filter/delimit_fraction/test.py @@ -0,0 +1,155 @@ +import mudata as mu +import sys +from pathlib import Path +import pytest +import numpy as np +from subprocess import CalledProcessError +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/filter/delimit_fraction/delimit_fraction", + "resources_dir": "resources_test/", + "config": "./src/filter/delimit_fraction/config.vsh.yaml", +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +@pytest.fixture +def original_input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_h5mu(original_input_path): + input_data = mu.read_h5mu(original_input_path) + input_data.mod["rna"].obs["test_fraction"] = np.random.rand( + input_data.mod["rna"].n_obs + ) + return input_data + + +@pytest.fixture +def input_h5mu_string_data(original_input_path): + input_data = mu.read_h5mu(original_input_path) + string_data = ["these", "are", "random", "values"] + input_data.mod["rna"].obs["test_fraction"] = np.random.choice( + string_data, input_data.mod["rna"].n_obs + ) + return input_data + + +@pytest.fixture +def input_path(input_h5mu, random_h5mu_path): + output_path = random_h5mu_path() + input_h5mu.write(output_path) + return output_path + + +@pytest.fixture +def input_path_string_data(input_h5mu_string_data, random_h5mu_path): + output_path = random_h5mu_path() + input_h5mu_string_data.write(output_path) + return output_path + + +def test_filter_nothing(run_component, input_path, random_h5mu_path): + output_path = random_h5mu_path() + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--min_fraction", + "0", + "--max_fraction", + "1", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) + assert Path(output_path).is_file() + mu_out = mu.read_h5mu(output_path) + assert "test_output" in mu_out.mod["rna"].obs + assert mu_out.mod["rna"].obs["test_output"].all() + + mu_out.mod["rna"].obs = mu_out.mod["rna"].obs.drop(["test_output"], axis=1) + mu_out.obs = mu_out.obs.drop(["rna:test_output"], axis=1) + mu_out.update() + assert_annotation_objects_equal(input_path, mu_out) + + +def test_filtering_a_little(run_component, input_path, random_h5mu_path): + output_path = random_h5mu_path() + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--min_fraction", + "0.5", + "--max_fraction", + "0.7", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) + + assert Path(output_path).is_file() + mu_out = mu.read_h5mu(output_path) + assert not mu_out.mod["rna"].obs["test_output"].all() + assert mu_out.mod["rna"].obs["test_output"].any() + + mu_out.mod["rna"].obs = mu_out.mod["rna"].obs.drop(["test_output"], axis=1) + mu_out.obs = mu_out.obs.drop(["rna:test_output"], axis=1) + mu_out.update() + assert_annotation_objects_equal(input_path, mu_out) + + +def test_filtering_wrong_data_raises( + run_component, input_path_string_data, random_h5mu_path +): + output_path = random_h5mu_path() + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + input_path_string_data, + "--output", + output_path, + "--min_fraction", + "0.5", + "--max_fraction", + "0.7", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) + assert ( + "Column 'test_fraction' does not contain float datatype." + in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/filter/do_filter/config.vsh.yaml b/src/filter/do_filter/config.vsh.yaml new file mode 100644 index 00000000..2f0cd25e --- /dev/null +++ b/src/filter/do_filter/config.vsh.yaml @@ -0,0 +1,72 @@ +name: do_filter +namespace: "filter" +scope: "public" +description: | + Remove observations and variables based on specified .obs and .var columns. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, contributor ] +arguments: + # input + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--obs_filter" + type: string + example: "filter_with_x" + multiple: true + description: Which .obs columns to use to filter the observations by. + + - name: "--var_filter" + type: string + example: "filter_with_x" + multiple: true + description: Which .var columns to use to filter the observations by. + + # output + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] diff --git a/src/filter/do_filter/script.py b/src/filter/do_filter/script.py new file mode 100644 index 00000000..ca07a72d --- /dev/null +++ b/src/filter/do_filter/script.py @@ -0,0 +1,67 @@ +import mudata as mu +import numpy as np +import sys + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "obs_filter": ["filter_none", "filter_with_random"], + "var_filter": ["filter_with_random"], + "output": "output.h5mu", +} + +mdata = mu.read_h5mu(par["input"]) +mdata.mod["rna"].obs["filter_none"] = np.repeat(True, mdata.mod["rna"].n_obs) +mdata.mod["rna"].obs["filter_with_random"] = np.random.choice( + a=[False, True], size=mdata.mod["rna"].n_obs +) +mdata.mod["rna"].var["filter_with_random"] = np.random.choice( + a=[False, True], size=mdata.mod["rna"].n_vars +) +mod = "rna" +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality: %s", par["input"], par["modality"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +obs_filt = np.repeat(True, adata.n_obs) +var_filt = np.repeat(True, adata.n_vars) + +par["obs_filter"] = par["obs_filter"] if par["obs_filter"] else [] +par["var_filter"] = par["var_filter"] if par["var_filter"] else [] + +for obs_name in par["obs_filter"]: + logger.info( + "Filtering modality '%s' observations by .obs['%s']", par["modality"], obs_name + ) + if obs_name not in adata.obs: + raise ValueError(f".mod[{par['modality']}].obs[{obs_name}] does not exist.") + if obs_name in adata.obs: + obs_filt &= adata.obs[obs_name] + +for var_name in par["var_filter"]: + logger.info( + "Filtering modality '%s' variables by .var['%s']", par["modality"], var_name + ) + if var_name not in adata.var: + raise ValueError(f".mod[{par['modality']}].var[{var_name}] does not exist.") + if var_name in adata.var: + var_filt &= adata.var[var_name] + +adata_filtered = adata[obs_filt, var_filt] + +logger.info("Writing h5mu to file %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + adata_filtered, + par["output_compression"], +) diff --git a/src/filter/do_filter/test.py b/src/filter/do_filter/test.py new file mode 100644 index 00000000..6b840339 --- /dev/null +++ b/src/filter/do_filter/test.py @@ -0,0 +1,174 @@ +import sys +import pytest +import uuid +from subprocess import CalledProcessError +import re +import mudata as mu +import numpy as np + +## VIASH START +meta = { + "name": "./target/native/filter/do_filter/do_filter", + "resources_dir": "resources_test/", + "executable": "./target/executable/filter/do_filter/do_filter", + "config": "./src/filter/do_filter/config.vsh.yaml", +} +## VIASH END + +input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def random_h5mu_path(tmp_path): + unique_filename = f"{str(uuid.uuid4())}.h5mu" + temp_file = tmp_path / unique_filename + return temp_file + + +@pytest.fixture +def write_to_temp_file(tmp_path): + def write_h5mu_wrapper(mudata_object): + unique_filename = f"{str(uuid.uuid4())}.h5mu" + temp_file = tmp_path / unique_filename + mudata_object.write(temp_file) + return temp_file + + return write_h5mu_wrapper + + +@pytest.fixture() +def input_data(): + mu_in = mu.read_h5mu(input_path) + return mu_in + + +@pytest.fixture() +def original_n_obs(input_data): + return input_data.mod["rna"].n_obs + + +@pytest.fixture() +def original_n_vars(input_data): + return input_data.mod["rna"].n_vars + + +@pytest.fixture() +def test_data_filter_nothing(input_data, write_to_temp_file): + rna_mod = input_data.mod["rna"] + rna_mod.obs["filter_none"] = np.repeat(True, rna_mod.n_obs) + return write_to_temp_file(input_data) + + +@pytest.fixture() +def test_data_filter_with_random(input_data, write_to_temp_file): + rna_mod = input_data.mod["rna"] + rna_mod.obs["filter_with_random"] = np.random.choice( + [False, True], size=rna_mod.n_obs + ) + rna_mod.var["filter_with_random"] = np.random.choice( + [False, True], size=rna_mod.n_vars + ) + return write_to_temp_file(input_data) + + +def test_filtering_a_little_bit( + run_component, + test_data_filter_with_random, + random_h5mu_path, + original_n_obs, + original_n_vars, +): + component_output = run_component( + [ + "--input", + test_data_filter_with_random, + "--output", + random_h5mu_path, + "--obs_filter", + "filter_with_random", + "--var_filter", + "filter_with_random", + "--output_compression", + "gzip", + ] + ) + assert random_h5mu_path.is_file(), "Output file not found" + mu_out = mu.read_h5mu(random_h5mu_path) + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < original_n_obs, "Some RNA obs should have been filtered" + assert new_vars < original_n_vars, "Some RNA vars should have been filtered" + assert ( + b"Filtering modality 'rna' observations by .obs['filter_with_random']" + in component_output + ) + assert ( + b"Filtering modality 'rna' variables by .var['filter_with_random']" + in component_output + ) + + +def test_filter_nothing( + run_component, + test_data_filter_nothing, + random_h5mu_path, + original_n_obs, + original_n_vars, +): + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--obs_filter", + "filter_none", + ] + ) + assert random_h5mu_path.is_file(), "Output file not found" + mu_out = mu.read_h5mu(random_h5mu_path) + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs == original_n_obs, "No RNA obs should have been filtered" + assert new_vars == original_n_vars, "No RNA vars should have been filtered" + + +def test_nonexisting_column_raises( + run_component, test_data_filter_nothing, random_h5mu_path +): + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--obs_filter", + "doesnotexist", + ] + ) + assert re.search( + r"\.mod\[rna\]\.obs\[doesnotexist\] does not exist\.", + err.value.stdout.decode("utf-8"), + ) + + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--var_filter", + "doesnotexist", + ] + ) + + assert re.search( + r"\.mod\[rna\]\.var\[doesnotexist\] does not exist\.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/filter/filter_with_counts/config.vsh.yaml b/src/filter/filter_with_counts/config.vsh.yaml new file mode 100644 index 00000000..51986fe1 --- /dev/null +++ b/src/filter/filter_with_counts/config.vsh.yaml @@ -0,0 +1,114 @@ +name: filter_with_counts +namespace: "filter" +scope: "public" +description: | + Filter scRNA-seq data based on the primary QC metrics. + This is based on both the UMI counts, the gene counts + and the mitochondrial genes (genes starting with mt/MT). +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, author ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--layer" + description: | + description: | + Location of the count matrix. If specified, will be used to select a key from .layers, + otherwise .X is used. + type: string + example: "raw_counts" + required: false + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--do_subset" + type: boolean_true + description: Whether to subset before storing the output. + - name: "--obs_name_filter" + type: string + default: "filter_with_counts" + description: In which .obs slot to store a boolean array corresponding to which observations should be removed. + + - name: "--var_name_filter" + type: string + default: "filter_with_counts" + description: In which .var slot to store a boolean array corresponding to which variables should be removed. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--min_counts" + example: 200 + type: integer + description: Minimum number of counts captured per cell. + + - name: "--max_counts" + example: 5000000 + type: integer + description: Maximum number of counts captured per cell. + + - name: "--min_genes_per_cell" + example: 200 + type: integer + description: Minimum of non-zero values per cell. + + - name: "--max_genes_per_cell" + example: 1500000 + type: integer + description: Maximum of non-zero values per cell. + + - name: "--min_cells_per_gene" + example: 3 + type: integer + description: Minimum of non-zero values per gene. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/filter_with_counts/script.py b/src/filter/filter_with_counts/script.py new file mode 100644 index 00000000..787035ad --- /dev/null +++ b/src/filter/filter_with_counts/script.py @@ -0,0 +1,124 @@ +import mudata as mu +import numpy as np +import sys +from operator import le, ge, gt + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "do_subset": True, + "min_counts": 200, + "max_counts": 5000000, + "min_genes_per_cell": 200, + "max_genes_per_cell": 1500000, + "min_cells_per_gene": 3, + "layer": None, +} +meta = {"name": "filter_on_counts", "resources_dir": "."} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input data from %s, modality %s", par["input"], par["modality"]) +modality_data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("\tUnfiltered data: %s", modality_data) + +logger.info("Selecting input layer %s", "X" if par["layer"] else par["layer"]) +input_layer = ( + modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] +) + +logger.info("\tComputing aggregations.") +n_counts_per_cell = np.ravel(np.sum(input_layer, axis=1)) +n_cells_per_gene = np.sum(input_layer > 0, axis=0) +n_genes_per_cell = np.sum(input_layer > 0, axis=1) + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +# Filter genes +keep_genes = np.repeat(True, modality_data.n_vars) +if par["min_cells_per_gene"] is not None: + num_removed, keep_genes = apply_filter_to_mask( + keep_genes, n_cells_per_gene, par["min_cells_per_gene"], ge + ) + logger.info( + "\tRemoving %s genes with non-zero values in <%s cells.", + num_removed, + par["min_cells_per_gene"], + ) + +# Filter cells +filters = ( + ( + "min_genes_per_cell", + n_genes_per_cell, + ge, + "\tRemoving %s cells with non-zero values in <%s genes.", + ), + ( + "max_genes_per_cell", + n_genes_per_cell, + le, + "\tRemoving %s cells with non-zero values in >%s genes.", + ), + ("min_counts", n_counts_per_cell, ge, "\tRemoving %s cells with <%s total counts."), + ("max_counts", n_counts_per_cell, le, "\tRemoving %s cells with >%s total counts."), + ( + 0, + np.sum(input_layer[:, keep_genes], axis=1), + gt, + "\tRemoving %s cells with %s counts", + ), +) + +keep_cells = np.repeat(True, modality_data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +if par["obs_name_filter"] is not None: + modality_data.obs[par["obs_name_filter"]] = keep_cells +if par["var_name_filter"] is not None: + modality_data.var[par["var_name_filter"]] = keep_genes + +if par["do_subset"]: + modality_data = modality_data[keep_cells, keep_genes] + +logger.info("\tFiltered data: %s", modality_data) +logger.info( + "Writing output data to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], +) + + +logger.info("Finished") diff --git a/src/filter/filter_with_counts/test.py b/src/filter/filter_with_counts/test.py new file mode 100644 index 00000000..0d22c577 --- /dev/null +++ b/src/filter/filter_with_counts/test.py @@ -0,0 +1,212 @@ +import mudata as mu +import sys +from pathlib import Path +import pytest +import numpy as np + +## VIASH START +meta = { + "executable": "./target/executable/filter/filter_with_counts/filter_with_counts", + "resources_dir": "resources_test/", + "config": "/home/di/code/openpipeline/src/filter/filter_with_counts/config.vsh.yaml", +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_h5mu(input_path): + return mu.read_h5mu(input_path) + + +@pytest.fixture +def input_n_rna_obs(input_h5mu): + return input_h5mu.mod["rna"].n_obs + + +@pytest.fixture +def input_n_prot_obs(input_h5mu): + return input_h5mu.mod["prot"].n_obs + + +@pytest.fixture +def input_n_rna_vars(input_h5mu): + return input_h5mu.mod["rna"].n_vars + + +@pytest.fixture +def input_n_prot_vars(input_h5mu): + return input_h5mu.mod["prot"].n_vars + + +def test_filter_nothing( + run_component, + input_path, + input_n_rna_obs, + input_n_prot_obs, + input_n_rna_vars, + input_n_prot_vars, +): + run_component( + [ + "--input", + input_path, + "--output", + "output-1.h5mu", + "--min_cells_per_gene", + "3", + "--output_compression", + "gzip", + ] + ) + assert Path("output-1.h5mu").is_file() + mu_out = mu.read_h5mu("output-1.h5mu") + assert "filter_with_counts" in mu_out.mod["rna"].obs + assert "filter_with_counts" in mu_out.mod["rna"].var + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs == input_n_rna_obs + assert new_vars == input_n_rna_vars + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + + +def test_filtering_a_little( + run_component, + input_path, + input_n_rna_obs, + input_n_prot_obs, + input_n_rna_vars, + input_n_prot_vars, +): + run_component( + [ + "--input", + input_path, + "--output", + "output-2.h5mu", + "--modality", + "rna", + "--min_counts", + "200", + "--max_counts", + "5000000", + "--min_genes_per_cell", + "200", + "--max_genes_per_cell", + "1500000", + "--min_cells_per_gene", + "10", + "--do_subset", + ] + ) + assert Path("output-2.h5mu").is_file() + mu_out = mu.read_h5mu("output-2.h5mu") + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < input_n_rna_obs + assert new_vars < input_n_rna_vars + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + + +def test_filter_cells_without_counts(run_component, input_h5mu, tmp_path): + # create_an_empty_cell + obs_to_remove = input_h5mu.mod["rna"].obs.index[0] + input_h5mu.mod["rna"].X[0] = 0 + temp_h5mu_path = tmp_path / "temp.h5mu" + input_h5mu.write(temp_h5mu_path) + run_component( + [ + "--input", + temp_h5mu_path, + "--output", + "output-3.h5mu", + "--min_cells_per_gene", + "0", + ] + ) + assert Path("output-3.h5mu").is_file() + mu_out = mu.read_h5mu("output-3.h5mu") + assert mu_out.mod["rna"].obs.at[obs_to_remove, "filter_with_counts"] is np.False_ + assert "mitochondrial" not in mu_out.mod["rna"].var + + +def test_filter_using_different_layer( + run_component, + input_h5mu, + tmp_path, + input_n_rna_obs, + input_n_prot_obs, + input_n_rna_vars, + input_n_prot_vars, +): + # move X to different input layer + input_h5mu.mod["rna"].layers["test_layer"] = input_h5mu.mod["rna"].X.copy() + input_h5mu.mod["rna"].X = None + + temp_h5mu_path = tmp_path / "temp.h5mu" + input_h5mu.write(temp_h5mu_path) + run_component( + [ + "--input", + temp_h5mu_path, + "--output", + "output-4.h5mu", + "--modality", + "rna", + "--min_counts", + "200", + "--max_counts", + "5000000", + "--min_genes_per_cell", + "200", + "--max_genes_per_cell", + "1500000", + "--min_cells_per_gene", + "10", + "--layer", + "test_layer", + "--do_subset", + ] + ) + assert Path("output-4.h5mu").is_file() + mu_out = mu.read_h5mu("output-2.h5mu") + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < input_n_rna_obs + assert new_vars < input_n_rna_vars + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + + +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/filter/filter_with_scrublet/config.vsh.yaml b/src/filter/filter_with_scrublet/config.vsh.yaml new file mode 100644 index 00000000..795d272e --- /dev/null +++ b/src/filter/filter_with_scrublet/config.vsh.yaml @@ -0,0 +1,163 @@ +name: filter_with_scrublet +namespace: "filter" +scope: "public" +description: | + Doublet detection using the Scrublet method (Wolock, Lopez and Klein, 2019). + The method tests for potential doublets by using the expression profiles of + cells to generate synthetic potential doubles which are tested against cells. + The method returns a "doublet score" on which it calls for potential doublets. + + For the source code please visit https://github.com/AllonKleinLab/scrublet. + + For 10x we expect the doublet rates to be: + Multiplet Rate (%) - # of Cells Loaded - # of Cells Recovered + ~0.4% ~800 ~500 + ~0.8% ~1,600 ~1,000 + ~1.6% ~3,200 ~2,000 + ~2.3% ~4,800 ~3,000 + ~3.1% ~6,400 ~4,000 + ~3.9% ~8,000 ~5,000 + ~4.6% ~9,600 ~6,000 + ~5.4% ~11,200 ~7,000 + ~6.1% ~12,800 ~8,000 + ~6.9% ~14,400 ~9,000 + ~7.6% ~16,000 ~10,000 +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, contributor ] +arguments: + # input + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--layer" + description: "Input layer to use as data for calculating doublets. .X is used not specified." + type: string + required: false + + # output + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + + - name: "--obs_name_filter" + type: string + default: "filter_with_scrublet" + description: In which .obs slot to store a boolean array corresponding to which observations should be filtered out. + + - name: "--do_subset" + type: boolean_true + description: Whether to subset before storing the output. + + - name: "--obs_name_doublet_score" + type: string + default: "scrublet_doublet_score" + description: Name of the doublet scores column in the obs slot of the returned object. + + - name: "--expected_doublet_rate" + type: double + required: false + min: 0 + max: 1 + description: | + The estimated fraction of doublets as from the experimental setup. + + - name: "--stdev_doublet_rate" + type: double + min: 0 + required: false + description: Uncertainty in the expected doublet rate. + + - name: "--n_neighbors" + type: integer + required: false + min: 0 + description: | + Number of neighbors used to construct the KNN classifier of observed transcriptomes + and simulated doublets. + + - name: "--sim_doublet_ratio" + type: double + min: 0 + description: | + Number of doublets to simulate relative to the number of observed + transcriptomes. + + - name: "--min_counts" + type: integer + default: 2 + description: The number of minimal UMI counts per cell that have to be present for initial cell detection. + + - name: "--min_cells" + type: integer + default: 3 + description: The number of cells in which UMIs for a gene were detected. + + - name: "--min_gene_variablity_percent" + type: double + default: 85 + description: Used for gene filtering prior to PCA. Keep the most highly variable genes (in the top min_gene_variability_pctl percentile), as measured by the v-statistic [Klein et al., Cell 2015]. + + - name: "--num_pca_components" + type: integer + default: 30 + description: Number of principal components to use during PCA dimensionality reduction. + + - name: "--distance_metric" + type: string + default: "euclidean" + description: The distance metric used for computing similarities. + + - name: "--allow_automatic_threshold_detection_fail" + type: "boolean_true" + description: | + When scrublet fails to automatically determine the double score threshold, + allow the component to continue and set the output columns to NA. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - build-essential + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - scrublet + - annoy==1.17.3 + __merge__: [/src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable + docker_run_args: [--env NUMBA_CACHE_DIR=/tmp] +- type: nextflow + directives: + label: [highcpu, midmem] diff --git a/src/filter/filter_with_scrublet/script.py b/src/filter/filter_with_scrublet/script.py new file mode 100644 index 00000000..1758670c --- /dev/null +++ b/src/filter/filter_with_scrublet/script.py @@ -0,0 +1,116 @@ +import scrublet as scr +import mudata as mu +import numpy as np +import sys +import pandas as pd + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + # "input": "output/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu.h5mu", + "modality": "rna", + "output": "output.h5mu", + "output_compression": "gzip", + "obs_name_filter": "filter_with_scrublet", + "expected_doublet_rate": 0.05, + "min_counts": 2, + "min_cells": 3, + "min_gene_variablity_percent": 85, + "num_pca_components": 30, + "distance_metric": "euclidean", + "obs_name_doublet_score": "scrublet_doublet_score", + "obs_name_predicted_doublets": "scrublet_predicted_doublets", + "do_subset": True, + "layer": None, + "stdev_doublet_rate": None, + "sim_doublet_ratio": None, + "n_neighbors": None, +} +meta = { + "name": "scrublet", + "resources_dir": "src/utils", +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"]) +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] + +if 0 in input_layer.shape: + raise ValueError( + f"Modality {par['modality']} of input Mudata {par['input']} appears " + f"to be empty (shape: {input_layer.shape})." + ) + +logger.info("\tRunning scrublet") +initializer_args = { + arg: par[arg] + for arg in ( + "expected_doublet_rate", + "stdev_doublet_rate", + "n_neighbors", + "sim_doublet_ratio", + ) + if par[arg] is not None +} +scrub = scr.Scrublet(input_layer, **initializer_args) + +doublet_scores, predicted_doublets = scrub.scrub_doublets( + min_counts=par["min_counts"], + min_cells=par["min_cells"], + min_gene_variability_pctl=par["min_gene_variablity_percent"], + n_prin_comps=par["num_pca_components"], + distance_metric=par["distance_metric"], + use_approx_neighbors=False, +) + +try: + keep_cells = np.invert(predicted_doublets) +except TypeError: + if par["allow_automatic_threshold_detection_fail"]: + # Scrublet might not throw an error and return None if it fails to detect doublets... + logger.info( + "\tScrublet could not automatically detect the doublet score threshold. Setting output columns to NA." + ) + keep_cells = np.nan + doublet_scores = np.nan + else: + raise RuntimeError( + "Scrublet could not automatically detect the doublet score threshold. " + "--allow_automatic_threshold_detection_fail can be used to ignore this failure " + "and set the corresponding output columns to NA." + ) + +logger.info("\tStoring output into .obs") +if par["obs_name_doublet_score"] is not None: + data.obs[par["obs_name_doublet_score"]] = doublet_scores + data.obs[par["obs_name_doublet_score"]] = data.obs[ + par["obs_name_doublet_score"] + ].astype("float64") +if par["obs_name_filter"] is not None: + data.obs[par["obs_name_filter"]] = keep_cells + data.obs[par["obs_name_filter"]] = data.obs[par["obs_name_filter"]].astype( + pd.BooleanDtype() + ) + +if par["do_subset"]: + if pd.api.types.is_scalar(keep_cells) and pd.isna(keep_cells): + logger.warning("Not subsetting beacuse doublets were not predicted") + else: + data = data[keep_cells, :] + +logger.info( + "Writing h5mu to %s, with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) diff --git a/src/filter/filter_with_scrublet/test.py b/src/filter/filter_with_scrublet/test.py new file mode 100644 index 00000000..1516c790 --- /dev/null +++ b/src/filter/filter_with_scrublet/test.py @@ -0,0 +1,366 @@ +from pathlib import Path +import re +import subprocess +import pytest + +import mudata as mu +import numpy as np +import pandas as pd +import anndata as ad +from scipy.sparse import csr_matrix, csr_array + +## VIASH START +meta = { + "name": "foo", + "resources_dir": "resources_test/", + "executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet", + "config": "./src/filter/filter_with_scrublet/config.vsh.yaml", +} +## VIASH END + +# read input file + + +@pytest.fixture +def input_mudata_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_mudata(input_mudata_path): + return mu.read_h5mu(input_mudata_path) + + +@pytest.fixture +def input_with_failed_run(random_h5mu_path, input_mudata_path): + new_mudata_path = random_h5mu_path() + + mudata_in = mu.read_h5mu(input_mudata_path) + + # Make test reproducable + np.random.seed(4) + + # Simulate a failed scrublet run by passing very little cells + mudata = mudata_in[152].copy() + nobs = 100 + x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0) + + # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run) + replace_rate = 0.000001 + mask = np.random.choice( + [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate) + ).astype("bool") + r = np.random.rand(*x_data.shape) * np.max(x_data) + x_data[mask] = r[mask] + + # create obs + obs_name = mudata.mod["rna"].obs.index.to_list()[0] + obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)]) + + # create resulting mudata + mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var) + new_mudata = mu.MuData({"rna": mod}) + new_mudata.update() + new_mudata.write(new_mudata_path) + + return new_mudata_path + + +def test_filter_a_little_bit( + run_component, random_h5mu_path, input_mudata_path, input_mudata +): + output_mu = random_h5mu_path() + + run_component( + [ + "--input", + input_mudata_path, + "--output", + output_mu, + "--min_counts", + "3", + "--output_compression", + "gzip", + ] + ) + assert Path(output_mu).is_file(), "Output file not found" + + mu_out = mu.read_h5mu(output_mu) + assert "filter_with_scrublet" in mu_out.mod["rna"].obs + + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs == input_mudata.mod["rna"].n_obs, ( + "No RNA obs should have been filtered" + ) + assert new_vars == input_mudata.mod["rna"].n_vars, ( + "No RNA vars should have been filtered" + ) + assert mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs, ( + "No prot obs should have been filtered" + ) + assert mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars, ( + "No prot vars should have been filtered" + ) + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + + +def test_filtering_a_lot( + run_component, random_h5mu_path, input_mudata_path, input_mudata +): + output_mu = random_h5mu_path() + + run_component( + [ + "--input", + input_mudata_path, + "--output", + output_mu, + "--modality", + "rna", + "--min_counts", + "10", + "--num_pca_components", + "10", + "--do_subset", + ] + ) + assert Path(output_mu).is_file(), "Output file not found" + + mu_out = mu.read_h5mu(output_mu) + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < input_mudata.mod["rna"].n_obs, ( + "Some cells should have been filtered" + ) + assert new_vars == input_mudata.mod["rna"].n_vars, ( + "No genes should have been filtered" + ) + assert mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs, ( + "No prot obs should have been filtered" + ) + assert mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars, ( + "No prot vars should have been filtered" + ) + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + + +def test_empty_mudata(run_component, random_h5mu_path): + output_mu = random_h5mu_path() + empty_mudata_path = random_h5mu_path() + empty_mudata = mu.MuData( + { + modality: ad.AnnData(csr_array((5, 0), dtype=np.int8)) + for modality in ("rna",) + } + ) + + empty_mudata.write(empty_mudata_path) + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + empty_mudata_path, + "--output", + output_mu, + "--output_compression", + "gzip", + ] + ) + assert re.search( + "ValueError: Modality rna of input Mudata .* appears to be empty", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.xfail(strict=False) +def test_doublet_automatic_threshold_detection_fails( + run_component, input_with_failed_run, random_h5mu_path +): + """ + Test if the component fails if doublet score threshold could not automatically be set + """ + output_mu = random_h5mu_path() + + with pytest.raises(subprocess.CalledProcessError) as e_info: + run_component( + [ + "--input", + input_with_failed_run, + "--output", + output_mu, + "--output_compression", + "gzip", + "--num_pca_components", + "1", + "--min_gene_variablity_percent", + "0", + "--min_cells", + "1", + "--min_counts", + "1", + ] + ) + assert re.search( + r"RuntimeError: Scrublet could not automatically detect the doublet score threshold\. " + r"--allow_automatic_threshold_detection_fail can be used to ignore this failure and " + r"set the corresponding output columns to NA\.", + e_info.value.stdout.decode("utf-8"), + ) + + assert not Path(output_mu).is_file(), "Output file not found" + + +@pytest.mark.xfail(strict=False) +def test_doublet_automatic_threshold_detection_fails_recovery( + run_component, input_with_failed_run +): + """ + Test if the component can recover from scrublet not automatically able to set the doublet score threshold + and it is not set. + """ + output_mu = "output-5.h5mu" + + run_component( + [ + "--input", + input_with_failed_run, + "--output", + output_mu, + "--output_compression", + "gzip", + "--num_pca_components", + "1", + "--min_gene_variablity_percent", + "0", + "--min_cells", + "1", + "--min_counts", + "1", + "--allow_automatic_threshold_detection_fail", + ] + ) + assert Path(output_mu).is_file(), "Output file not found" + + mu_out = mu.read_h5mu(output_mu) + assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all() + + +def test_selecting_input_layer( + run_component, tmp_path, random_h5mu_path, input_mudata, input_mudata_path +): + output_mu = random_h5mu_path() + input_data = mu.read_h5mu(input_mudata_path) + input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X + input_data.mod["rna"].X = None + + temp_input = tmp_path / "temp.h5mu" + input_data.write(temp_input) + + run_component( + [ + "--input", + temp_input, + "--output", + output_mu, + "--modality", + "rna", + "--min_counts", + "10", + "--num_pca_components", + "10", + "--layer", + "test_layer", + "--do_subset", + ] + ) + assert Path(output_mu).is_file(), "Output file not found" + + mu_out = mu.read_h5mu(output_mu) + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < input_mudata.mod["rna"].n_obs, ( + "Some cells should have been filtered" + ) + assert new_vars == input_mudata.mod["rna"].n_vars, ( + "No genes should have been filtered" + ) + assert mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs, ( + "No prot obs should have been filtered" + ) + assert mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars, ( + "No prot vars should have been filtered" + ) + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + + +def test_customizing_detection_arguments( + run_component, random_h5mu_path, input_mudata_path, input_mudata +): + output_mu = random_h5mu_path() + + run_component( + [ + "--input", + input_mudata_path, + "--output", + output_mu, + "--modality", + "rna", + "--min_counts", + "10", + "--num_pca_components", + "10", + "--do_subset", + "--expected_doublet_rate", + "0.3", + "--stdev_doublet_rate", + "0.11", + "--n_neighbors", + "100", + "--sim_doublet_ratio", + "1.5", + ] + ) + assert Path(output_mu).is_file(), "Output file not found" + + mu_out = mu.read_h5mu(output_mu) + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + assert new_obs < input_mudata.mod["rna"].n_obs, ( + "Some cells should have been filtered" + ) + assert new_vars == input_mudata.mod["rna"].n_vars, ( + "No genes should have been filtered" + ) + assert mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs, ( + "No prot obs should have been filtered" + ) + assert mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars, ( + "No prot vars should have been filtered" + ) + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + + +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/filter/intersect_obs/config.vsh.yaml b/src/filter/intersect_obs/config.vsh.yaml new file mode 100644 index 00000000..16715ab1 --- /dev/null +++ b/src/filter/intersect_obs/config.vsh.yaml @@ -0,0 +1,68 @@ +name: intersect_obs +namespace: "filter" +scope: "public" +description: | + Create an intersection between two or more modalities. + + This component removes any observations which are not present in all modalities. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/authors/isabelle_bergiers.yaml + roles: [ contributor ] +arguments: + # input + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modalities" + description: | + Which modalities from the input MuData file to process. + type: string + multiple: true + required: true + example: [rna, prot] + + # output + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + +__merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowcpu, midmem] diff --git a/src/filter/intersect_obs/script.py b/src/filter/intersect_obs/script.py new file mode 100644 index 00000000..d8a709fd --- /dev/null +++ b/src/filter/intersect_obs/script.py @@ -0,0 +1,72 @@ +import mudata as mu +import anndata as ad +import sys +from pathlib import Path +import shutil + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modalities": ["rna", "prot"], + "output": "output.h5mu", +} +meta = {} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + + +def main(): + modality_names = par["modalities"] + + if len(modality_names) < 2: + raise ValueError("Please provide two more more modalities.") + + obs_names = {} + for mod_name in par["modalities"]: + try: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + except KeyError: + raise ValueError( + f"Modality {mod_name} does not exist for file {par['input']}." + ) + + obs_names[mod_name] = modality.obs_names.copy() + del modality + + intersected_index = None + for mod_name, mod_index in obs_names.items(): + if intersected_index is None: + intersected_index = mod_index + continue + intersected_index = intersected_index.intersection(mod_index) + + output_file = Path(par["output"]) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + + mdata = mu.MuData({modality: ad.AnnData() for modality in modality_names}) + mdata.write(output_file_uncompressed, compression=par["output_compression"]) + + for mod_name in modality_names: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + intersected_modality = modality[intersected_index] + mu.write_h5ad(output_file_uncompressed, data=intersected_modality, mod=mod_name) + + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, output_file, compression=par["output_compression"] + ) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + +if __name__ == "__main__": + main() diff --git a/src/filter/intersect_obs/test.py b/src/filter/intersect_obs/test.py new file mode 100644 index 00000000..5d5fb371 --- /dev/null +++ b/src/filter/intersect_obs/test.py @@ -0,0 +1,102 @@ +import sys +import pytest +from mudata import read_h5mu, MuData +from anndata import AnnData +import pandas as pd +import uuid + +## VIASH START +meta = { + "executable": "./target/executable/filter/intersect_obs/intersect_obs", + "resources_dir": "./resources_test/", + "cpus": 2, + "config": "./src/filter/intersect_modalities/config.vsh.yaml", +} +## VIASH END + +input_sample_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" +) + + +@pytest.fixture +def generate_h5mu(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) + ad1 = AnnData(df, obs=obs, var=var) + df2 = pd.DataFrame( + [[7, 8, 9], [10, 11, 12]], index=["obs2", "obs3"], columns=df.columns + ) + var2 = pd.DataFrame(["d", "e", "g"], index=df2.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df2.index, columns=["Obs"]) + ad2 = AnnData(df2, obs=obs2, var=var2) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) + return tmp_mudata + + +@pytest.fixture +def sample_mudata(generate_h5mu, tmp_path): + filename = f"{uuid.uuid4()}.h5mu" + output_file = tmp_path / filename + generate_h5mu.write(output_file) + return output_file + + +def test_intersect_obs(run_component, sample_mudata, tmp_path): + output_path = tmp_path / f"{uuid.uuid4()}.h5mu" + + # run component + run_component( + [ + "--input", + sample_mudata, + "--modalities", + "mod1;mod2", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + output = read_h5mu(output_path) + assert list(output.mod.keys()) == ["mod1", "mod2"] + assert output.obs_names.tolist() == ["obs2"] + + +def test_intersect_obs_with_real(run_component, tmp_path): + input_path = tmp_path / f"{uuid.uuid4()}.h5mu" + output_path = tmp_path / f"{uuid.uuid4()}.h5mu" + + # subset input + input = read_h5mu(input_sample_file) + input.mod["rna"] = input.mod["rna"][range(0, 100)] + input.mod["prot"] = input.mod["prot"][range(50, 150)] + input.update_obs() + input.write(input_path) + + # run component + run_component( + [ + "--input", + input_path, + "--modalities", + "rna;prot", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + output = read_h5mu(output_path) + assert list(output.mod.keys()) == ["rna", "prot"] + assert output.n_obs == 50 + assert output.obs_names.tolist() == input.obs_names[range(50, 100)].tolist() + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/filter/remove_modality/config.vsh.yaml b/src/filter/remove_modality/config.vsh.yaml new file mode 100644 index 00000000..bb87e7af --- /dev/null +++ b/src/filter/remove_modality/config.vsh.yaml @@ -0,0 +1,54 @@ +name: remove_modality +namespace: "filter" +scope: "public" +description: | + Remove a modality from a .h5mu file +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Name(s) of the modality to remove + type: string + multiple: true + required: true + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/remove_modality/script.py b/src/filter/remove_modality/script.py new file mode 100644 index 00000000..301266f3 --- /dev/null +++ b/src/filter/remove_modality/script.py @@ -0,0 +1,21 @@ +from mudata import read_h5mu, MuData + + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": ["rna"], + "output": "foo.h5mu", +} +### VIASH END + + +input_mudata = read_h5mu(par["input"]) +new_mods = { + mod_name: mod + for mod_name, mod in input_mudata.mod.items() + if mod_name not in par["modality"] +} + +new_mudata = MuData(new_mods) +new_mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/filter/remove_modality/test.py b/src/filter/remove_modality/test.py new file mode 100644 index 00000000..5c07e599 --- /dev/null +++ b/src/filter/remove_modality/test.py @@ -0,0 +1,40 @@ +import sys +import pytest +from mudata import read_h5mu + +## VIASH START +meta = { + "executable": "./target/executable/filter/remove_modality/remove_modality", + "resources_dir": "./resources_test/", + "cpus": 2, +} +## VIASH END + +input_sample_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) + + +def test_remove_component(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + input_sample_file, + "--modality", + "rna", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + output = read_h5mu(output_path) + assert list(output.mod.keys()) == ["prot"] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/filter/subset_h5mu/config.vsh.yaml b/src/filter/subset_h5mu/config.vsh.yaml new file mode 100644 index 00000000..60ff23d9 --- /dev/null +++ b/src/filter/subset_h5mu/config.vsh.yaml @@ -0,0 +1,59 @@ +name: subset_h5mu +namespace: "filter" +scope: "public" +description: | + Create a subset of a mudata file by selecting the first number of observations +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--number_of_observations" + type: integer + description: Number of observations to be selected from the h5mu file. + example: 5 +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/subset_h5mu/script.py b/src/filter/subset_h5mu/script.py new file mode 100644 index 00000000..ccde6fdc --- /dev/null +++ b/src/filter/subset_h5mu/script.py @@ -0,0 +1,24 @@ +import mudata + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "bar.h5mu", + "number_of_observations": 100, +} +### VIASH END + +if __name__ == "__main__": + # read data + data = mudata.read(par["input"]) + + # subset data + if par["modality"]: + data.mod[par["modality"]] = data.mod[par["modality"]][ + : par["number_of_observations"] + ] + else: + data = data[: par["number_of_observations"]] + + # write data + data.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/filter/subset_h5mu/test.py b/src/filter/subset_h5mu/test.py new file mode 100644 index 00000000..df04e7c9 --- /dev/null +++ b/src/filter/subset_h5mu/test.py @@ -0,0 +1,64 @@ +import sys +import pytest +import mudata as mu + +## VIASH START +meta = { + "executable": "./target/executable/filter/subset_h5mu/subset_h5mu", + "resources_dir": "resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +input_path = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) + + +def test_filter_nothing(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--number_of_observations", + "100", + "--output_compression", + "gzip", + ] + ) + + assert output_path.is_file(), "Output file not found" + + # check output file + mu_in = mu.read_h5mu(input_path) + mu_out = mu.read_h5mu(output_path) + + orig_vars = mu_in.mod["rna"].n_vars + orig_prot_obs = mu_in.mod["prot"].n_obs + orig_prot_vars = mu_in.mod["prot"].n_vars + + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars + + assert new_obs == 100, "Output should only contain 100 observations" + assert new_vars == orig_vars, "No RNA vars should have been filtered" + assert mu_out.mod["prot"].n_obs == orig_prot_obs, ( + "No prot obs should have been filtered" + ) + assert mu_out.mod["prot"].n_vars == orig_prot_vars, ( + "No prot vars should have been filtered" + ) + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/filter/subset_obsp/config.vsh.yaml b/src/filter/subset_obsp/config.vsh.yaml new file mode 100644 index 00000000..38667bde --- /dev/null +++ b/src/filter/subset_obsp/config.vsh.yaml @@ -0,0 +1,74 @@ +name: subset_obsp +namespace: "filter" +scope: "public" +description: | + Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Input + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_obsp_key" + type: string + required: true + description: The .obsp field to be filtered. + - name: "--input_obs_key" + type: string + required: true + description: The .obs column to filter on. + - name: "--input_obs_value" + type: string + required: true + description: The value to filter on in the .obs column. + - name: Output + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obsm_key" + type: string + required: true + description: The .obsm key to store the subset in. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/subset_obsp/script.py b/src/filter/subset_obsp/script.py new file mode 100644 index 00000000..62a6b64d --- /dev/null +++ b/src/filter/subset_obsp/script.py @@ -0,0 +1,53 @@ +import sys +import mudata as mu + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_obsp_key": "distances", + "input_obs_key": "leiden", + "input_obs_value": "1", + "output_obsm_key": "leiden_1", + "output": "subset_obsp_output.h5mu", + "output_compression": None, +} +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s, modality", par["input"], par["modality"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info( + f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}" + ) + # .obsp, .obs and .obsm index and .obsp columns all have a dimension length of `n_obs` + # the index dimensions remain unaltered, but .obsp columns will be subset + obsp = adata.obsp[par["input_obsp_key"]] + idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"] + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + idx = idx.to_numpy(dtype="bool", na_value=False) + obsm_subset = obsp[:, idx] + + logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}") + adata.obsm[par["output_obsm_key"]] = obsm_subset + + logger.info( + "Writing output to %s, modality %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + +if __name__ == "__main__": + main() diff --git a/src/filter/subset_obsp/test.py b/src/filter/subset_obsp/test.py new file mode 100644 index 00000000..4846f7dd --- /dev/null +++ b/src/filter/subset_obsp/test.py @@ -0,0 +1,56 @@ +import sys +import pytest +import mudata as mu + +## VIASH START +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3/"} +## VIASH END + + +@pytest.fixture +def input_h5mu(): + input = mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu") + input.mod["rna"].obs["filter_column"] = "group_2" + input.mod["rna"].obs["filter_column"][:50] = "group_1" + return input + + +@pytest.fixture +def input_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +def test_subset_obsp(input_path, run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--input_obsp_key", + "distances", + "--input_obs_key", + "filter_column", + "--input_obs_value", + "group_1", + "--output_obsm_key", + "group_1", + ] + ) + + assert output_path.is_file(), "Output file not found" + + # check output file + mu_out = mu.read_h5mu(output_path) + + assert "group_1" in mu_out.mod["rna"].obsm, "Output should contain group_1 in .obsm" + assert mu_out.mod["rna"].obsm["group_1"].shape[1] == 50, ( + "Obsm should only contain a subset of the original obsp matrix" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/genetic_demux/bcftools/config.vsh.yaml b/src/genetic_demux/bcftools/config.vsh.yaml new file mode 100644 index 00000000..597dce48 --- /dev/null +++ b/src/genetic_demux/bcftools/config.vsh.yaml @@ -0,0 +1,50 @@ +name: bcftools +namespace: genetic_demux +scope: "public" +description: Filter the variants called by freebayes or cellSNP +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +arguments: + - name: "--vcf" + type: file + required: true + multiple: true + description: VCF files, must have the same sample columns appearing in the same order. + - name: "--concat" + type: boolean_true + description: Concatenate or combine VCFs and sort them. + - name: "--filter" + type: boolean_true + description: "Filter VCFs." + - name: "--filter_qual" + type: integer + default: 30 + description: "Filter VCFs with specified QUAL threshold." + - name: "--output" + type: file + direction: output + description: bcftools output directory + example: "bcftools_out/" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: ubuntu:latest + setup: + - type: apt + packages: [ wget, bzip2, gcc, make, libbz2-dev, zlib1g-dev, libncurses5-dev, libncursesw5-dev, liblzma-dev, autoconf, automake, perl, libcurl4-gnutls-dev, libssl-dev] + - type: docker + run: wget https://github.com/samtools/bcftools/releases/download/1.16/bcftools-1.16.tar.bz2 -O bcftools.tar.bz2 && tar -xjvf bcftools.tar.bz2 && cd bcftools-1.16 && make prefix=/usr/local install + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/bcftools/script.sh b/src/genetic_demux/bcftools/script.sh new file mode 100644 index 00000000..227294bf --- /dev/null +++ b/src/genetic_demux/bcftools/script.sh @@ -0,0 +1,20 @@ +#!/bin/bash +if [ ! -d "$par_output" ]; then + mkdir -p $par_output +fi + +IFS=";" read -a vcf_list <<< $par_vcf + + +if [ "$par_concat" = true ] && [ "$par_filter" = true ] ; then + bcftools concat -o "$par_output/concated_chroms.vcf" ${vcf_list[@]} + bcftools sort "$par_output/concated_chroms.vcf" -o "$par_output/sorted_concated_chroms.vcf" + bcftools filter -i "QUAL>$par_filter_qual" "$par_output/sorted_concated_chroms.vcf" -o "$par_output/filtered_sorted_concated_chroms.vcf" + +elif [ "$par_filter" = true ] ; then + bcftools filter -i "QUAL>$par_filter_qual" ${vcf_list[@]} -o "$par_output/filtered.vcf" + +else + bcftools concat -o "$par_output/concated_chroms.vcf" ${vcf_list[@]} + bcftools sort "$par_output/concated_chroms.vcf" -o "$par_output/sorted_concated_chroms.vcf" +fi diff --git a/src/genetic_demux/bcftools/test.sh b/src/genetic_demux/bcftools/test.sh new file mode 100644 index 00000000..56a4f628 --- /dev/null +++ b/src/genetic_demux/bcftools/test.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --vcf "$meta_resources_dir/demuxafy_test_data/test_dataset_chr1_2.vcf" \ + --vcf "$meta_resources_dir/demuxafy_test_data/test_dataset_chr3_4.vcf" \ + --concat --filter \ + --output bcftools_result/ + +[[ ! -f bcftools_result/filtered_sorted_concated_chroms.vcf ]] && echo "Output processed VCF file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/cellsnp/config.vsh.yaml b/src/genetic_demux/cellsnp/config.vsh.yaml new file mode 100755 index 00000000..9e7c1221 --- /dev/null +++ b/src/genetic_demux/cellsnp/config.vsh.yaml @@ -0,0 +1,114 @@ +name: cellsnp +namespace: genetic_demux +scope: "public" +description: "cellSNP aims to pileup the expressed alleles in single-cell or bulk RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell RNA-seq data, particularly with vireo." +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--sam_file" + type: file + description: "Indexed sam/bam file(s), comma separated multiple samples. Mode 1a & 2a: one sam/bam file with single cell. Mode 1b & 2b: one or multiple bulk sam/bam files, no barcodes needed, but sample ids and regionsVCF." + - name: "--sam_index_file" + type: file + description: Input SAM/BAM Index file, problem with samFileList. + - name: "--sam_fileList" + type: file + description: A list file containing bam files, each per line, for Mode 1b & 2b. + - name: "--regions_vcf" + type: file + description: A vcf file listing all candidate SNPs, for fetch each variants. If None, pileup the genome. Needed for bulk samples. + - name: "--targets_vcf" + type: file + description: "Similar as --regions_vcf, but the next position is accessed by streaming rather than indexing/jumping (like -T in samtools/bcftools mpileup)." + - name: "--barcode_file" + type: file + description: A plain file listing all effective cell barcode. + - name: "--sample_list" + type: file + description: A list file containing sample IDs, each per line. + - name: "--sample_ids" + type: string + description: Comma separated sample ids. + - name: "--genotype" + type: boolean_true + description: If use, do genotyping in addition to counting. + - name: "--gzip" + type: boolean_true + description: If use, the output files will be zipped into BGZF format. + - name: "--print_skip_snps" + type: boolean_true + description: If use, the SNPs skipped when loading VCF will be printed. + - name: "--chrom" + type: string + description: "The chromosomes to use in integer format 1-22, comma separated" + - name: "--cell_tag" + type: string + default: "CB" + description: Tag for cell barcodes, turn off with None. + - name: "--umi_tag" + type: string + default: "Auto" + description: "Tag for UMI: UR, Auto, None. For Auto mode, use UR if barcodes is inputted, otherwise use None. None mode means no UMI but read counts." + - name: "--min_count" + type: integer + default: 20 + description: Minimum aggragated count. + - name: "--min_maf" + type: double + default: 0.00 + description: Minimum minor allele frequency. + - name: "--doublet_gl" + type: boolean_true + description: If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5. + - name: "--incl_flag" + type: string + description: "Required flags: skip reads with all mask bits unset." + - name: "--excl_flag" + type: string + description: "Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]" + - name: "--count_orphan" + type: boolean_true + description: If use, do not skip anomalous read pairs. + - name: "--min_mapq" + type: integer + default: 20 + description: Minimum MAPQ for read filtering. + - name: "--min_len" + type: integer + default: 30 + description: Minimum mapped length for read filtering. +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "--outDir" ] + type: file + direction: output + description: Output directory for VCF and sparse matrices. + example: "cellsnp_out/" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: ubuntu:latest + setup: + - type: apt + packages: [ wget, gcc, zlib1g, make, libbz2-dev, zlib1g-dev, libncurses5-dev, liblzma-dev, autoconf, automake, perl, libcurl4-gnutls-dev, libssl-dev, git, bzip2] + - type: docker + run : wget https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2 -O htslib.tar.bz2 && tar -xjvf htslib.tar.bz2 && cd htslib-1.16 && make && make install + - type: docker + run : git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite && autoreconf -iv && ./configure --with-htslib=/htslib-1.16 && make && make install + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/genetic_demux/cellsnp/script.sh b/src/genetic_demux/cellsnp/script.sh new file mode 100755 index 00000000..a18f5d03 --- /dev/null +++ b/src/genetic_demux/cellsnp/script.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "$par_genotype" == "false" ]] && unset par_genotype +[[ "$par_gzip" == "false" ]] && unset par_gzip +[[ "$par_print_skip_snps" == "false" ]] && unset par_print_skip_snps +[[ "$par_doublet_gl" == "false" ]] && unset par_doublet_gl +[[ "$par_count_orphan" == "false" ]] && unset par_count_orphan + +cellsnp-lite \ + ${meta_cpus:+--nproc $meta_cpus} \ + --cellTAG $par_cell_tag \ + --UMItag $par_umi_tag \ + --minCOUNT $par_min_count \ + --minMAF $par_min_maf \ + --minLEN $par_min_len \ + --minMAPQ $par_min_mapq \ + --outDir $par_output \ + ${par_sam_file:+--samFile $par_sam_file} \ + ${par_sam_fileList:+--samFileList $par_sam_fileList} \ + ${par_regions_vcf:+--regionsVCF $par_regions_vcf} \ + ${par_targets_vcf:+--targetsVCF $par_targets_vcf} \ + ${par_barcode_file:+--barcodeFile $par_barcode_file} \ + ${par_sample_list:+--sampleList $par_sample_list} \ + ${par_sample_ids:+--sampleIDs $par_sample_ids} \ + ${par_genotype:+--genotype} \ + ${par_gzip:+--gzip} \ + ${par_print_skip_snps:+--printSkipSNPs} \ + ${par_chrom:+--chrom $par_chrom} \ + ${par_doublet_gl:+--doubletGL} \ + ${par_incl_flag:+--inclFLAG $par_incl_flag} \ + ${par_excl_flag:+--exclFLAG $par_excl_flag} \ + ${par_count_orphan:+--countORPHAN} diff --git a/src/genetic_demux/cellsnp/test.sh b/src/genetic_demux/cellsnp/test.sh new file mode 100644 index 00000000..4683da78 --- /dev/null +++ b/src/genetic_demux/cellsnp/test.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --sam_file "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --barcode_file "$meta_resources_dir/demuxafy_test_data/barcodes.tsv" \ + --output cellsnp_result/ \ + --regions_vcf "$meta_resources_dir/demuxafy_test_data/test_dataset.vcf" + +[[ ! -f cellsnp_result/cellSNP.base.vcf ]] && echo "Output VCF file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/demuxlet/config.vsh.yaml b/src/genetic_demux/demuxlet/config.vsh.yaml new file mode 100644 index 00000000..8529c638 --- /dev/null +++ b/src/genetic_demux/demuxlet/config.vsh.yaml @@ -0,0 +1,153 @@ +name: demuxlet +namespace: genetic_demux +scope: "public" +description: | + Demuxlet is a software tool to deconvolute sample identity and identify multiplets when + multiple samples are pooled by barcoded single cell sequencing. If external genotyping data + for each sample is available (e.g. from SNP arrays), demuxlet would be recommended. Be careful + that the parameters on the github is not in line with the newest help version. +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--sam" + type: file + description: Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed. + - name: "--tag_group" + type: string + default: 'CB' + description: Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups. For 10x genomics, use CB. + - name: "--tag_umi" + type: string + default: 'UB' + description: Tag representing UMIs. For 10x genomiucs, use UB. + - name: "--plp" + type: string + description: Input pileup format. If the value is a string, it will be considered as the path of the plp file. If the value is boolean true, it will perform dscpileup. + - name: "--vcf" + type: file + description: Input VCF/BCF file, containing the individual genotypes (GT), posterior probability (GP), or genotype likelihood (PL). + - name: "--field" + type: string + default: "GT" + description: FORMAT field to extract the genotype, likelihood, or posterior from + - name: "--geno_error_offset" + type: double + default: 0.10 + description: Offset of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2] + - name: "--geno_error_coeff" + type: double + default: 0.0 + description: Slope of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2] + - name: "--r2_info" + type: string + default: "R2" + description: INFO field name representing R2 value. Used for representing imputation quality. + - name: "--min_mac" + type: integer + default: 1 + description: Minimum minor allele frequency. + - name: "--min_call_rate" + type: double + default: 0.50 + description: Minimum call rate. + - name: "--alpha" + type: string + default: "0.5" + description: Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5) + - name: "--doublet_prior" + type: double + default: 0.5 + description: Prior of doublet + - name: "--sm" + type: string + description: "List of sample IDs to compare to (default: use all)." + - name: "--sm_list" + type: string + description: File containing the list of sample IDs to compare. + - name: "--sam_verbose" + type: integer + default: 1000000 + description: Verbose message frequency for SAM/BAM/CRAM. + - name: "--vcf_verbose" + type: integer + default: 1000 + description: Verbose message frequency for VCF/BCF. + - name: "--cap_bq" + type: integer + default: 20 + description: Maximum base quality (higher BQ will be capped). + - name: "--min_bq" + type: integer + default: 13 + description: Minimum base quality to consider (lower BQ will be skipped). + - name: "--min_mq" + type: integer + default: 20 + description: Minimum mapping quality to consider (lower MQ will be ignored). + - name: "--min_td" + type: integer + default: 0 + description: Minimum distance to the tail (lower will be ignored). + - name: "--excl_flag" + type: integer + default: 3844 + description: "SAM/BAM FLAGs to be excluded." + - name: "--group_list" + type: string + description: List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run. + - name: "--min_total" + type: integer + default: 0 + description: Minimum number of total reads for a droplet/cell to be considered. + - name: "--min_snp" + type: integer + default: 0 + description: Minimum number of SNPs with coverage for a droplet/cell to be considered. + - name: "--min_umi" + type: integer + default: 0 + description: Minimum number of UMIs for a droplet/cell to be considered. +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "demux/" + - name: "--out" + type: string + description: demuxlet output file prefix + example: "demuxlet" +resources: + - type: r_script + path: script.R + - path: demuxlet.patch +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + copy: ["demuxlet.patch /opt/demuxlet.patch"] + - type: apt + packages: [ autoconf, wget, git, build-essential, libcurl4-openssl-dev, cmake, libbz2-dev, libssl-dev, liblzma-dev, zlib1g-dev, r-base] + - type: docker + run: git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install + - type: docker + run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin + - type: r + cran: [ readr, processx, dplyr ] + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/demuxlet/demuxlet.patch b/src/genetic_demux/demuxlet/demuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/src/genetic_demux/demuxlet/demuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/src/genetic_demux/demuxlet/script.R b/src/genetic_demux/demuxlet/script.R new file mode 100644 index 00000000..ac9d662d --- /dev/null +++ b/src/genetic_demux/demuxlet/script.R @@ -0,0 +1,94 @@ +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +par <- list( + sam = "resources_test/demuxafy_test_data/chr_1_pooled.sorted.bam", + vcf = "resources_test/demuxafy_test_data/test_dataset.vcf", + output = "demuxlet_result", + out = "out", + field = "GP" +) +## VIASH END + +if (!dir.exists(par$output)) { + dir.create(par$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "demuxlet", + "--out", paste0(par$output, "/", par$out) +) + +argmap <- c( + "tag_group" = "--tag-group", + "tag_umi" = "--tag-UMI", + "field" = "--field", + "geno_error_offset" = "--geno-error-offset", + "geno_error_coeff" = "--geno-error-coeff", + "r2_info" = "--r2-info", + "min_mac" = "--min-mac", + "min_call_rate" = "--min-callrate", + "alpha" = "--alpha", + "doublet_prior" = "--doublet-prior", + "sm" = "--sm", + "sm_list" = "--sm-list", + "sam_verbose" = "--sam-verbose", + "vcf_verbose" = "--vcf-verbose", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_mq" = "--min-MQ", + "min_td" = "--min-TD", + "excl_flag" = "--excl-flag", + "group_list" = "--group-list", + "min_total" = "--min-total", + "min_snp" = "--min-snp", + "min_umi" = "--min-umi", + "plp" = "--plp", + "vcf" = "--vcf", + "sam" = "--sam", + "sm" = "--sm", + "sm_list" = "--sm-list", + "group_list" = "--group-list" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz$status != 0) { + stop("Command failed with status ", zzz$status) +} + +out_file <- paste0(par$output, "/", par$out, ".best") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*),.*", "\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*),.*", "\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +demuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + demuxlet_assign, + paste0(par$output, "/cell_annotation.csv") +) diff --git a/src/genetic_demux/demuxlet/test.sh b/src/genetic_demux/demuxlet/test.sh new file mode 100644 index 00000000..47b64fca --- /dev/null +++ b/src/genetic_demux/demuxlet/test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --sam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --vcf "$meta_resources_dir/demuxafy_test_data/test_dataset.vcf" \ + --output demuxlet_result/ \ + --out out \ + --field GP + +[[ ! -f demuxlet_result/out.best ]] && echo "Output result file could not be found!" && exit 1 +[[ ! -f demuxlet_result/cell_annotation.csv ]] && echo "Output cell annotation file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/dsc_pileup/config.vsh.yaml b/src/genetic_demux/dsc_pileup/config.vsh.yaml new file mode 100644 index 00000000..70576eb2 --- /dev/null +++ b/src/genetic_demux/dsc_pileup/config.vsh.yaml @@ -0,0 +1,120 @@ +name: dsc_pileup +namespace: genetic_demux +scope: "public" +description: | + Dsc-pileup is a software tool to pileup reads and corresponding base quality + for each overlapping SNPs and each barcode. By using pileup files, + it would allow us to run demuxlet/freemuxlet pretty fast multiple times + without going over the BAM file again. +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--sam" + type: file + description: Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed. + - name: "--tag_group" + type: string + default: 'CB' + description: Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups. For 10x genomics, use CB. + - name: "--tag_umi" + type: string + default: 'UB' + description: Tag representing UMIs. For 10x genomiucs, use UB. + - name: "--exclude_flag" + type: integer + default: 1796 + description: SAM/BAM FLAGs to be excluded. + - name: "--vcf" + type: file + description: "Input VCF/BCF file for dsc-pileup, containing the AC and AN field." + - name: "--sm" + type: string + description: "List of sample IDs to compare to (default: use all)." + - name: "--sm_list" + type: string + description: File containing the list of sample IDs to compare. + - name: "--sam_verbose" + type: integer + default: 1000000 + description: Verbose message frequency for SAM/BAM/CRAM. + - name: "--vcf_verbose" + type: integer + default: 1000 + description: Verbose message frequency for VCF/BCF. + - name: "--skip_umi" + type: boolean_true + description: Do not generate [prefix].umi.gz file, which stores the regions covered by each barcode/UMI pair. + - name: "--cap_bq" + type: integer + default: 40 + description: Maximum base quality (higher BQ will be capped). + - name: "--min_bq" + type: integer + default: 13 + description: Minimum base quality to consider (lower BQ will be skipped). + - name: "--min_mq" + type: integer + default: 20 + description: Minimum mapping quality to consider (lower MQ will be ignored). + - name: "--min_td" + type: integer + default: 0 + description: Minimum distance to the tail (lower will be ignored). + - name: "--excl_flag" + type: integer + default: 3844 + description: SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering Options. + - name: "--group_list" + type: string + description: List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run. + - name: "--min_total" + type: integer + default: 0 + description: Minimum number of total reads for a droplet/cell to be considered. + - name: "--min_uniq" + type: integer + default: 0 + description: Minimum number of unique reads (determined by UMI/SNP pair) for a droplet/cell to be considered. + - name: "--min_snp" + type: integer + default: 0 + description: Minimum number of SNPs with coverage for a droplet/cell to be considered. +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "demux/" + - name: "--out" + type: string + description: dsc-pileup output file prefix + example: "demuxlet_dsc" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: ubuntu:20.04 + setup: + - type: apt + packages: [ autoconf, wget, git, build-essential, libcurl4-openssl-dev, cmake, libbz2-dev, libssl-dev, liblzma-dev, zlib1g-dev, r-base] + - type: docker + run: git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install + - type: docker + run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/dsc_pileup/script.sh b/src/genetic_demux/dsc_pileup/script.sh new file mode 100644 index 00000000..01a66f05 --- /dev/null +++ b/src/genetic_demux/dsc_pileup/script.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "$par_skip_umi" == "false" ]] && unset par_skip_umi + +# Create output directory if it doesn't exist +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi + +popscle dsc-pileup \ + --sam $par_sam \ + --tag-group $par_tag_group \ + --tag-UMI $par_tag_umi \ + --exclude-flag $par_exclude_flag \ + --sam-verbose $par_sam_verbose \ + --vcf $par_vcf \ + --vcf-verbose $par_vcf_verbose \ + --out "$par_output/$par_out" \ + --cap-BQ $par_cap_bq \ + --min-BQ $par_min_bq \ + --min-MQ $par_min_mq \ + --min-TD $par_min_td \ + --excl-flag $par_excl_flag \ + --min-total $par_min_total \ + --min-uniq $par_min_uniq \ + --min-snp $par_min_snp \ + ${par_sm:+--sm $par_sm} \ + ${par_sm_list:+--sm-list $par_sm_list} \ + ${par_skip_umi:+--skip-umi} \ + ${par_group_list:+--group-list $par_group_list} diff --git a/src/genetic_demux/dsc_pileup/test.sh b/src/genetic_demux/dsc_pileup/test.sh new file mode 100644 index 00000000..a294f263 --- /dev/null +++ b/src/genetic_demux/dsc_pileup/test.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --sam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --vcf "$meta_resources_dir/demuxafy_test_data/test_dataset.vcf" \ + --output dscpileup_result/ \ + --out out + +[[ ! -f dscpileup_result/out.plp.gz ]] && echo "Output result file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/freebayes/config.vsh.yaml b/src/genetic_demux/freebayes/config.vsh.yaml new file mode 100755 index 00000000..54698445 --- /dev/null +++ b/src/genetic_demux/freebayes/config.vsh.yaml @@ -0,0 +1,289 @@ +name: freebayes +namespace: genetic_demux +scope: "public" +description: | + Freebayes is a Bayesian genetic variant detector designed to + find small polymorphisms, specifically SNPs. +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--bam" + type: file + description: Add FILE to the set of BAM files to be analyzed. + - name: "--bam_list" + type: file + description: A file containing a list of BAM files to be analyzed. + - name: "--stdin" + type: boolean_true + description: Read BAM input on stdin. + - name: "--fasta_reference" + type: file + description: Use FILE as the reference sequence for analysis. An index file (FILE.fai) will be created if none exists. If neither --targets nor --region are specified, FreeBayes will analyze every position in this reference. + - name: "--fasta_reference_index" + type: file + description: Use FILE.fai as the index of reference sequence for analysis. + - name: "--targets" + type: file + description: Limit analysis to targets listed in the BED-format FILE. + - name: "--region" + type: string + description: Limit analysis to the specified region, 0-base coordinates, end_position not included (same as BED format). + - name: "--samples" + type: file + description: Limit analysis to samples listed (one per line) in the FILE. By default FreeBayes will analyze all samples in its input BAM files. + - name: "--populations" + type: file + description: Each line of FILE should list a sample and a population which it is part of. The population-based bayesian inference model will then be partitioned on the basis of the populations. + - name: "--cnv_map" + type: file + description: Read a copy number map from the BED file FILE, which has either a sample-level ploidy or a region-specific format. + - name: "--gvcf" + type: boolean_true + description: Write gVCF output, which indicates coverage in uncalled regions. + - name: "--gvcf_chunk" + type: integer + description: When writing gVCF output emit a record for every NUM bases. + - name: "--variant_input" + type: file + description: Use variants reported in VCF file as input to the algorithm. Variants in this file will included in the output even if there is not enough support in the data to pass input filters. + - name: "--only_use_input_alleles" + type: boolean_true + description: Only provide variant calls and genotype likelihoods for sites and alleles which are provided in the VCF input, and provide output in the VCF for all input alleles, not just those which have support in the data. + - name: "--haplotype_basis_alleles" + type: file + description: When specified, only variant alleles provided in this input VCF will be used for the construction of complex or haplotype alleles. + - name: "--report_all_haplotype_alleles" + type: boolean_true + description: At sites where genotypes are made over haplotype alleles, provide information about all alleles in output, not only those which are called. + - name: "--report_monomorphic" + type: boolean_true + description: Report even loci which appear to be monomorphic, and report all considered alleles, even those which are not in called genotypes. + - name: "--pvar" + type: double + default: 0.0 + description: Report sites if the probability that there is a polymorphism at the site is greater than N. Note that post-filtering is generally recommended over the use of this parameter. + - name: "--strict_vcf" + type: boolean_true + description: Generate strict VCF format (FORMAT/GQ will be an int). + - name: "--theta" + type: double + default: 0.001 + description: The expected mutation rate or pairwise nucleotide diversity among the population under analysis. This serves as the single parameter to the Ewens Sampling Formula prior model. + - name: "--ploidy" + type: integer + default: 2 + description: Sets the default ploidy for the analysis to N. + - name: "--pooled_discrete" + type: boolean_true + description: Assume that samples result from pooled sequencing. Model pooled samples using discrete genotypes across pools. + - name: "--pooled_continuous" + type: boolean_true + description: Output all alleles which pass input filters, regardles of genotyping outcome or model. + - name: "--use_reference_allele" + type: boolean_true + description: This flag includes the reference allele in the analysis as if it is another sample from the same population. + - name: "--reference_quality" + type: string + description: Assign mapping quality of MQ to the reference allele at each site and base quality of BQ. + default: "100,60" + - name: "--throw_away_snp_obs" + type: boolean_true + description: Ignore SNP alleles. + - name: "--throw_away_mnps_obs" + type: boolean_false + description: Ignore multi-nuceotide polymorphisms, MNPs. MNPs are excluded as default. + - name: "--throw_away_indel_obs" + type: boolean_false + description: Ignore insertion and deletion alleles. Indels are excluded as default. + - name: "--throw_away_complex_obs" + type: boolean_false + description: Ignore complex events (composites of other classes). Complex are excluded as default + - name: "--use_best_n_alleles" + type: integer + default: 0 + description: Evaluate only the best N SNP alleles, ranked by sum of supporting quality scores. + - name: "--max_complex_gap" + type: integer + default: 3 + description: Allow haplotype calls with contiguous embedded matches of up to this length. + - name: "--min_repeat_size" + type: integer + default: 5 + description: When assembling observations across repeats, require the total repeat length at least this many bp. + - name: "--min_repeat_entropy" + type: integer + default: 1 + description: To detect interrupted repeats, build across sequence until it has entropy > N bits per bp. Set to 0 to turn off. + - name: "--no_partial_observations" + type: boolean_true + description: Exclude observations which do not fully span the dynamically-determined detection window. (default, use all observations, dividing partial support across matching haplotypes when generating haplotypes.) + - name: "--dont_left_align_indels" + type: boolean_true + description: Turn off left-alignment of indels, which is enabled by default. + - name: "--use_duplicate_reads" + type: boolean_true + description: "Include duplicate-marked alignments in the analysis. default: exclude duplicates marked as such in alignments" + - name: "--min_mapping_quality" + type: integer + default: 1 + description: Exclude alignments from analysis if they have a mapping quality less than Q. + - name: "--min_base_quality" + type: integer + default: 1 + description: Exclude alleles from analysis if their supporting base quality is less than Q. Default value is changed according to the instruction of scSplit. + - name: "--min_supporting_allele_qsum" + type: integer + default: 0 + description: Consider any allele in which the sum of qualities of supporting observations is at least Q. + - name: "--min_supporting_mapping_qsum" + type: integer + default: 0 + description: Consider any allele in which and the sum of mapping qualities of supporting reads is at least. + - name: "--mismatch_base_quality_threshold" + type: integer + default: 10 + description: Count mismatches toward --read-mismatch-limit if the base quality of the mismatch is >= Q. + - name: "--read_max_mismatch_fraction" + type: double + default: 1.0 + description: Exclude reads with more than N mismatches where each mismatch has base quality >= mismatch-base-quality-threshold. + - name: "--read_mismatch_limit" + type: integer + description: Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality >= mismatch-base-quality-threshold. + - name: "--read_snp_limit" + type: integer + description: Exclude reads with more than N base mismatches, ignoring gaps with quality >= mismatch-base-quality-threshold. + - name: "--read_indel_limit" + type: integer + description: Exclude reads with more than N separate gaps. + - name: "--standard_filters" + type: boolean_true + description: Use stringent input base and mapping quality filters, equivalent to -m 30 -q 20 -R 0 -S 0 + - name: "--min_alternate_fraction" + type: double + default: 0.05 + description: Require at least this fraction of observations supporting an alternate allele within a single individual in order to evaluate the position. + - name: "--min_alternate_count" + type: integer + default: 2 + description: Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position. + - name: "--min_alternate_qsum" + type: integer + default: 0 + description: Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position. + - name: "--min_alternate_total" + type: integer + default: 1 + description: Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis. + - name: "--min_coverage" + type: integer + default: 0 + description: Require at least this coverage to process a site. + - name: "--max_coverage" + type: integer + description: Do not process sites with greater than this coverage. + - name: "--no_population_priors" + type: boolean_true + description: Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens Sampling Formula component of priors. + - name: "--hwe_priors_off" + type: boolean_true + description: Disable estimation of the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency. + - name: "--binomial_obs_priors_off" + type: boolean_true + description: Disable incorporation of prior expectations about observations. Uses read placement probability, strand balance probability, and read position probability. + - name: "--allele_balance_priors_off" + type: boolean_true + description: Disable use of aggregate probability of observation balance between alleles as a component of the priors. + - name: "--observation_bias" + type: file + description: Read length-dependent allele observation biases from FILE. The format is [length] [alignment efficiency relative to reference] where the efficiency is 1 if there is no relative observation bias. + - name: "--base_quality_cap" + type: integer + description: Limit estimated observation quality by capping base quality at Q. + - name: "--prob_contamination" + type: double + default: 10e-9 + description: An estimate of contamination to use for all samples. + - name: "--legacy_gls" + type: boolean_true + description: Use legacy (polybayes equivalent) genotype likelihood calculations + - name: "--contamination_estimates" + type: file + description: A file containing per-sample estimates of contamination, such as those generated by VerifyBamID. + - name: "--report_genotype_likelihood_max" + type: boolean_true + description: Report genotypes using the maximum-likelihood estimate provided from genotype likelihoods. + - name: "--genotyping_max_iterations" + type: integer + default: 1000 + description: Iterate no more than N times during genotyping step. + - name: "--genotyping_max_banddepth" + type: integer + default: 6 + description: Integrate no deeper than the Nth best genotype by likelihood when genotyping. + - name: "--posterior_integration_limits" + type: string + default: 1,3 + description: Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood. + - name: "--exclude_unobserved_genotypes" + type: boolean_true + description: Skip sample genotypings for which the sample has no supporting reads. + - name: "--genotype_variant_threshold" + type: integer + description: Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample. + - name: "--use_mapping_quality" + type: boolean_true + description: Use mapping quality of alleles when calculating data likelihoods. + - name: "--harmonic_indel_quality" + type: boolean_true + description: Use a weighted sum of base qualities around an indel, scaled by the + distance from the indel. By default use a minimum BQ in flanking sequence. + - name: "--read_dependence_factor" + type: double + default: 0.9 + description: Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations. + - name: "--genotype_qualities" + type: boolean_true + description: Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output. + - name: "--debug" + type: boolean_true + description: Print debugging output. + - name: "--dd" + type: boolean_true + description: Print more verbose debugging output +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "freebayes_out/" + - name: "--vcf" + type: string + description: Output VCF-format results to FILE. + example: "snp.vcf" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + - path: ../../../resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: freebayes + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/genetic_demux/freebayes/script.sh b/src/genetic_demux/freebayes/script.sh new file mode 100755 index 00000000..3a8d0e02 --- /dev/null +++ b/src/genetic_demux/freebayes/script.sh @@ -0,0 +1,91 @@ +#!/bin/bash +set -eo pipefail + +# Unset boolean flags if their values are not 'true' +for flag in par_stdin par_gvcf par_only_use_input_alleles par_report_all_haplotype_alleles par_report_monomorphic par_strict_vcf \ + par_pooled_discrete par_pooled_continuous par_use_reference_allele par_throw_away_snp_obs par_throw_away_indel_obs par_throw_away_mnps_obs par_throw_away_complex_obs \ + par_no_partial_observations par_dont_left_align_indels par_use_duplicate_reads par_standard_filters par_no_population_priors \ + par_hwe_priors_off par_binomial_obs_priors_off par_allele_balance_priors_off par_legacy_gls par_report_genotype_likelihood_max \ + par_exclude_unobserved_genotypes par_use_mapping_quality par_harmonic_indel_quality par_genotype_qualities par_debug par_dd; do + [[ "${!flag}" != "true" ]] && unset "$flag" +done + +# Create output directory if it doesn't exist +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi + +freebayes \ + --fasta-reference $par_fasta_reference \ + --pvar $par_pvar \ + --theta $par_theta \ + --ploidy $par_ploidy \ + --min-repeat-entropy $par_min_repeat_entropy \ + --reference-quality $par_reference_quality \ + --use-best-n-alleles $par_use_best_n_alleles \ + --max-complex-gap $par_max_complex_gap \ + --min-repeat-size $par_min_repeat_size \ + --min-mapping-quality $par_min_mapping_quality \ + --min-base-quality $par_min_base_quality \ + --min-supporting-allele-qsum $par_min_supporting_allele_qsum \ + --min-supporting-mapping-qsum $par_min_supporting_mapping_qsum \ + --mismatch-base-quality-threshold $par_mismatch_base_quality_threshold \ + --read-max-mismatch-fraction $par_read_max_mismatch_fraction \ + --min-alternate-fraction $par_min_alternate_fraction \ + --min-alternate-count $par_min_alternate_count \ + --min-alternate-qsum $par_min_alternate_qsum \ + --min-alternate-total $par_min_alternate_total \ + --min-coverage $par_min_coverage \ + --genotyping-max-iterations $par_genotyping_max_iterations \ + --genotyping-max-banddepth $par_genotyping_max_banddepth \ + --posterior-integration-limits $par_posterior_integration_limits \ + --read-dependence-factor $par_read_dependence_factor \ + --vcf ${par_output}/${par_vcf} \ + ${par_bam:+--bam $par_bam} \ + ${par_bam_list:+--bam-list $par_bam_list} \ + ${par_stdin:+--stdin} \ + ${par_targets:+--targets $par_targets} \ + ${par_region:+--region $par_region} \ + ${par_max_coverage:+--max-coverage $par_max_coverage} \ + ${par_samples:+--samples $par_samples} \ + ${par_populations:+--populations $par_populations} \ + ${par_cnv_map:+--cnv-map $par_cnv_map} \ + ${par_gvcf:+--gvcf} \ + ${par_gvcf_chunk:+--gvcf-chunk $par_gvcf_chunk} \ + ${par_variant_input:+--variant-input $par_variant_input} \ + ${par_only_use_input_alleles:+--only-use-input-alleles} \ + ${par_haplotype_basis_alleles:+--haplotype-basis-alleles $par_haplotype_basis_alleles} \ + ${par_report_all_haplotype_alleles:+--report-all-haplotype-alleles} \ + ${par_report_monomorphic:+--report-monomorphic} \ + ${par_strict_vcf:+--strict-vcf} \ + ${par_pooled_discrete:+--pooled-discrete} \ + ${par_pooled_continuous:+--pooled-continuous} \ + ${par_use_reference_allele:+--use-reference-allele} \ + ${par_throw_away_snp_obs:+--throw-away-snp-obs} \ + ${par_throw_away_indel_obs:+--throw-away-indel-obs} \ + ${par_throw_away_mnps_obs:+--throw-away-mnps-obs} \ + ${par_throw_away_complex_obs:+--throw-away-complex-obs} \ + ${par_no_partial_observations:+--no-partial-observations} \ + ${par_dont_left_align_indels:+--dont-left-align-indels} \ + ${par_use_duplicate_reads:+--use-duplicate-reads} \ + ${par_standard_filters:+--standard-filters} \ + ${par_no_population_priors:+--no-population-priors} \ + ${par_hwe_priors_off:+--hwe-priors-off} \ + ${par_binomial_obs_priors_off:+--binomial-obs-priors-off} \ + ${par_allele_balance_priors_off:+--allele-balance-priors-off} \ + ${par_legacy_gls:+--legacy-gls} \ + ${par_report_genotype_likelihood_max:+--report-genotype-likelihood-max} \ + ${par_exclude_unobserved_genotypes:+--exclude-unobserved-genotypes} \ + ${par_use_mapping_quality:+--use-mapping-quality} \ + ${par_harmonic_indel_quality:+--harmonic-indel-quality} \ + ${par_genotype_qualities:+--genotype-qualities} \ + ${par_debug:+--debug} \ + ${par_dd:+-dd} \ + ${par_observation_bias:+--observation-bias $par_observation_bias} \ + ${par_read_mismatch_limit:+--read-mismatch-limit $par_read_mismatch_limit} \ + ${par_read_snp_limit:+--read-snp-limit $par_read_snp_limit} \ + ${par_read_indel_limit:+--read-indel-limit $par_read_indel_limit} \ + ${par_base_quality_cap:+--base-quality-cap $par_base_quality_cap} \ + ${par_prob_contamination:+--prob-contamination $par_prob_contamination} \ + ${par_contamination_estimates:+--contamination-estimates $par_contamination_estimates} \ + ${par_genotype_variant_threshold:+--genotype-variant-threshold $par_genotype_variant_threshold} diff --git a/src/genetic_demux/freebayes/test.sh b/src/genetic_demux/freebayes/test.sh new file mode 100644 index 00000000..78483761 --- /dev/null +++ b/src/genetic_demux/freebayes/test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --bam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --fasta_reference "$meta_resources_dir/cellranger_tiny_fastq/cellranger_tiny_ref/fasta/genome.fa" \ + --output freebayes_result/ \ + --region chr21.part \ + --vcf mixed_variant.vcf + +[[ ! -f freebayes_result/mixed_variant.vcf ]] && echo "Output VCF file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/freemuxlet/config.vsh.yaml b/src/genetic_demux/freemuxlet/config.vsh.yaml new file mode 100644 index 00000000..acd463ef --- /dev/null +++ b/src/genetic_demux/freemuxlet/config.vsh.yaml @@ -0,0 +1,124 @@ +name: freemuxlet +namespace: genetic_demux +scope: "public" +description: | + Freemuxlet is a software tool to deconvolute sample identity and identify multiplets when + multiple samples are pooled by barcoded single cell sequencing. If external genotyping + data is not available, the genotyping-free version demuxlet, freemuxlet, would be recommended. +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--plp" + type: string + description: "Prefix of input files generated by dsc-pileup" + - name: "--init_cluster" + type: file + description: "Input file containing the initial cluster information." + - name: "--nsample" + type: integer + default: 2 + description: "Number of samples multiplexed together" + - name: "--aux_files" + type: boolean_true + description: "Turn on writing auxilary output files" + - name: "--verbose" + type: integer + default: 100 + description: "Turn on verbose mode with specific verbosity threshold. 0: fully verbose, 100 : no verbose messages." + - name: "--doublet_prior" + type: double + default: 0.5 + description: "Prior of doublet." + - name: "--geno_error" + type: double + default: 0.1 + description: "Genotype error parameter per cluster." + - name: "--bf_thres" + type: double + default: 5.41 + description: "Bayes Factor Threshold used in the initial clustering." + - name: "--frac_init_clust" + type: double + default: 1 + description: "Fraction of droplets to be clustered in the very first round of initial clustering procedure." + - name: "--iter_init" + type: integer + default: 10 + description: "Iteration for initial cluster assignment (set to zero to skip the iterations)." + - name: "--keep_init_missing" + type: boolean_true + description: "Keep missing cluster assignment as missing in the initial iteration." + - name: "--randomize_singlet_score" + type: boolean_true + description: "Randomize the singlet scores to test its effect." + - name: "--seed" + type: integer + default: 0 + description: "Seed for random number (use clocks if not set)." + - name: "--cap_bq" + type: integer + default: 20 + description: "Maximum base quality (higher BQ will be capped)." + - name: "--min_bq" + type: integer + default: 13 + description: "Minimum base quality to consider (lower BQ will be skipped)." + - name: "--group_list" + type: string + description: "List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run." + - name: "--min_total" + type: integer + default: 0 + description: "Minimum number of total reads for a droplet/cell to be considered." + - name: "--min_umi" + type: integer + default: 0 + description: "Minimum number of UMIs for a droplet/cell to be considered." + - name: "--min_snp" + type: integer + default: 0 + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "freemux/" + - name: "--out" + type: string + description: freemuxlet Output file prefix + example: "freemuxlet" +resources: + - type: r_script + path: script.R + - path: freemuxlet.patch +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + copy: ["freemuxlet.patch /opt/freemuxlet.patch"] + - type: apt + packages: [ autoconf, wget, git, build-essential, libcurl4-openssl-dev, cmake, libbz2-dev, libssl-dev, liblzma-dev, zlib1g-dev, r-base] + - type: docker + run: git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install + - type: docker + run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin + - type: r + cran: [ readr, processx, dplyr ] + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/freemuxlet/freemuxlet.patch b/src/genetic_demux/freemuxlet/freemuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/src/genetic_demux/freemuxlet/freemuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/src/genetic_demux/freemuxlet/script.R b/src/genetic_demux/freemuxlet/script.R new file mode 100644 index 00000000..2af4c877 --- /dev/null +++ b/src/genetic_demux/freemuxlet/script.R @@ -0,0 +1,94 @@ +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +par <- list( + sam = "resources_test/demuxafy_test_data/chr_1_pooled.sorted.bam", + vcf = "resources_test/demuxafy_test_data/test_dataset.vcf", + output = "freemuxlet_result", + out = "out", + nsample = "14" +) +## VIASH END + +if (!dir.exists(par$output)) { + dir.create(par$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "freemuxlet", + "--out", paste0(par$output, "/", par$out) +) + +argmap <- c( + "plp" = "--plp", + "init_cluster" = "--init-cluster", + "nsample" = "--nsample", + "verbose" = "--verbose", + "doublet_prior" = "--doublet-prior", + "geno_error" = "--geno-error", + "bf_thres" = "--bf-thres", + "frac_init_clust" = "--frac-init-clust", + "iter_init" = "--iter-init", + "seed" = "--seed", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_total" = "--min-total", + "min_umi" = "--min-umi", + "min_snp" = "--min-snp", + "group_list" = "--group-list", + "aux_files" = "--aux-files", + "keep_init_missing" = "--keep-init-missing", + "randomize_singlet_score" = "randomize-singlet-score" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + if (arg %in% c( + "aux_files", "keep_init_missing", + "randomize_singlet_score" + )) { + if (toupper(par[[arg]]) == TRUE) { + cmd <- c(cmd, argmap[[arg]]) + } + } else { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz$status != 0) { + stop("Command failed with status ", zzz$status) +} + +out_file <- paste0(par$output, "/", par$out, ".clust1.samples.gz") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} + +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*)*", "\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*)*", "\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +freemuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + freemuxlet_assign, + paste0(par$output, "/cell_annotation.csv") +) diff --git a/src/genetic_demux/freemuxlet/test.sh b/src/genetic_demux/freemuxlet/test.sh new file mode 100644 index 00000000..7e565e71 --- /dev/null +++ b/src/genetic_demux/freemuxlet/test.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --plp "$meta_resources_dir/demuxafy_test_data/dsc_pileup/dscpileup_out" --output freemuxlet_result/ --out freemux_out --nsample 14 + +[[ ! -f freemuxlet_result/freemux_out.clust1.samples.gz ]] && echo "Output VCF file could not be found!" && exit 1 +[[ ! -f freemuxlet_result/cell_annotation.csv ]] && echo "Output cell type annotation file could not be found!" && exit 1 + + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/samtools/config.vsh.yaml b/src/genetic_demux/samtools/config.vsh.yaml new file mode 100644 index 00000000..7b5ab841 --- /dev/null +++ b/src/genetic_demux/samtools/config.vsh.yaml @@ -0,0 +1,41 @@ +name: samtools +namespace: genetic_demux +scope: "public" +description: Filter the BAM according to the instruction of scSplit via Samtools. +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +arguments: + - name: "--bam" + type: file + required: true + description: "Input bam file for filtering." + - name: "--output" + type: file + direction: output + description: "Samtools output directory." + example: "samtools_out/" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: python:3.12 + setup: + - type: apt + packages: [ wget, gcc, make, libbz2-dev, zlib1g-dev, libncurses5-dev, libncursesw5-dev, liblzma-dev] + - type: docker + run: wget https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2 && tar jxf samtools-1.16.1.tar.bz2 && rm samtools-1.16.1.tar.bz2 && cd samtools-1.16.1 && make prefix=/usr/local install + - type: python + pip: [ umi_tools ] + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/samtools/script.sh b/src/genetic_demux/samtools/script.sh new file mode 100644 index 00000000..9dcf2d9d --- /dev/null +++ b/src/genetic_demux/samtools/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi +samtools view -S -b -q 10 -F 3844 "$par_bam" > "${par_output}/filtered.bam" +cd $par_output +samtools index filtered.bam filtered.bam.bai +umi_tools dedup --stdin=filtered.bam --extract-umi-method=tag --umi-tag=UR --cell-tag=CB --log=logfile > no_dup.bam +samtools sort no_dup.bam -o sorted.bam +samtools index sorted.bam sorted.bam.bai diff --git a/src/genetic_demux/samtools/test.sh b/src/genetic_demux/samtools/test.sh new file mode 100644 index 00000000..f7b9c9d2 --- /dev/null +++ b/src/genetic_demux/samtools/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --bam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --output samtools_result/ + +[[ ! -f samtools_result/sorted.bam ]] && echo "Output preprocessed BAM file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/scsplit/config.vsh.yaml b/src/genetic_demux/scsplit/config.vsh.yaml new file mode 100644 index 00000000..9d9b450b --- /dev/null +++ b/src/genetic_demux/scsplit/config.vsh.yaml @@ -0,0 +1,94 @@ +name: scsplit +namespace: genetic_demux +scope: "public" +description: "scsplit is a genotype-free demultiplexing methode of pooled single-cell RNA-seq, using a hidden state model for identifying genetically distinct samples within a mixed population." +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--vcf" + type: file + description: VCF from mixed BAM + - name: "--bam" + type: file + description: mixed sample BAM + - name: "--bar" + type: file + description: barcodes whitelist + - name: "--tag" + type: string + default: "CB" + description: tag for barcode + - name: "--com" + type: file + description: common SNVs + - name: "--num" + type: integer + description: expected number of mixed samples + - name: "--sub" + type: integer + default: 10 + description: maximum number of subpopulations in autodetect mode + - name: "--ems" + type: integer + default: 30 + description: number of EM repeats to avoid local maximum + - name: "--dbl" + type: double + description: correction for doublets. There will be no refinement on the results if this optional parameter is not specified or specified percentage is less than doublet rates detected during the run. + - name: "--vcf_known" + type: file + description: known individual genotypes to limit distinguishing variants to available variants, so that users do not need to redo genotyping on selected variants, otherwise any variants could be selected as distinguishing variants. + - name: "--geno" + type: boolean_true + description: generate sample genotypes based on the split result. + +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "scSplit_out/" + - name: "--ref" + type: string + description: output Ref count matrix + - name: "--alt" + type: string + description: output Alt count matrix + - name: "--psc" + type: string + description: generated P(S|C) +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: + - type: docker + image: python:3.11 + setup: + - type: python + pip: + - numpy<2 + - pandas<2.0 + - pysam + - setuptools<58 + - scikit-learn==1.1.3 + - scipy + - statistics + - type: python + pip: [ PyVCF ] + - type: docker + run: git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin && rm -rf scSplit +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/genetic_demux/scsplit/script.sh b/src/genetic_demux/scsplit/script.sh new file mode 100644 index 00000000..f539401d --- /dev/null +++ b/src/genetic_demux/scsplit/script.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -eo pipefail + +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi + +scSplit count \ + --vcf $par_vcf \ + --bam $par_bam \ + --bar $par_bar \ + --tag $par_tag \ + --ref $par_ref \ + --alt $par_alt \ + --out $par_output \ + ${par_com:+--com $par_com} + +scSplit run \ + --ref "$par_output/$par_ref" \ + --alt "$par_output/$par_alt" \ + --out $par_output \ + --num $par_num \ + ${par_sub:+--sub $par_sub} \ + ${par_ems:+--ems $par_ems} \ + ${par_dbl:+--dbl $par_dbl} \ + ${par_vcf_known:+--vcf $par_vcf_known} + +if [ "$par_geno" = true ]; then + scSplit genotype \ + --ref "$par_output/$par_ref" \ + --alt "$par_output/$par_alt" \ + --psc "$par_output/$par_psc" \ + "$par_output" +fi + +echo "cell,donor_id" > "$par_output/cell_annotation.csv" +sed -e '1d' -e 's/SNG-//g' "$par_output/scSplit_result.csv" | +sed 's/\t/,/g' | awk 'BEGIN{FS=OFS=","} { if ($2 ~ /^DBL-/) $2 = "doublet"; print }' \ +>> "$par_output/cell_annotation.csv" diff --git a/src/genetic_demux/scsplit/test.sh b/src/genetic_demux/scsplit/test.sh new file mode 100644 index 00000000..d404df34 --- /dev/null +++ b/src/genetic_demux/scsplit/test.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --bam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --vcf "$meta_resources_dir/demuxafy_test_data/mixed_variant.vcf" \ + --num 12 \ + --ems 1 \ + --com "$meta_resources_dir/demuxafy_test_data/test_dataset.vcf" \ + --bar "$meta_resources_dir/demuxafy_test_data/barcodes.tsv" \ + --ref ref_filtered.csv \ + --alt alt_filtered.csv \ + --output scSplit_result/ + +[[ ! -f scSplit_result/scSplit_result.csv ]] && echo "Output donor assignment file could not be found!" && exit 1 +[[ ! -f scSplit_result/cell_annotation.csv ]] && echo "Output cell annotation file as tsv format could not be found!" && exit 1 +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/souporcell/config.vsh.yaml b/src/genetic_demux/souporcell/config.vsh.yaml new file mode 100755 index 00000000..d2de2b11 --- /dev/null +++ b/src/genetic_demux/souporcell/config.vsh.yaml @@ -0,0 +1,84 @@ +name: souporcell +namespace: genetic_demux +scope: "public" +description: "souporcell is a method for clustering mixed-genotype scRNAseq experiments by individual." +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--fasta" + type: file + description: reference fasta file + - name: "--bam" + type: file + description: cellranger bam + - name: "--bam_index" + type: file + description: cellranger bam index + - name: "--barcodes" + type: file + description: barcodes.tsv from cellranger + - name: "--clusters" + type: integer + description: number cluster, tbd add easy way to run on a range of k + - name: "--ploidy" + type: integer + default: 2 + description: ploidy, must be 1 or 2 + - name: "--min_alt" + type: integer + default: 10 + description: min alt to use locus + - name: "--min_ref" + type: integer + default: 10 + description: min ref to use locus + - name: "--max_loci" + type: integer + default: 2048 + description: max loci per cell, affects speed + - name: "--restarts" + type: integer + description: number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima + - name: "--common_variants" + type: file + description: common variant loci or known variant loci vcf, must be vs same reference fasta + - name: "--known_genotypes" + type: file + description: known variants per clone in population vcf mode, must be .vcf right now we dont accept gzip or bcf sorry + - name: "--known_genotypes_sample_names" + type: string + description: which samples in population vcf from known genotypes option represent the donors in your sample + - name: "--skip_remap" + type: boolean_true + description: dont remap with minimap2 (not recommended unless in conjunction with --common_variants + - name: "--ignore" + type: boolean_true + description: set to True to ignore data error assertions +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: name of directory to place souporcell files + example: "souporcell_out" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/demuxafy_test_data + +engines: +- type: docker + image: cumulusprod/souporcell:2022.12 + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/genetic_demux/souporcell/script.sh b/src/genetic_demux/souporcell/script.sh new file mode 100755 index 00000000..32844d1f --- /dev/null +++ b/src/genetic_demux/souporcell/script.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "$par_skip_remap" == "false" ]] && unset par_skip_remap +[[ "$par_ignore" == "false" ]] && unset par_ignore + +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi + +/opt/souporcell/souporcell_pipeline.py \ + --bam $par_bam \ + --fasta $par_fasta \ + --barcodes $par_barcodes \ + --clusters $par_clusters \ + --ploidy $par_ploidy \ + --min_alt $par_min_alt \ + --min_ref $par_min_ref \ + --max_loci $par_max_loci \ + --out_dir $par_output \ + --threads ${par_threads:=1} \ + ${par_restarts:+--restarts $par_restarts} \ + ${par_common_variants:+--common_variants $par_common_variants} \ + ${par_known_genotypes:+--known_genotypes $par_known_genotypes} \ + ${par_known_genotypes_sample_names:+--known_genotypes_sample_names $par_known_genotypes_sample_names} \ + ${par_skip_remap:+--skip_remap True} \ + ${par_ignore:+--ignore True} + +cut -d$'\t' -f 1-3 "$par_output/clusters.tsv" | +sed 's/\t/,/g' | +awk 'BEGIN{FS=OFS=","} {$2=($2=="singlet")?$3:$2; NF=NF-1; print}' | +sed '1s/barcode,status/cell,donor_id/' > "$par_output/cell_annotation.csv" \ No newline at end of file diff --git a/src/genetic_demux/souporcell/test.sh b/src/genetic_demux/souporcell/test.sh new file mode 100644 index 00000000..7661920a --- /dev/null +++ b/src/genetic_demux/souporcell/test.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --bam "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam" \ + --bam_index "$meta_resources_dir/demuxafy_test_data/chr_1_pooled.sorted.bam.bai" \ + --fasta "$meta_resources_dir/demuxafy_test_data/genome_chr1.fa" \ + --barcodes "$meta_resources_dir/demuxafy_test_data/barcodes.tsv" \ + --common_variants "$meta_resources_dir/demuxafy_test_data/test_dataset.vcf" \ + --clusters 14 \ + --skip_remap \ + --output soup_result/ + +[[ ! -f soup_result/clusters.tsv ]] && echo "Output donor assignment file could not be found!" && exit 1 +[[ ! -f soup_result/cell_annotation.csv ]] && echo "Output cell annotation file could not be found!" && exit 1 + +echo ">>> Test finished successfully" diff --git a/src/genetic_demux/vireo/config.vsh.yaml b/src/genetic_demux/vireo/config.vsh.yaml new file mode 100644 index 00000000..6c88c302 --- /dev/null +++ b/src/genetic_demux/vireo/config.vsh.yaml @@ -0,0 +1,93 @@ +name: vireo +namespace: genetic_demux +scope: "public" +description: "Vireo is primarily designed for demultiplexing cells into donors by modelling of expressed alleles." +authors: + - __merge__: /src/authors/xichen_wu.yaml + roles: [ author ] +argument_groups: +- name: "Input" + arguments: + - name: "--cell_data" + type: file + description: The cell genotype file in VCF format or cellSNP folder with sparse matrices. + - name: "--n_donor" + type: integer + default: 2 + description: Number of donors to demultiplex; can be larger than provided in donor_file. + - name: "--vartrix_data" + type: file + description: The cell genotype files in vartrix outputs. + - name: "--donor_file" + type: file + description: The donor genotype file in VCF format. Please filter the sample and region with bcftools first! + - name: "--geno_tag" + type: string + default: "PL" + description: The tag for donor genotype. + choices: ["GT", "GP", "PL"] + - name: "--no_doublet" + type: boolean + default: false + description: If use, not checking doublets. + - name: "--n_init" + type: integer + default: 50 + description: Number of random initializations, when GT needs to learn. + - name: "--extra_donor" + type: integer + default: 0 + description: Number of extra donor in pre-cluster, when GT needs to learn. + - name: "--extra_donorMode" + type: string + description: "Method for searching from extra donors. size: n_cell per donor; distance: GT distance between donors" + - name: "--force_learn_gt" + type: boolean + default: false + description: If use, treat donor GT as prior only. + - name: "--ase_mode" + type: boolean + default: false + description: If use, turn on SNP specific allelic ratio. + - name: "--no_plot" + type: boolean + default: false + description: If use, turn off plotting GT distance. + - name: "--rand_seed" + type: integer + description: Seed for random initialization + - name: "--cell_range" + type: string + description: Range of cells to process. + - name: "--call_ambient_rnas" + type: boolean + default: false + description: If use, detect ambient RNAs in each cell. +- name: "Output" + arguments: + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + description: Output directory + example: "vireo/" +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: ../../../resources_test/vireo_test_data + +engines: +- type: docker + image: python:3.10 + setup: + - type: python + pip: [ threadpoolctl, vireoSNP ] + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/genetic_demux/vireo/script.sh b/src/genetic_demux/vireo/script.sh new file mode 100644 index 00000000..3b3c8526 --- /dev/null +++ b/src/genetic_demux/vireo/script.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "$par_no_doublet" == "false" ]] && unset par_no_doublet +[[ "$par_force_learn_gt" == "false" ]] && unset par_force_learn_gt +[[ "$par_ase_mode" == "false" ]] && unset par_ase_mode +[[ "$par_no_plot" == "false" ]] && unset par_no_plot +[[ "$par_call_ambient_rnas" == "false" ]] && unset par_call_ambient_rnas + +if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" +fi + +vireo \ + --cellData $par_cell_data \ + --nDonor $par_n_donor \ + --genoTag $par_geno_tag \ + --nInit $par_n_init \ + --extraDonor $par_extra_donor \ + --out "${par_output}" \ + ${par_vartrix_data:+--vatrixData $par_vartrix_data} \ + ${par_donor_file:+--donorFile $par_donor_file} \ + ${par_no_doublet:+--noDoublet} \ + ${par_extra_donorMode:+--extraDonorMode $par_extra_donorMode} \ + ${par_force_learn_gt:+--forceLearnGT} \ + ${par_ase_mode:+--ASEmode} \ + ${par_no_plot:+--noPlot} \ + ${par_rand_seed:+--randSeed $par_rand_seed} \ + ${par_cell_range:+--cellRange $par_cell_range} \ + ${par_call_ambient_rnas:+--callAmbientRNAs} \ + ${meta_cpus:+--nproc $meta_cpus} + +cut -d$'\t' -f 1-2 "$par_output/donor_ids.tsv" | tr '\t' ',' > "$par_output/cell_annotation.csv" +awk 'BEGIN{FS=OFS=","} NR>1{ gsub("donor", "", $2) } 1' "$par_output/cell_annotation.csv" > "$par_output/cell_annotation_temp.csv" && mv "$par_output/cell_annotation_temp.csv" "$par_output/cell_annotation.csv" diff --git a/src/genetic_demux/vireo/test.sh b/src/genetic_demux/vireo/test.sh new file mode 100644 index 00000000..87c81b2a --- /dev/null +++ b/src/genetic_demux/vireo/test.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -ex + +echo ">>> Running executable" +$meta_executable \ + --cell_data "$meta_resources_dir/vireo_test_data/cells.cellSNP.vcf.gz" \ + --n_donor 4 \ + --output vireo_result/ + +[[ ! -f vireo_result/GT_donors.vireo.vcf.gz ]] && echo "Output donor genotype file could not be found!" && exit 1 +[[ ! -f vireo_result/cell_annotation.csv ]] && echo "Output cell annotation csv could not be found!" && exit 1 +echo ">>> Test finished successfully" diff --git a/src/integrate/harmony/config.vsh.yaml b/src/integrate/harmony/config.vsh.yaml new file mode 100644 index 00000000..d3370cda --- /dev/null +++ b/src/integrate/harmony/config.vsh.yaml @@ -0,0 +1,74 @@ +name: harmony +namespace: "integrate" +scope: "public" +status: disabled +description: "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony." +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, author ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--modality" + type: string + default: "rna" + required: false + - name: "--obsm_input" + type: string + default: "X_pca" + required: false + description: "Which .obsm slot to use as a starting PCA embedding." + - name: "--obsm_output" + type: string + default: "X_pca_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--theta" + description: "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + type: double + default: 2 + multiple: true + - name: "--obs_covariates" + type: string + description: "The .obs field(s) that define the covariate(s) to regress out." + example: ["batch", "sample"] + multiple: true +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: r_script + path: script.R + +test_resources: + - type: python_script + path: ../harmonypy/test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: ghcr.io/data-intuitive/randpy:r4.0_py3.10 + setup: + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + - type: r + cran: + - harmony + - anndata + - reticulate + +runners: +- type: executable +- type: nextflow + directives: + label: [highmem, highcpu, highdisk] diff --git a/src/integrate/harmony/script.R b/src/integrate/harmony/script.R new file mode 100644 index 00000000..be474588 --- /dev/null +++ b/src/integrate/harmony/script.R @@ -0,0 +1,37 @@ +library(reticulate) +library(harmony) +library(anndata) + +mudata <- reticulate::import("mudata") + +### VIASH START +par <- list( + input = "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + output = "foo.h5mu", + modality = "rna", + obsm_input = "X_pca", + obsm_output = "X_pca_int", + theta = 2, + obs_covariates = c("leiden") +) +### VIASH END + +data <- mudata$read_h5mu(par$input) +rna_data <- data$mod[[par$modality]] + +## Run Harmony +pca_in <- rna_data$obsm[[par$obsm_input]] +meta_data <- rna_data$obs +harmony_embedding <- HarmonyMatrix( + data_mat = pca_in, + meta_data = meta_data, + vars_use = par$obs_covariates, + theta = par$theta, + do_pca = FALSE +) + +## Add Harmony embeddings to Anndata +rna_data$obsm[[par$obsm_output]] <- harmony_embedding + +## Save as h5mu +data$write(par$output, compression = par$output_compression) diff --git a/src/integrate/harmonypy/config.vsh.yaml b/src/integrate/harmonypy/config.vsh.yaml new file mode 100644 index 00000000..0b6bc07c --- /dev/null +++ b/src/integrate/harmonypy/config.vsh.yaml @@ -0,0 +1,79 @@ +name: harmonypy +namespace: "integrate" +scope: "public" +description: "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony. + Based on an implementation in python from https://github.com/slowkow/harmonypy" +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obsm_input" + type: string + default: "X_pca" + required: false + description: "Which .obsm slot to use as a starting PCA embedding." + - name: "--obsm_output" + type: string + default: "X_pca_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--theta" + description: "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + type: double + default: 2 + multiple: true + - name: "--obs_covariates" + type: string + description: "The .obs field(s) that define the covariate(s) to regress out." + example: ["batch", "sample"] + required: true + multiple: true +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - harmonypy~=0.0.6 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [highmem, highcpu, highdisk] diff --git a/src/integrate/harmonypy/script.py b/src/integrate/harmonypy/script.py new file mode 100644 index 00000000..ab12dee0 --- /dev/null +++ b/src/integrate/harmonypy/script.py @@ -0,0 +1,51 @@ +import mudata +import sys +from harmonypy import run_harmony + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "foo.h5mu", + "compression": "gzip", + "modality": "rna", + "obsm_input": "X_pca", + "obsm_output": "X_pca_harmonypy", + "theta": 2, + "obs_covariates": ["batch"], +} +### VIASH END + + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s from %s", par["modality"], par["input"]) + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + logger.info("Selecting embedding %s", par["obsm_input"]) + pca_embedding = mod.obsm[par["obsm_input"]] + metadata = mod.obs + logger.info( + "Available covariates to regress out: ", ", ".join(metadata.columns.to_list()) + ) + logger.info( + "Running harmonypy with theta %s to regress out the following variables: ", + par["theta"], + ", ".join(par["obs_covariates"]), + ) + ho = run_harmony(pca_embedding, metadata, par["obs_covariates"], theta=par["theta"]) + logger.info("Adding output to slot %s", par["obsm_output"]) + mod.obsm[par["obsm_output"]] = ho.Z_corr.T + logger.info("Writing to %s") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() diff --git a/src/integrate/harmonypy/test.py b/src/integrate/harmonypy/test.py new file mode 100644 index 00000000..658a382f --- /dev/null +++ b/src/integrate/harmonypy/test.py @@ -0,0 +1,55 @@ +import sys +import pytest +import mudata +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/integrate/harmonypy/harmonypy", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "src/integrate/harmony/config.vsh.yaml", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + + +def test_harmonypy(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--obsm_input", + "X_pca", + "--obsm_output", + "X_pca_int", + "--obs_covariates", + "harmony_integration_leiden_1.0", + "--output", + output_path, + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + + # check output + output_data = mudata.read_h5mu(output_path) + assert "X_pca_int" in output_data.mod["rna"].obsm + assert ( + output_data.mod["rna"].obsm["X_pca_int"].shape + == output_data.mod["rna"].obsm["X_pca"].shape + ) + # Delete output slots in order to tests for equality with + # input data + del output_data["rna"].obsm["X_pca_int"] + assert_annotation_objects_equal(input_file, output_data) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scanorama/config.vsh.yaml b/src/integrate/scanorama/config.vsh.yaml new file mode 100644 index 00000000..9db3a3ff --- /dev/null +++ b/src/integrate/scanorama/config.vsh.yaml @@ -0,0 +1,92 @@ +name: scanorama +namespace: "integrate" +scope: "public" +description: | + Use Scanorama to integrate different experiments. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: [-i] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--output" + alternatives: ["-o"] + type: file + description: Output .h5mu file + direction: output + required: true + default: "output.h5ad" + - name: "--obs_batch" + type: string + description: Column name discriminating between your batches. + default: "batch" + - name: "--obsm_input" + type: string + description: Basis obsm slot to run scanorama on. + default: "X_pca" + - name: "--obsm_output" + type: string + description: The name of the field in adata.obsm where the integrated embeddings will be stored after running this function. Defaults to X_scanorama. + default: "X_scanorama" + - name: "--knn" + type: integer + description: "Number of nearest neighbors to use for matching." + default: 20 + - name: "--batch_size" + type: integer + description: "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory." + default: 5000 + - name: "--sigma" + type: double + description: "Correction smoothing parameter on Gaussian kernel." + default: 15 + - name: "--approx" + type: boolean + description: "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime." + default: True + - name: "--alpha" + type: double + description: "Alignment score minimum cutoff" + default: 0.1 +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - build-essential + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - scanorama + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [midcpu, highmem, highdisk] diff --git a/src/integrate/scanorama/script.py b/src/integrate/scanorama/script.py new file mode 100644 index 00000000..54ea7c96 --- /dev/null +++ b/src/integrate/scanorama/script.py @@ -0,0 +1,47 @@ +import sys +from scanpy.external.pp import scanorama_integrate +from mudata import read_h5ad + + +### VIASH START +par = {} +meta = {"resources_dir": "src/utils/"} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input %s", par["modality"], par["input"]) +mod = read_h5ad(par["input"], mod=par["modality"]) + +arguments = { + "key": par["obs_batch"], + "basis": par["obsm_input"], + "adjusted_basis": par["obsm_output"], + "knn": par["knn"], + "alpha": par["alpha"], + "sigma": par["sigma"], + "approx": par["approx"], + "batch_size": par["batch_size"], +} + +logger.info( + "Running scanorama with parameters: \n", + "\n\t".join( + [ + f"{argument_name}: {argument_value}" + for argument_name, argument_value in arguments.items() + ] + ), +) +# Integration. +scanorama_integrate(mod, **arguments) +logger.info("Writing output to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] +) + +logger.info("Finished!") diff --git a/src/integrate/scanorama/test.py b/src/integrate/scanorama/test.py new file mode 100644 index 00000000..dbe67d9d --- /dev/null +++ b/src/integrate/scanorama/test.py @@ -0,0 +1,93 @@ +import sys +import pytest +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/docker/integrate/scanorama/scanorama", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "src/integrate/scanorama/config.vsh.yaml", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture +def input_with_batch(random_h5mu_path): + tmp_input_path = random_h5mu_path() + + input_data = read_h5mu(input_file) + mod = input_data.mod["rna"] + number_of_obs = mod.n_obs + mod.obs["batch"] = "A" + column_index = mod.obs.columns.get_indexer(["batch"]) + mod.obs.iloc[slice(number_of_obs // 2, None), column_index] = "B" + input_data.write(tmp_input_path) + + return tmp_input_path, input_data + + +def test_simple_integration(run_component, input_with_batch, random_h5mu_path): + tmp_input_path, _ = input_with_batch + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + tmp_input_path, + "--output", + output_path, + "--obs_batch", + "batch", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + + # check output + data = read_h5mu(output_path) + assert "X_scanorama" in data.mod["rna"].obsm + + # Delete output data in order to compare with input + del data["rna"].obsm["X_scanorama"] + assert_annotation_objects_equal(tmp_input_path, data) + + +def test_obsm_output(run_component, input_with_batch, random_h5mu_path): + tmp_input_path, _ = input_with_batch + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obsm_output", + "X_test", + "--obs_batch", + "batch", + "--obsm_input", + "X_pca", + ] + ) + assert output_path.is_file() + + # check output + data = read_h5mu(output_path) + assert "X_test" in data.mod["rna"].obsm + + # Delete output data in order to compare with input + del data["rna"].obsm["X_test"] + assert_annotation_objects_equal(tmp_input_path, data) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scarches/config.vsh.yaml b/src/integrate/scarches/config.vsh.yaml new file mode 100644 index 00000000..62c47d45 --- /dev/null +++ b/src/integrate/scarches/config.vsh.yaml @@ -0,0 +1,169 @@ +name: scarches +namespace: "integrate" +scope: "public" +description: "Performs reference mapping with scArches" +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + description: Arguments related to the input (query) dataset + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file to use as a query + direction: input + required: true + - name: "--layer" + type: string + description: Layer to be used for scArches, if .X is not to be used. + required: false + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_obs_batch" + type: string + description: Name of the .obs column with batch information. + required: false + - name: "--input_obs_label" + type: string + description: Name of the .obs column with celltype information. + required: false + - name: "--input_var_gene_names" + type: string + description: Name of the .var column with gene names, if the var .index is not to be used. + required: false + - name: "--input_obs_size_factor" + type: string + required: false + description: | + Key in adata.obs for size factor information. Instead of using library size as a size factor, + the provided size factor column will be used as offset in the mean of the likelihood. + Assumed to be on linear scale. + + - name: Reference + arguments: + - name: "--reference" + alternatives: ["-r"] + type: file + description: Path to the directory with reference model or a web link. + required: true + - name: "--reference_class" + type: string + required: false + description: + For legacy models; the type of model (where the type of model was not saved with it; e.g. when they were generated with scvi-tools versions < 0.15). + example: SCANVI + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--model_output" + type: file + default: "model" + direction: output + description: Output directory for model + - name: "--obsm_output" + type: string + default: "X_integrated_scanvi" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--obs_output_predictions" + type: string + default: "scanvi_pred" + required: false + description: "In which .obs slot to store the resulting label predictions. Only relevant if a scANVI model was provided." + - name: "--obs_output_probabilities" + type: string + default: "scanvi_proba" + required: false + description: "In which .obs slot to store the probabilities of the label predictions. Only relevant if a scANVI model was provided." + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: "Early stopping arguments" + arguments: + - name: "--early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, + i.e. an absolute change of less than min_delta, will count as no improvement." + + - name: "Learning parameters" + arguments: + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/set_var_index.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/scanvi_model + - path: /resources_test/annotation_test_data/scvi_model + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/HLCA_reference_model/HLCA_reference_model.zip + +engines: +- type: docker + image: nvcr.io/nvidia/pytorch:25.05-py3 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + packages: + - jax[cuda] + - scvi-tools~=1.3.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable + # docker_run_args: ["--gpus all"] +- type: nextflow + directives: + label: [highmem, highcpu, highdisk, gpu] diff --git a/src/integrate/scarches/script.py b/src/integrate/scarches/script.py new file mode 100644 index 00000000..edc62489 --- /dev/null +++ b/src/integrate/scarches/script.py @@ -0,0 +1,364 @@ +import sys +import mudata +from anndata import AnnData +import scvi +import pandas as pd +import numpy as np +import torch +from tempfile import TemporaryDirectory +from pathlib import Path +import pickle + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "layer": None, + "input_obs_batch": "sample_id", + "input_obs_label": None, + "input_var_gene_names": None, + "unknown_celltype_label": "Unknown", + "input_obs_size_factor": None, + "reference": "resources_test/annotation_test_data/scanvi_model", + "modality": "rna", + "output": "foo.h5mu", + "model_output": "test", + # Other + "obsm_output": "X_integrated_scanvi", + "obs_output_probabilities": "scanvi_proba", + "obs_output_predictions": "scanvi_pred", + "early_stopping": None, + "early_stopping_monitor": "elbo_validation", + "early_stopping_patience": 45, + "early_stopping_min_delta": 0, + "max_epochs": 10, + "output_compression": "gzip", + "reference_class": "SCANVI", +} +meta = {"resources_dir": "src/utils", "temp_dir": "/tmp"} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def _read_model_name_from_registry(model_path) -> str: + """Read registry with information about the model, return the model name""" + registry = scvi.model.base.BaseModelClass.load_registry(model_path) + return registry["model_name"] + + +def _detect_base_model(model_path): + """Read from the model's file which scvi_tools model it contains""" + return getattr(scvi.model, _read_model_name_from_registry(model_path)) + + +def _validate_obs_metadata_params(model_registry, model_name): + """ + Validates .obs metadata par variables that are required by scvi-tools models. + + This function checks that necessary .obs metadata field names (batch, labels, size factors) + specified in the model registry have the required corresponding parameters provided in the input. + + Parameters + ---------- + model_registry : dict + Dictionary containing model configuration and requirements. + model_name : str + Name of the model being validated. + """ + + # Maps the keys of the model registry to their corresponding par keys containing the .obs metadata + registry_par_mapper = { + "batch_key": "input_obs_batch", + "labels_key": "input_obs_label", + "size_factor_key": "input_obs_size_factor", + } + + for registry_key, key in registry_par_mapper.items(): + model_registry_key = model_registry.get(registry_key) + par_key = par[key] + + if model_registry_key and not par_key: + if key != "input_obs_label" or not model_registry.get("unlabeled_category"): + raise ValueError( + f"The provided {model_name} model requires `--{key}` to be provided." + ) + + elif par_key and not model_registry_key: + logger.warning( + f"`--{key}` was provided but is not used in the provided {model_name} model." + ) + + +def _align_query_with_registry(adata_query, model_path): + """ + Creates a qeury AnnData object with the expected structure and metadata fields that are aligned with the pre-trained reference model. + + Parameters + ---------- + adata_query : AnnData + The query AnnData object to be aligned with the model structure. + model_path : str + Path to the directory containing the pre-trained model. + + Returns + ------- + AnnData + A new AnnData object with structure and metadata aligned to match the + requirements of the pre-trained model. + """ + + model = _detect_base_model(model_path) + model_name = _read_model_name_from_registry(model_path) + model_registry = model.load_registry(model_path)["setup_args"] + _validate_obs_metadata_params(model_registry, model_name) + + # Sanitize gene names and set as index of the AnnData object + # all scArches VAE models expect gene names to be in the .var index + adata_query = set_var_index(adata_query, par["input_var_gene_names"]) + + # align layer + query_layer = ( + adata_query.X if not par["layer"] else adata_query.layers[par["layer"]] + ) + var_index = adata_query.var_names.tolist() + obs_index = adata_query.obs_names.tolist() + + # align observations + query_obs = {} + + ## batch_key, size_factor_key + simple_mappings = { + "batch_key": "input_obs_batch", # relevant for AUTOZI, LinearSCVI, PEAKVI, SCANVI, SCVI, TOTALVI, MULTIVI, JaxSCVI + "size_factor_key": "input_obs_size_factor", # relevant for SCANVI, SCVI, TOTALVI, MULTIVI + } + + for registry_key, par_key in simple_mappings.items(): + if model_registry.get(registry_key): + query_obs[model_registry[registry_key]] = adata_query.obs[ + par[par_key] + ].tolist() + + ## labels-key, relevant for AUTOZI, CondSCVI, LinearSCVI, PEAKVI, SCANVI, SCVI + if model_registry.get("labels_key"): + if par["input_obs_label"]: + query_obs[model_registry["labels_key"]] = adata_query.obs[ + par["input_obs_label"] + ].tolist() + else: + adata_query.obs[model_registry["labels_key"]] = model_registry[ + "unlabeled_category" + ] + query_obs[model_registry["labels_key"]] = adata_query.obs[ + model_registry["labels_key"] + ].tolist() + + obs = pd.DataFrame(query_obs, index=obs_index) + var = pd.DataFrame(index=var_index) + + aligned_query_anndata = AnnData(X=query_layer, obs=obs, var=var) + if model_registry["layer"]: + aligned_query_anndata.layers[model_registry["layer"]] = query_layer + + return aligned_query_anndata + + +def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1): + """ + A function to map the query data to the reference atlas. + + Input: + * adata_query: An AnnData object with the query + * model_path: The reference model directory + + Output: + * vae_query: the trained scvi_tools model + * adata_query: The AnnData object with the query preprocessed for the mapping to the reference + """ + model = _detect_base_model(model_path) + + # Keys of the AnnData query object need to match the exact keys in the reference model registry + + aligned_adata_query = _align_query_with_registry(adata_query, model_path) + + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + logger.warning( + "ValueError thrown when preparing adata for mapping. Clearing .varm field to prevent it" + ) + aligned_adata_query.varm.clear() + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + raise ValueError( + f"Could not perform model.prepare_model_anndata, likely because the model was trained with different var names then were found in the index. \n\nmodel var_names: {model.prepare_query_anndata(aligned_adata_query, model_path, return_reference_var_names=True).tolist()} \n\nquery data var_names: {aligned_adata_query.var_names.tolist()} " + ) + + try: + # Load query data into the model + vae_query = model.load_query_data( + aligned_adata_query, model_path, freeze_dropout=True + ) + except KeyError: + # Older models do not have the 'setup_method_name' key saved with the model. + # Assume these were generated from an anndata object. + model_torch = torch.load( + f"{model_path}/model.pt", weights_only=False, map_location="cpu" + ) + model_torch["attr_dict"]["registry_"]["setup_method_name"] = "setup_anndata" + + with TemporaryDirectory(dir=meta["temp_dir"]) as tempdir: + temp_file_name = Path(tempdir) / "model.pt" + torch.save(model_torch, temp_file_name) + del model_torch + vae_query = model.load_query_data( + aligned_adata_query, tempdir, freeze_dropout=True + ) + + # Train scArches model for query mapping + vae_query.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + check_val_every_n_epoch=check_val_every_n_epoch, + accelerator="auto", + ) + + return vae_query + + +def _convert_object_dtypes_to_strings(adata): + """Convert object dtypes in .var and .obs to string to prevent error when saving file""" + + def convert_cols(df): + object_cols = df.columns[df.dtypes == "object"] + for col in object_cols: + df[col] = df[col].astype(str) + return df + + adata.var = convert_cols(adata.var) + adata.obs = convert_cols(adata.obs) + + return adata + + +def _get_model_path(model_path: str): + """Obtain path to the directory with reference model. If the proposed `model_path` is a .zip archive, unzip it. If nesessary, convert model to the new format + + Parameters + ---------- + model_path : str + Path to a directory, where to search for the model or to a zip file containing the model + + Returns + ------- + Path to a directory with reference model in format of scvi-tools>=0.15 + """ + import os + import zipfile + import tempfile + from pathlib import Path + + if os.path.isdir(model_path) and "model.pt" in os.listdir(model_path): + # Probably, the `model_path` already contains model in the output format of scvi-tools>=0.15 + return model_path + + # The model either has old format or is a zip file downloaded from Zenodo + new_directory = Path(tempfile.TemporaryDirectory().name) + + if zipfile.is_zipfile(model_path): + with zipfile.ZipFile(model_path) as archive: + archive.extractall(new_directory) + model_dir = next(new_directory.glob("**/*.pt")).parent + + else: + model_dir = next(Path(model_path).glob("**/*.pt")).parent + + if "model_params.pt" in os.listdir(model_dir): + try: + # The current approach is to store the class with the model in a registry + model_class = _detect_base_model(model_dir) + except ValueError: + # Failed to load the registry, try + with (model_dir / "attr.pkl").open("rb") as open_file: + attrs = pickle.load(open_file) + try: + model_name = attrs["registry_"]["model_name"] + except KeyError: + # With older scvi-tools models, the model name was not stored with the model... + model_name = par["reference_class"] + if not model_name: + raise ValueError( + "Could not load reference model. This might be because it is a legacy model for which the model type was not saved with the model. Please provide it manually using the 'reference_class' argument." + ) + if not hasattr(scvi.model, model_name): + raise ValueError( + f"Requested to load legacy model using type {model_name}, but such a class does not exist in `scvi.models`." + ) + model_class = getattr(scvi.model, model_name) + + # The model is in the `directory`, but it was generated with scvi-tools<0.15 + # TODO: for new references (that could not be SCANVI based), we need to check the base class somehow. Reading registry does not work with models generated by scvi-tools<0.15 + # Here I assume that the reference model is for HLCA and thus is SCANVI based + converted_model_path = os.path.join(model_dir, "converted") + model_class.convert_legacy_save(model_dir, converted_model_path) + return converted_model_path + + elif "model.pt" in os.listdir(model_dir): + # Archive contained model in the new format, so just return the directory + return model_dir + + else: + raise ValueError( + "Cannot find model in the provided reference path. Please, provide a path or a link to the directory with reference model. For HLCA use https://zenodo.org/record/6337966/files/HLCA_reference_model.zip" + ) + + +def main(): + logger.info("Reading %s, modality %s", par["input"], par["modality"]) + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_query = adata.copy() + + model_path = _get_model_path(par["reference"]) + vae_query = map_to_existing_reference(adata_query, model_path=model_path) + model_name = _read_model_name_from_registry(model_path) + + # Save info about the used model + adata.uns["integration_method"] = model_name + + logger.info("Trying to write latent representation") + output_key = par["obsm_output"].format(model_name=model_name) + adata.obsm[output_key] = vae_query.get_latent_representation() + + # In addition to integrating the data, scANVI also performs label annotations + if model_name == "SCANVI": + adata.obs[par["obs_output_predictions"]] = vae_query.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_query.predict(soft=True), axis=1 + ) + + logger.info("Converting dtypes") + adata = _convert_object_dtypes_to_strings(adata) + + logger.info( + "Saving h5mu file to %s with compression %s", + par["output"], + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + logger.info("Saving model") + vae_query.save(par["model_output"], overwrite=True) + + +if __name__ == "__main__": + main() diff --git a/src/integrate/scarches/test.py b/src/integrate/scarches/test.py new file mode 100644 index 00000000..ca9bf1ad --- /dev/null +++ b/src/integrate/scarches/test.py @@ -0,0 +1,196 @@ +import sys +import pytest +import mudata +import subprocess +import re + +## VIASH START +meta = { + "executable": "./target/executable/integrate/scarches/scarches", + "resources_dir": "./resources_test/", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" +reference = f"{meta['resources_dir']}/HLCA_reference_model.zip" +scvi_model = f"{meta['resources_dir']}/scvi_model" +scanvi_model = f"{meta['resources_dir']}/scanvi_model" + + +@pytest.fixture +def input_with_batch(tmp_path): + tmp_input_path = tmp_path / "input.h5mu" + + input_data = mudata.read_h5mu(input_file) + mod = input_data.mod["rna"] + number_of_obs = mod.n_obs + mod.obs["batch"] = "A" + column_index = mod.obs.columns.get_indexer(["batch"]) + mod.obs.iloc[slice(number_of_obs // 2, None), column_index] = "B" + input_data.write(tmp_input_path) + + return tmp_input_path, input_data + + +def test_hlca_reference_model(run_component, input_with_batch, tmp_path): + tmp_input_path, _ = input_with_batch + output_path = tmp_path / "output.h5mu" + output_model_path = tmp_path / "model_output" + + # run component + run_component( + [ + "--input", + str(tmp_input_path), + "--input_obs_batch", + "batch", + "--unknown_celltype_label", + "unlabeled", + "--reference", + reference, + "--reference_class", + "SCANVI", + "--modality", + "rna", + "--output", + str(output_path), + "--model_output", + str(output_model_path), + "--max_epochs", + "1", + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + + # check output + output_data = mudata.read_h5mu(output_path) + assert "X_integrated_scanvi" in output_data.mod["rna"].obsm + assert output_data["rna"].uns["integration_method"] == "SCANVI" + assert (output_model_path / "model.pt").is_file() + + +def test_scvi_model(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + output_model_path = tmp_path / "model_output" + + # run component + run_component( + [ + "--input", + input_file, + "--input_obs_batch", + "sample_id", + "--reference", + scvi_model, + "--modality", + "rna", + "--output", + str(output_path), + "--model_output", + str(output_model_path), + "--max_epochs", + "1", + "--output_compression", + "gzip", + "--obsm_output", + "X_integrated_scvi", + ] + ) + assert output_path.is_file() + + # check output + output_data = mudata.read_h5mu(output_path) + assert "X_integrated_scvi" in output_data.mod["rna"].obsm + assert "scanvi_pred" not in output_data.mod["rna"].obs + assert "scanvi_proba" not in output_data.mod["rna"].obs + assert output_data["rna"].uns["integration_method"] == "SCVI" + assert (output_model_path / "model.pt").is_file() + + +def test_scanvi_model(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + output_model_path = tmp_path / "model_output" + + # run component + run_component( + [ + "--input", + input_file, + "--input_obs_batch", + "sample_id", + "--reference", + scanvi_model, + "--modality", + "rna", + "--output", + str(output_path), + "--model_output", + str(output_model_path), + "--max_epochs", + "1", + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file() + + # check output + output_data = mudata.read_h5mu(output_path) + assert "X_integrated_scanvi" in output_data.mod["rna"].obsm + assert output_data["rna"].uns["integration_method"] == "SCANVI" + assert "scanvi_pred" in output_data.mod["rna"].obs + assert "scanvi_proba" in output_data.mod["rna"].obs + + predictions = output_data.mod["rna"].obs["scanvi_pred"] + probabilities = output_data.mod["rna"].obs["scanvi_proba"] + + assert predictions.dtype == "category", ( + "Calculated predictions should be category dtype" + ) + assert not all(predictions.isna()), "Not all predictions should be NA" + assert probabilities.dtype == "float32", ( + "Calculated probabilities should be float32 dtype" + ) + assert all(0 <= value <= 1 for value in probabilities), ( + ".obs at celltypist_probability has values outside the range [0, 1]" + ) + + assert (output_model_path / "model.pt").is_file() + + +def test_raises_with_missing_params(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + output_model_path = tmp_path / "model_output" + + # run component + args = [ + "--input", + input_file, + "--reference", + scvi_model, + "--modality", + "rna", + "--output", + str(output_path), + "--model_output", + str(output_model_path), + "--max_epochs", + "1", + "--output_compression", + "gzip", + "--obsm_output", + "X_integrated_scvi", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: The provided SCVI model requires `--input_obs_batch` to be provided.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scvi/config.vsh.yaml b/src/integrate/scvi/config.vsh.yaml new file mode 100644 index 00000000..a4f56618 --- /dev/null +++ b/src/integrate/scvi/config.vsh.yaml @@ -0,0 +1,250 @@ +name: scvi +namespace: "integrate" +scope: "public" +description: "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA" +authors: + - __merge__: /src/authors/malte_luecken.yaml + roles: [ author ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/authors/matthias_beyens.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used" + - name: "--obs_batch" + type: string + default: "sample_id" + required: false + description: Column name discriminating between your batches. + - name: "--var_gene_names" + type: string + required: false + description: ".var column containing gene names. By default, use the index." + - name: "--var_input" + type: string + required: false + description: ".var column containing highly variable genes. By default, do not subset genes." + - name: "--obs_labels" + type: string + required: false + description: | + Key in adata.obs for label information. Categories will automatically be + converted into integer categories and saved to adata.obs['_scvi_labels']. + If None, assigns the same label to all the data. + - name: "--obs_size_factor" + type: string + required: false + description: | + Key in adata.obs for size factor information. Instead of using library size as a size factor, + the provided size factor column will be used as offset in the mean of the likelihood. + Assumed to be on linear scale. + - name: "--obs_categorical_covariate" + type: string + required: false + multiple: true + description: | + Keys in adata.obs that correspond to categorical data. These covariates can be added in + addition to the batch covariate and are also treated as nuisance factors + (i.e., the model tries to minimize their effects on the latent space). + Thus, these should not be used for biologically-relevant factors that you do _not_ want to correct for. + - name: "--obs_continuous_covariate" + type: string + required: false + multiple: true + description: | + Keys in adata.obs that correspond to continuous data. These covariates can be added in + addition to the batch covariate and are also treated as nuisance factors + (i.e., the model tries to minimize their effects on the latent space). Thus, these should not be + used for biologically-relevant factors that you do _not_ want to correct for. + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--output_model" + type: file + description: Folder where the state of the trained model will be saved to. + required: false + direction: output + - name: "--obsm_output" + type: string + default: "X_scvi_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: "SCVI options" + arguments: + - name: --n_hidden_nodes + type: integer + required: false + default: 128 # this is not the default in SCVI + description: "Number of nodes per hidden layer." + - name: --n_dimensions_latent_space + type: integer + required: false + default: 30 # this is not the default in SCVI + description: "Dimensionality of the latent space." + - name: "--n_hidden_layers" + required: false + type: integer + default: 2 # this is not the default in SCVI + description: Number of hidden layers used for encoder and decoder neural-networks. + - name: --dropout_rate + type: double + required: false + default: 0.1 + description: "Dropout rate for the neural networks." + - name: "--dispersion" + type: string + required: false + choices: ["gene", "gene-batch", "gene-label", "gene-cell"] + default: "gene" + description: | + Set the behavior for the dispersion for negative binomial distributions: + - gene: dispersion parameter of negative binomial is constant per gene across cells + - gene-batch: dispersion can differ between different batches + - gene-label: dispersion can differ between different labels + - gene-cell: dispersion can differ for every gene in every cell + - name: "--gene_likelihood" + type: string + default: "nb" # This is not the default in SCVI + choices: ["nb", "zinb", "poisson"] + description: | + Model used to generate the expression data from a count-based likelihood distribution. + - nb: Negative binomial distribution + - zinb: Zero-inflated negative binomial distribution + - poisson: Poisson distribution + - name: "Variational auto-encoder model options" + arguments: + - name: "--use_layer_normalization" + description: | + Neural networks for which to enable layer normalization. + type: string + choices: ["encoder", "decoder", "none", "both"] + default: "both" # This is not the default in SCVI + - name: "--use_batch_normalization" + description: | + Neural networks for which to enable batch normalization. + type: string + choices: ["encoder", "decoder", "none", "both"] + default: "none" # This is not the default in SCVI + - name: "--encode_covariates" + type: boolean_false # This is not the default in SCVI + description: "Whether to concatenate covariates to expression in encoder" + - name: "--deeply_inject_covariates" + type: boolean_true + description: | + Whether to concatenate covariates into output of hidden layers in encoder/decoder. + This option only applies when n_layers > 1. The covariates are concatenated to + the input of subsequent hidden layers. + - name: "--use_observed_lib_size" + type: boolean_true # This is not the default in SCVI + description: | + Use observed library size for RNA as scaling factor in mean of conditional distribution. + - name: "Early stopping arguments" + arguments: + - name: "--early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, + i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "Learning parameters" + arguments: + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + - name: "Data validition" + arguments: + - name: "--n_obs_min_count" + type: integer + description: "Minimum number of cells threshold ensuring that every obs_batch category has sufficient observations (cells) for model training." + required: false + default: 0 + - name: "--n_var_min_count" + type: integer + description: "Minimum number of genes threshold ensuring that every var_input filter has sufficient observations (genes) for model training." + required: false + default: 0 +resources: + - type: python_script + path: script.py + - path: /src/utils/subset_vars.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/set_var_index.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +engines: +- type: docker + image: nvcr.io/nvidia/pytorch:25.05-py3 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + - type: python + packages: + - jax[cuda] + - scvi-tools~=1.3.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable + # docker_run_args: ["--gpus all"] +- type: nextflow + directives: + label: [midcpu, midmem, gpu, highdisk] diff --git a/src/integrate/scvi/script.py b/src/integrate/scvi/script.py new file mode 100644 index 00000000..fe427823 --- /dev/null +++ b/src/integrate/scvi/script.py @@ -0,0 +1,144 @@ +from scanpy._utils import check_nonnegative_integers +import mudata +import scvi + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_layer": None, + "obs_batch": "sample_id", + "var_input": None, + "output": "foo.h5mu", + "obsm_output": "X_scvi_integrated", + "early_stopping": True, + "early_stopping_monitor": "elbo_validation", + "early_stopping_patience": 45, + "early_stopping_min_delta": 0, + "reduce_lr_on_plateau": True, + "lr_factor": 0.6, + "lr_patience": 30, + "max_epochs": 500, + "n_obs_min_count": 10, + "n_var_min_count": 10, + "output_model": "test/", + "output_compression": "gzip", +} + +meta = {"resources_dir": "src/utils"} +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +# TODO: optionally, move to qa +# https://github.com/openpipelines-bio/openpipeline/issues/435 +def check_validity_anndata(adata, layer, obs_batch, n_obs_min_count, n_var_min_count): + assert check_nonnegative_integers(adata.layers[layer] if layer else adata.X), ( + "Make sure input adata contains raw_counts" + ) + + assert len(set(adata.var_names)) == len(adata.var_names), ( + "Dataset contains multiple genes with same gene name." + ) + + # Ensure every obs_batch category has sufficient observations + assert min(adata.obs[[obs_batch]].value_counts()) > n_obs_min_count, ( + f"Anndata has fewer than {n_obs_min_count} cells." + ) + + assert adata.n_vars > n_var_min_count, ( + f"Anndata has fewer than {n_var_min_count} genes." + ) + + +def main(): + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + check_validity_anndata( + adata_subset, + par["input_layer"], + par["obs_batch"], + par["n_obs_min_count"], + par["n_var_min_count"], + ) + + # Set up the data + scvi.model.SCVI.setup_anndata( + adata_subset, + batch_key=par["obs_batch"], + layer=par["input_layer"], + labels_key=par["obs_labels"], + size_factor_key=par["obs_size_factor"], + categorical_covariate_keys=par["obs_categorical_covariate"], + continuous_covariate_keys=par["obs_continuous_covariate"], + ) + + # Set up the model + vae_uns = scvi.model.SCVI( + adata_subset, + n_hidden=par["n_hidden_nodes"], + n_latent=par["n_dimensions_latent_space"], + n_layers=par["n_hidden_layers"], + dropout_rate=par["dropout_rate"], + dispersion=par["dispersion"], + gene_likelihood=par["gene_likelihood"], + use_layer_norm=par["use_layer_normalization"], + use_batch_norm=par["use_batch_normalization"], + encode_covariates=par[ + "encode_covariates" + ], # Default (True) is for better scArches performance -> maybe don't use this always? + deeply_inject_covariates=par[ + "deeply_inject_covariates" + ], # Default (False) for better scArches performance -> maybe don't use this always? + use_observed_lib_size=par[ + "use_observed_lib_size" + ], # When size_factors are not passed + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() diff --git a/src/integrate/scvi/test.py b/src/integrate/scvi/test.py new file mode 100644 index 00000000..ed2ef749 --- /dev/null +++ b/src/integrate/scvi/test.py @@ -0,0 +1,183 @@ +import pytest +from pathlib import Path + +import mudata +from anndata.tests.helpers import assert_equal + +## VIASH START +meta = { + "executable": "./target/executable/integrate/scvi/scvi", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" +input_file_2 = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" + + +@pytest.fixture +def mudata_with_mod_rna_obs_batch(tmp_path, request): + obs_batch, var_input, obsm_output = request.param + + new_input_file = tmp_path / "input.h5mu" + + input_data = mudata.read_h5mu(input_file) + input_rna = input_data.mod["rna"] + input_rna.obs[obs_batch] = "A" + column_index = input_rna.obs.columns.get_indexer([obs_batch]) + input_rna.obs.iloc[slice(input_rna.n_obs // 2, None), column_index] = "B" + input_data.write(new_input_file.name) + + return new_input_file.name, input_rna, obs_batch, var_input, obsm_output + + +@pytest.mark.parametrize( + "mudata_with_mod_rna_obs_batch", + [("batch", None, None), ("batch2", "filter_with_hvg", "X_int")], + indirect=True, +) +def test_scvi(run_component, mudata_with_mod_rna_obs_batch): + new_input_file, input_rna, obs_batch, var_input, obsm_output = ( + mudata_with_mod_rna_obs_batch + ) + + args = [ + "--input", + new_input_file, + "--modality", + "rna", + "--obs_batch", + obs_batch, + "--output", + "output.h5mu", + "--output_model", + "test/", + "--max_epochs", + "1", + "--n_obs_min_count", + "10", + "--n_var_min_count", + "10", + "--output_compression", + "gzip", + ] + + if var_input is not None: + args.extend(["--var_input", var_input]) + if obsm_output is not None: + args.extend(["--obsm_output", obsm_output]) + + run_component(args) + + # check files + assert Path("output.h5mu").is_file(), "Output file does not exist" + assert Path("test").is_dir() + assert Path("test/model.pt").is_file() + + # check output h5mu + output_data = mudata.read_h5mu("output.h5mu") + output_rna = output_data.mod["rna"] + assert output_rna.n_obs == input_rna.n_obs, ( + f"Number of observations changed\noutput_data: {output_data}" + ) + assert output_rna.n_vars == input_rna.n_vars, ( + f"Number of variables changed\noutput_data: {output_data}" + ) + + expected_obsm_output = "X_scvi_integrated" if obsm_output is None else obsm_output + assert expected_obsm_output in output_rna.obsm, ( + f".obsm['{expected_obsm_output}'] not added\noutput_data: {output_data}" + ) + + # assert that nothing else has changed + del output_rna.obsm[expected_obsm_output] + assert_equal(input_rna, output_rna) + + +def test_input_parameters(run_component): + args = [ + "--input", + input_file_2, + "--modality", + "rna", + "--obs_batch", + "donor_id", + "--var_gene_names", + "ensemblid", + "--obs_labels", + "cell_ontology_class", + "--obsm_output", + "X_scvi_integrated_test", + "--obs_categorical_covariate", + "donor_assay;donor_tissue", + "--output", + "output.h5mu", + "--output_model", + "test_model/", + "--max_epochs", + "1", + "--n_obs_min_count", + "10", + "--n_var_min_count", + "10", + "--output_compression", + "gzip", + ] + + run_component(args) + + # check files + assert Path("output.h5mu").is_file(), "Output file does not exist" + assert Path("test_model").is_dir() + assert Path("test_model/model.pt").is_file() + + # Read input h5mu + input_rna = mudata.read_h5mu(input_file_2).mod["rna"] + + # check output h5mu + output_data = mudata.read_h5mu("output.h5mu") + output_rna = output_data.mod["rna"] + assert output_rna.n_obs == input_rna.n_obs, ( + f"Number of observations changed\noutput_data: {output_data}" + ) + assert output_rna.n_vars == input_rna.n_vars, ( + f"Number of variables changed\noutput_data: {output_data}" + ) + + expected_obsm_output = "X_scvi_integrated_test" + assert expected_obsm_output in output_rna.obsm, ( + f".obsm['{expected_obsm_output}'] not added\noutput_data: {output_data}" + ) + + # assert that nothing else has changed + del output_rna.obsm[expected_obsm_output] + assert_equal(input_rna, output_rna) + + +def test_hvg_subsetting_helper(): + input_data = mudata.read_h5mu(input_file) + adata = input_data.mod["rna"] + + old_n_genes = adata.n_vars + + adata.var["highly_variable_features"] = False + adata.var.iloc[ + : old_n_genes // 2, adata.var.columns.get_indexer(["highly_variable_features"]) + ] = True + + adata = subset_vars(adata, subset_col="highly_variable_features") + + # Correct number of genes is subsetted + assert adata.n_vars == old_n_genes // 2 + # Only HVG are subsetted + assert adata.var["highly_variable_features"].all() + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/totalvi/config.vsh.yaml b/src/integrate/totalvi/config.vsh.yaml new file mode 100644 index 00000000..1f653783 --- /dev/null +++ b/src/integrate/totalvi/config.vsh.yaml @@ -0,0 +1,134 @@ +name: totalvi +namespace: "integrate" +scope: "public" +description: "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI" +authors: + - __merge__: /src/authors/vladimir_shitov.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file with query data to integrate with reference. + direction: input + required: true + - name: "--reference" + alternatives: ["-r"] + type: file + description: Input h5mu file with reference data to train the TOTALVI model. + direction: input + required: true + - name: "--force_retrain" + alternatives: ["-f"] + type: boolean_true + description: If true, retrain the model and save it to reference_model_path + - name: "--query_modality" + type: string + default: "rna" + required: false + - name: "--query_proteins_modality" + type: string + description: Name of the modality in the input (query) h5mu file containing protein data + required: false + - name: "--reference_modality" + type: string + default: "rna" + required: false + - name: "--reference_proteins_modality" + type: string + description: Name of the modality containing proteins in the reference + default: "prot" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used" + - name: "--obs_batch" + type: string + default: "sample_id" + required: false + description: Column name discriminating between your batches. + - name: "--var_input" + type: string + required: false + description: ".var column containing highly variable genes. By default, do not subset genes." + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--obsm_output" + type: string + default: "X_integrated_totalvi" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--obsm_normalized_rna_output" + type: string + default: "X_totalvi_normalized_rna" + required: false + description: "In which .obsm slot to store the normalized RNA from TOTALVI." + - name: "--obsm_normalized_protein_output" + type: string + default: "X_totalvi_normalized_protein" + required: false + description: "In which .obsm slot to store the normalized protein data from TOTALVI." + - name: "--reference_model_path" + type: file + description: Directory with the reference model. If not exists, trained model will be saved there + required: false + default: "totalvi_model_reference/" + direction: output + - name: "--query_model_path" + type: file + description: Directory, where the query model will be saved + required: false + default: "totalvi_model_query/" + direction: output + - name: "Learning parameters" + arguments: + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset" + required: false + default: 400 + - name: "--max_query_epochs" + type: integer + description: "Number of passes through the dataset, when fine-tuning model for query" + required: false + default: 200 + - name: "--weight_decay" + type: double + description: "Weight decay, when fine-tuning model for query" + required: false + default: 0.0 +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: nvcr.io/nvidia/pytorch:25.05-py3 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + packages: + - jax[cuda] + - scvi-tools~=1.3.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [highmem, highcpu, highdisk] diff --git a/src/integrate/totalvi/script.py b/src/integrate/totalvi/script.py new file mode 100644 index 00000000..94c3d3dc --- /dev/null +++ b/src/integrate/totalvi/script.py @@ -0,0 +1,211 @@ +from typing import Tuple + +import os +import sys +import mudata +from anndata import AnnData # For type hints +from mudata import MuData # For type hints +import numpy as np +import scvi +from scipy.sparse import issparse + + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "reference": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "query_modality": "rna", + "query_proteins_modality": "prot", + "reference_modality": "rna", + "reference_proteins_modality": "prot", + "force_retrain": False, + "input_layer": None, + "obs_batch": "sample_id", + "var_input": None, + "output": "foo.h5mu", + "obsm_output": "X_integrated_totalvi", + "obsm_normalized_rna_output": "X_totalvi_normalized_rna", + "obsm_normalized_protein_output": "X_totalvi_normalized_protein", + "reference_model_path": "totalvi_model_reference/", + "query_model_path": "totalvi_model_query/", + "max_epochs": 1, + "max_query_epochs": 1, + "weight_decay": 0.0, +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def align_proteins_names( + adata_reference: AnnData, + mdata_query: MuData, + adata_query: AnnData, + reference_proteins_key: str, + query_proteins_key: str, +) -> AnnData: + """Make sure that proteins are located in the same .obsm slot in reference and query. Pad query proteins with zeros if they are absent""" + proteins_reference = adata_reference.obsm[reference_proteins_key] + + # If query has no protein data, put matrix of zeros + if not query_proteins_key or query_proteins_key not in mdata_query.mod: + adata_query.obsm[reference_proteins_key] = np.zeros( + (adata_query.n_obs, proteins_reference.shape[1]) + ) + else: + # Make sure that proteins expression has the same key in query and reference + adata_query.obsm[reference_proteins_key] = adata_query.obsm[query_proteins_key] + + return adata_query + + +def extract_proteins_to_anndata( + mdata: MuData, rna_modality_key, protein_modality_key, input_layer, hvg_var_key=None +) -> AnnData: + """TOTALVI requires data to be stored in AnnData format with protein counts in .obsm slot. This function performs the conversion""" + adata: AnnData = mdata.mod[rna_modality_key].copy() + + if hvg_var_key: + selected_genes = adata.var_names[adata.var[hvg_var_key]] + adata = adata[:, selected_genes].copy() + + if protein_modality_key in mdata.mod: + # Put the proteins modality into .obsm slot + proteins_reference_adata = mdata.mod[protein_modality_key].copy() + + if input_layer is None: + proteins = proteins_reference_adata.X + else: + proteins = proteins_reference_adata.obsm[input_layer] + + if issparse(proteins): + proteins = proteins.toarray() + + adata.obsm[protein_modality_key] = proteins + + return adata + + +def build_reference_model( + adata_reference: AnnData, max_train_epochs: int = 400 +) -> scvi.model.TOTALVI: + vae_reference = scvi.model.TOTALVI( + adata_reference, use_layer_norm="both", use_batch_norm="none" + ) + vae_reference.train(max_train_epochs) + + vae_reference.save(par["reference_model_path"]) + + return vae_reference + + +def is_retraining_model() -> bool: + """Decide, whether reference model should be trained. It happens when no model exists or force_retrain flag is on""" + + trained_model_exists = os.path.isdir(par["reference_model_path"]) and ( + "model.pt" in os.listdir(par["reference_model_path"]) + ) + return not trained_model_exists or par["force_retrain"] + + +def map_query_to_reference( + mdata_reference: MuData, mdata_query: MuData, adata_query: AnnData +) -> Tuple[scvi.model.TOTALVI, AnnData]: + """Build model on the provided reference if necessary, and map query to the reference""" + + adata_reference: AnnData = extract_proteins_to_anndata( + mdata_reference, + rna_modality_key=par["reference_modality"], + protein_modality_key=par["reference_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + scvi.model.TOTALVI.setup_anndata( + adata_reference, + batch_key=par["obs_batch"], + protein_expression_obsm_key=par["reference_proteins_modality"], + ) + + if is_retraining_model(): + vae_reference = build_reference_model( + adata_reference, max_train_epochs=par["max_epochs"] + ) + else: + vae_reference = scvi.model.TOTALVI.load( + dir_path=par["reference_model_path"], adata=adata_reference + ) + + adata_query: AnnData = align_proteins_names( + adata_reference, + mdata_query, + adata_query, + reference_proteins_key=par["reference_proteins_modality"], + query_proteins_key=par["query_proteins_modality"], + ) + + # Reorder genes and pad missing genes with 0s + scvi.model.TOTALVI.prepare_query_anndata(adata_query, vae_reference) + + # Train the model for query + vae_query = scvi.model.TOTALVI.load_query_data(adata_query, vae_reference) + vae_query.train( + par["max_query_epochs"], plan_kwargs=dict(weight_decay=par["weight_decay"]) + ) + + return vae_query, adata_query + + +def main(): + mdata_query = mudata.read(par["input"].strip()) + adata_query = extract_proteins_to_anndata( + mdata_query, + rna_modality_key=par["query_modality"], + protein_modality_key=par["query_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + if par["reference"].endswith(".h5mu"): + logger.info("Reading reference") + mdata_reference = mudata.read(par["reference"].strip()) + + logger.info("Mapping query to the reference") + vae_query, adata_query = map_query_to_reference( + mdata_reference, mdata_query, adata_query + ) + else: + raise ValueError("Incorrect format of reference, please provide a .h5mu file") + + adata_query.uns["integration_method"] = "totalvi" + + logger.info("Getting the latent representation of query") + mdata_query.mod[par["query_modality"]].obsm[par["obsm_output"]] = ( + vae_query.get_latent_representation() + ) + + norm_rna, norm_protein = vae_query.get_normalized_expression() + mdata_query.mod[par["query_modality"]].obsm[par["obsm_normalized_rna_output"]] = ( + norm_rna.to_numpy() + ) + + if par["query_proteins_modality"] in mdata_query.mod: + mdata_query.mod[par["query_proteins_modality"]].obsm[ + par["obsm_normalized_protein_output"] + ] = norm_protein.to_numpy() + + logger.info("Updating mdata") + mdata_query.update() + + logger.info("Saving updated query data") + mdata_query.write_h5mu(par["output"].strip()) + + logger.info("Saving query model") + vae_query.save(par["query_model_path"], overwrite=True) + + +if __name__ == "__main__": + main() diff --git a/src/integrate/totalvi/test.py b/src/integrate/totalvi/test.py new file mode 100644 index 00000000..0c638027 --- /dev/null +++ b/src/integrate/totalvi/test.py @@ -0,0 +1,54 @@ +import sys +import pytest +import mudata + +## VIASH START +meta = { + "executable": "./target/docker/integrate/totalvi/totalvi", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + + +def test_totalvi(run_component, tmp_path): + """Map data containing proteins on itself""" + output_path = tmp_path / "output.h5mu" + ref_model_path = tmp_path / "totalvi_reference_model" + query_model_path = tmp_path / "totalvi_query_model" + + run_component( + [ + "--input", + input_file, + "--reference", + input_file, + "--query_proteins_modality", + "prot", + "--reference_proteins_modality", + "prot", + "--var_input", + "filter_with_hvg", + "--reference_model_path", + str(ref_model_path), + "--query_model_path", + str(query_model_path), + "--max_epochs", + "1", + "--max_query_epochs", + "1", + "--output", + str(output_path), + ] + ) + + assert output_path.is_file() + output_data = mudata.read_h5mu(output_path) + assert "X_integrated_totalvi" in output_data.mod["rna"].obsm + assert "X_totalvi_normalized_rna" in output_data.mod["rna"].obsm + assert "X_totalvi_normalized_protein" in output_data.mod["prot"].obsm + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/interpret/lianapy/config.vsh.yaml b/src/interpret/lianapy/config.vsh.yaml new file mode 100644 index 00000000..487e8a7e --- /dev/null +++ b/src/interpret/lianapy/config.vsh.yaml @@ -0,0 +1,115 @@ +name: lianapy +namespace: "interpret" +scope: "public" +description: "Performs LIANA integration based as described in https://github.com/saezlab/liana-py" +authors: + - __merge__: /src/authors/mauro_saporita.yaml + roles: [ author ] + - __merge__: /src/authors/povilas_gibas.yaml + roles: [ author ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--layer" + type: string + required: false + description: "Layer in anndata.AnnData.layers to use. If None, use mudata.mod[modality].X." + - name: "--de_method" + type: string + description: Differential expression method. `scanpy.tl.rank_genes_groups` is used to rank genes according to 1vsRest. + choices: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var'] + default: "t-test" + required: false + - name: "--groupby" + type: string + required: true + description: "The key of the observations grouping to consider." + - name: "--resource_name" + type: string + choices: [ "baccin2019", "cellcall", "cellchatdb", "cellinker", "cellphonedb", "celltalkdb", "connectomedb2020", "consensus", "embrace", "guide2pharma", "hpmr", "icellnet", "italk", "kirouac2010", "lrdb", "mouseconsensus", "ramilowski2015" ] + default: "consensus" + required: false + description: "Name of the resource to be loaded and use for ligand-receptor inference." + - name: "--gene_symbol" + type: string + default: "gene_symbol" + required: false + description: "Column name in var DataFrame in which gene symbol are stored." + - name: "--expr_prop" + type: double + default: 0.1 + required: false + description: "Minimum expression proportion for the ligands/receptors (and their subunits) in the corresponding cell identities. Set to '0', to return unfiltered results." + - name: "--min_cells" + type: integer + default: 5 + required: false + description: "Minimum cells per cell identity ('groupby') to be considered for downstream analysis." + - name: "--aggregate_method" + type: string + choices: [ "mean", "rra" ] + default: "rra" + required: false + description: "Method aggregation approach, one of ['mean', 'rra'], where 'mean' represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde et al., 2014) of the interactions." + - name: "--consensus_opts" + type: string + multiple: true + required: false + description: Strategies to aggregate interactions across methods. Default is None - i.e. ['Specificity', 'Magnitude'] and both specificity and magnitude are aggregated. + - name: "--return_all_lrs" + type: boolean + default: False + required: false + description: "Bool whether to return all LRs, or only those that surpass the 'expr_prop' threshold. Those interactions that do not pass the 'expr_prop' threshold will be assigned to the *worst* score of the ones that do. 'False' by default." + - name: "--n_perms" + type: integer + default: 100 + required: false + description: "Number of permutations for the permutation test. Note that this is relevant only for permutation-based methods - e.g. 'CellPhoneDB" +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + packages: + - liana~=1.5.0 + # Pinning scipy because https://github.com/statsmodels/statsmodels/issues/9584 + - scipy<1.16 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu] diff --git a/src/interpret/lianapy/script.py b/src/interpret/lianapy/script.py new file mode 100644 index 00000000..95036f0a --- /dev/null +++ b/src/interpret/lianapy/script.py @@ -0,0 +1,85 @@ +import sys +import liana +import mudata + +# TODO: Remove when grouping labels exist +# For sign/PCA/ +import pandas as pd + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "foo.h5mu", + "output_compression": "gzip", + "modality": "rna", + "layer": None, + "gene_symbol": "gene_symbol", + "groupby": "harmony_integration_leiden_1.0", + "resource_name": "consensus", + "expr_prop": 0.1, + "min_cells": 5, + "aggregate_method": "rra", + "return_all_lrs": False, + "n_perms": 100, +} +meta = {"cpus": 4} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + # Get input data + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + # Add dummy grouping labels when they do not exist + if par["groupby"] not in mod.obs: + raise ValueError( + f"Column {par['groupy']} does not exist in " + f".obs for modality {par['modality']}." + ) + mod_col = mod.obs[par["groupby"]] + original_groupby_col = mod_col.copy() + if not isinstance(mod_col, pd.CategoricalDtype): + mod.obs[par["groupby"]] = mod_col.astype(str).astype("category") + + # Solve gene labels + orig_gene_label = mod.var.index + mod.var_names = mod.var[par["gene_symbol"]].astype(str) + mod.var_names_make_unique() + + if not meta.get("cpus"): + meta["cpus"] = 1 + n_jobs = meta["cpus"] - 1 if meta["cpus"] > 2 else 1 + liana.mt.rank_aggregate( + adata=mod, + groupby=par["groupby"], + resource_name=par["resource_name"], + expr_prop=par["expr_prop"], + min_cells=par["min_cells"], + aggregate_method=par["aggregate_method"], + consensus_opts=par["consensus_opts"], + return_all_lrs=par["return_all_lrs"], + layer=par["layer"], + de_method=par["de_method"], + n_perms=par["n_perms"], + n_jobs=n_jobs, + verbose=True, + inplace=True, + use_raw=False, + ) + + # Return original gene labels + mod.var_names = orig_gene_label + + # Undo modifications to groupby column + mod.obs[par["groupby"]] = original_groupby_col + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + + +if __name__ == "__main__": + main() diff --git a/src/interpret/lianapy/test.py b/src/interpret/lianapy/test.py new file mode 100644 index 00000000..0bfced79 --- /dev/null +++ b/src/interpret/lianapy/test.py @@ -0,0 +1,77 @@ +import sys +import pytest +import mudata +import numpy as np + +## VIASH START +meta = { + "executable": "./target/executable/interpret/lianapy/", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + + +def test_lianapy(run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + input_file, + "--output_compression", + "gzip", + "--modality", + "rna", + "--layer", + "log_normalized", + "--groupby", + "harmony_integration_leiden_1.0", + "--resource_name", + "consensus", + "--consensus_opts", + "Magnitude", + "--de_method", + "wilcoxon", + "--gene_symbol", + "gene_symbol", + "--expr_prop", + "0.1", + "--min_cells", + "5", + "--aggregate_method", + "rra", + "--return_all_lrs", + "False", + "--n_perms", + "11", + "--output", + str(output_path), + ] + ) + assert output_path.is_file() + + # check output + input_data = mudata.read_h5mu(input_file) + output_data = mudata.read_h5mu(output_path) + np.testing.assert_array_equal( + output_data.mod["rna"].X.data, input_data.mod["rna"].X.data + ) + np.testing.assert_array_equal( + input_data.mod["rna"].var.index, output_data.mod["rna"].var.index + ) + assert "liana_res" in output_data.mod["rna"].uns + assert all( + elem in output_data.mod["rna"].obs["harmony_integration_leiden_1.0"].values + for elem in output_data.mod["rna"].uns["liana_res"]["source"].unique() + ) + assert all( + elem in output_data.mod["rna"].obs["harmony_integration_leiden_1.0"].values + for elem in output_data.mod["rna"].uns["liana_res"]["target"].unique() + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/labels_transfer/api/common_arguments.yaml b/src/labels_transfer/api/common_arguments.yaml new file mode 100644 index 00000000..57b728eb --- /dev/null +++ b/src/labels_transfer/api/common_arguments.yaml @@ -0,0 +1,146 @@ + argument_groups: + - name: Input dataset (query) arguments + arguments: + - name: "--input" + type: file + direction: input + required: true + description: "The query data to transfer the labels to. Should be a .h5mu file." + info: + label: "Query" + file_format: + type: h5mu + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: double + name: features + required: false + description: | + The expression data to use for the classifier's inference, if `--input_obsm_features` argument is not provided. + obsm: + - type: "double" + name: "features" + example: X_scvi + required: false + description: | + The embedding to use for the classifier's inference. Override using the `--input_obsm_features` argument. If not provided, the `.X` slot will be used instead. + Make sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing). + - name: "--modality" + type: string + default: "rna" + required: false + description: Which modality to use. + - name: "--input_obsm_features" + type: string + required: false + description: | + The `.obsm` key of the embedding to use for the classifier's inference. If not provided, the `.X` slot will be used instead. + Make sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing). + example: X_scvi + + - name: Reference dataset arguments + arguments: + - name: "--reference" + type: "file" + required: false + description: "The reference data to train classifiers on." + example: reference.h5mu + info: + label: "Reference" + file_format: + type: h5mu + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: double + name: features + required: false + description: | + The expression data to use for the classifier's training, if `--input_obsm_features` argument is not provided. + obsm: + - type: "double" + name: "features" + example: X_scvi + description: | + The embedding to use for the classifier's training. Override using the `--reference_obsm_features` argument. + Make sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing). + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ] + description: "The target labels to transfer. Override using the `--reference_obs_targets` argument." + required: true + - name: "--reference_obsm_features" + type: string + required: false + description: | + The `.obsm` key of the embedding to use for the classifier's training. If not provided, the `.X` slot will be used instead. + Make sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing). + example: X_scvi + - name: "--reference_obs_targets" + type: string + default: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ] + required: false + multiple: true + description: The `.obs` key(s) of the target labels to tranfer. + + - name: Outputs + arguments: + - name: "--output" + type: "file" + direction: output + required: true + description: "The query data in .h5mu format with predicted labels transfered from the reference." + info: + label: "Output data" + file_format: + type: h5mu + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + obs: + - type: "string" + name: "predictions" + description: "The predicted labels. Override using the `--output_obs_predictions` argument." + required: true + - type: "double" + name: "probability" + description: "The probability of the predicted labels. Override using the `--output_obs_probability` argument." + required: false + obsm: + - type: "double" + name: "X_scvi" + description: + The embedding used for the classifier's inference. Could have any name, + specified by `input_obsm_features` argument." + required: false + - name: "--output_obs_predictions" + type: string + example: + required: false + multiple: true + description: | + In which `.obs` slots to store the predicted information. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix. + - name: "--output_obs_probability" + type: string + example: + required: false + multiple: true + description: | + In which `.obs` slots to store the probability of the predictions. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix. + __merge__: [., /src/base/h5_compression_argument.yaml] + diff --git a/src/labels_transfer/knn/config.vsh.yaml b/src/labels_transfer/knn/config.vsh.yaml new file mode 100644 index 00000000..c3c82bd0 --- /dev/null +++ b/src/labels_transfer/knn/config.vsh.yaml @@ -0,0 +1,89 @@ +name: knn +namespace: "labels_transfer" +scope: "public" +description: | + This component performs label transfer from reference to query using a K-Neirest Neighbors classifier. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] + +__merge__: ../api/common_arguments.yaml + +argument_groups: + - name: Input dataset (query) arguments + arguments: + - name: "--input_obsm_distances" + type: string + direction: input + required: false + example: bbknn_distances + description: | + The `.obsm` key of the input (query) dataset containing pre-calculated distances. + If not provided, the distances will be calculated using PyNNDescent. + Make sure the distance matrix contains distances relative to the reference dataset and were obtained in the same way as the reference embedding. + + - name: Reference dataset arguments + arguments: + - name: "--reference_obsm_distances" + type: string + required: false + description: | + The `.obsm` key of the reference dataset containing pre-calculated distances. + If not provided, the distances will be calculated using PyNNDescent. + example: bbknn_distances + + - name: KNN label transfer arguments + arguments: + - name: "--weights" + type: string + choices: ["uniform", "distance", "gaussian"] + default: "uniform" + description: | + Weight function used in prediction. Possible values are: + - `uniform` - all points in each neighborhood are weighted equally + - `distance` - weight points by the inverse of their distance + - `gaussian` - weight points by the sum of their Gaussian kernel similarities to each sample + - name: "--n_neighbors" + type: integer + min: 5 + default: 15 + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. + +resources: + - type: python_script + path: script.py + - path: /src/labels_transfer/utils/helper.py + - path: /src/utils/compress_h5mu.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ + +engines: + - type: docker + image: python:3.12 + setup: + - type: apt + packages: + - procps + - pkg-config + - libhdf5-dev + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + - type: python + packages: + - pynndescent~=0.5.10 + - numpy<2 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, highcpu] diff --git a/src/labels_transfer/knn/script.py b/src/labels_transfer/knn/script.py new file mode 100644 index 00000000..2c1b85e6 --- /dev/null +++ b/src/labels_transfer/knn/script.py @@ -0,0 +1,169 @@ +import mudata as mu +import numpy as np +import sys +from pynndescent import PyNNDescentTransformer +from sklearn.neighbors import KNeighborsClassifier + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_obsm_features": None, + "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "reference_obsm_features": None, + "reference_obs_targets": ["cell_type"], + "output": "foo_distance.h5mu", + "output_obs_predictions": None, + "output_obs_probability": None, + "output_uns_parameters": "labels_transfer", + "output_compression": None, + "weights": "distance", + "n_neighbors": 15, +} +meta = {"resources_dir": "src/labels_transfer/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger + + +# END TEMPORARY WORKAROUND setup_logger + + +def distances_to_affinities(distances): + # Apply Gaussian kernel to distances + stds = np.std(distances, axis=1) + stds = (2.0 / stds) ** 2 + stds = stds.reshape(-1, 1) + distances_tilda = np.exp(-np.true_divide(distances, stds)) + + # normalize the distances_tilda + # if the sum of a row of the distances tilda equals 0, + # set normalized distances for that row to 1 + # else divide the row values by the value of the sum of the row + distances_tilda_normalized = np.where( + np.sum(distances_tilda, axis=1, keepdims=True) == 0, + 1, + distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True), + ) + return distances_tilda_normalized + + +logger = setup_logger() + +# Reading in data +logger.info( + f"Reading in query dataset {par['input']} and reference datasets {par['reference']}" +) +q_adata = mu.read_h5ad(par["input"], mod=par["modality"]) +r_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + +# check arguments +logger.info("Checking arguments") +par = check_arguments(par) + +if par["input_obsm_distances"] and par["reference_obsm_distances"]: + logger.info( + "Using pre-calculated distances for KNN classification as provided in `--input_obsm_distances` and `--reference_obsm_distances`." + ) + + assert par["input_obsm_distances"] in q_adata.obsm, ( + f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}." + ) + assert par["reference_obsm_distances"] in r_adata.obsm, ( + f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}." + ) + + query_neighbors = q_adata.obsm[par["input_obsm_distances"]] + reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]] + + if query_neighbors.shape[1] != reference_neighbors.shape[1]: + raise ValueError( + "The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset." + ) + + # Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors + # Otherwise reduce n_neighbors for KNN + smallest_neighbor_count = min( + np.diff(query_neighbors.indptr).min(), np.diff(reference_neighbors.indptr).min() + ) + if smallest_neighbor_count < par["n_neighbors"]: + logger.warning( + f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification" + ) + par["n_neighbors"] = smallest_neighbor_count + +elif par["input_obsm_distances"] or par["reference_obsm_distances"]: + raise ValueError( + "Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification." + ) + +elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]: + logger.info( + "No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm." + ) + # Generating training and inference data + train_X = get_reference_features(r_adata, par, logger) + inference_X = get_query_features(q_adata, par, logger) + + neighbors_transformer = PyNNDescentTransformer( + n_neighbors=par["n_neighbors"], + parallel_batch_queries=True, + ) + neighbors_transformer.fit(train_X) + + # Square sparse matrix with distances to n neighbors in reference data + query_neighbors = neighbors_transformer.transform(inference_X) + reference_neighbors = neighbors_transformer.transform(train_X) + +# For each target, train a classifier and predict labels +for obs_tar, obs_pred, obs_proba in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], +): + logger.info(f"Predicting labels for {obs_tar}") + + weights_dict = { + "uniform": "uniform", + "distance": "distance", + "gaussian": distances_to_affinities, + } + + logger.info(f"Using KNN classifier with {par['weights']} weights") + train_y = r_adata.obs[obs_tar].to_numpy() + classifier = KNeighborsClassifier( + n_neighbors=par["n_neighbors"], + metric="precomputed", + weights=weights_dict[par["weights"]], + ) + classifier.fit(X=reference_neighbors, y=train_y) + predicted_labels = classifier.predict(query_neighbors) + probabilities = classifier.predict_proba(query_neighbors).max(axis=1) + + # save_results + logger.info( + f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs" + ) + q_adata.obs[obs_pred] = predicted_labels + q_adata.obs[obs_proba] = probabilities + +logger.info(f"Saving output data to {par['output']}") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], q_adata, par["output_compression"] +) diff --git a/src/labels_transfer/knn/test.py b/src/labels_transfer/knn/test.py new file mode 100644 index 00000000..002e7b0a --- /dev/null +++ b/src/labels_transfer/knn/test.py @@ -0,0 +1,202 @@ +import re +import subprocess +import pytest +from pathlib import Path +import anndata as ad +import mudata as mu +import numpy as np +from scipy.sparse import csr_matrix + +## VIASH START +meta = {"resources_dir": "./resources_test/"} +## VIASH END + +reference_h5ad_file = ( + f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +) +# convert reference to h5mu +reference_adata = ad.read_h5ad(reference_h5ad_file) +reference_mdata = mu.MuData({"rna": reference_adata}) +reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu" +reference_mdata.write_h5mu(reference_file) +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +def test_label_transfer(run_component, random_h5mu_path): + output = random_h5mu_path() + + args = [ + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--n_neighbors", + "5", + ] + + run_component(args) + + assert Path(output).is_file() + + output_data = mu.read_h5mu(output) + + assert "cell_type_pred" in output_data.mod["rna"].obs, ( + f"Predictions cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert "cell_type_probability" in output_data.mod["rna"].obs, ( + f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + + +@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"]) +def test_label_transfer_prediction_columns(run_component, weights, random_h5mu_path): + output = random_h5mu_path() + + args = [ + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--weights", + weights, + "--output", + output, + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", + ] + + run_component(args) + + assert Path(output).is_file() + + output_data = mu.read_h5mu(output) + + assert "test_prediction" in output_data.mod["rna"].obs, ( + f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert "test_probability" in output_data.mod["rna"].obs, ( + f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + + +def test_label_transfer_prediction_precomputed_neighbor_graph( + run_component, random_h5mu_path +): + output = random_h5mu_path() + + # Add mock distance matrix to obsm slot + reference_mdata = mu.read_h5mu(reference_file) + ref_distances = np.random.rand(400, 400) + ref_distances[ref_distances < 0.5] = 0 + ref_distances = csr_matrix(ref_distances) + reference_mdata.mod["rna"].obsm["distances"] = ref_distances + reference_mdata.write_h5mu(reference_file) + + query_mdata = mu.read_h5mu(input_file) + query_distances = np.random.rand(713, 400) + query_distances[query_distances < 0.5] = 0 + query_distances = csr_matrix(query_distances) + query_mdata.mod["rna"].obsm["distances"] = query_distances + query_mdata.write_h5mu(input_file) + + args = [ + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--input_obsm_distances", + "distances", + "--reference_obsm_distances", + "distances", + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", + ] + + run_component(args) + + assert Path(output).is_file() + + output_data = mu.read_h5mu(output) + + assert "test_prediction" in output_data.mod["rna"].obs, ( + f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert "test_probability" in output_data.mod["rna"].obs, ( + f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + ) + + +def test_raises_distance_matrix_dimensions(run_component, random_h5mu_path): + output = random_h5mu_path() + + reference_mdata = mu.read_h5mu(reference_file) + ref_distances = np.random.rand(400, 100) + ref_distances[ref_distances < 0.5] = 0 + ref_distances = csr_matrix(ref_distances) + reference_mdata.mod["rna"].obsm["distances"] = ref_distances + reference_mdata.write_h5mu(reference_file) + + query_mdata = mu.read_h5mu(input_file) + query_distances = np.random.rand(713, 400) + query_distances[query_distances < 0.5] = 0 + query_distances = csr_matrix(query_distances) + query_mdata.mod["rna"].obsm["distances"] = query_distances + query_mdata.write_h5mu(input_file) + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--input_obsm_distances", + "distances", + "--reference_obsm_distances", + "distances", + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", + ] + ) + assert re.search( + r"ValueError: The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/labels_transfer/utils/helper.py b/src/labels_transfer/utils/helper.py new file mode 100644 index 00000000..06d5d3a3 --- /dev/null +++ b/src/labels_transfer/utils/helper.py @@ -0,0 +1,42 @@ +def check_arguments(par): + # check output .obs predictions + if not par["output_obs_predictions"]: + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + ) + + # check output .obs uncertainty + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + ) + + return par + + +def get_reference_features(adata_reference, par, logger): + if par["reference_obsm_features"] is None: + logger.info("Using .X of reference data") + train_data = adata_reference.X + else: + logger.info(f"Using .obsm[{par['reference_obsm_features']}] of reference data") + train_data = adata_reference.obsm[par["reference_obsm_features"]] + + return train_data + + +def get_query_features(adata, par, logger): + if par["input_obsm_features"] is None: + logger.info("Using .X of query data") + query_data = adata.X + else: + logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") + query_data = adata.obsm[par["input_obsm_features"]] + + return query_data diff --git a/src/labels_transfer/xgboost/README.md b/src/labels_transfer/xgboost/README.md new file mode 100644 index 00000000..9bc038e3 --- /dev/null +++ b/src/labels_transfer/xgboost/README.md @@ -0,0 +1,53 @@ +# XGBoost Labels Transfer + +This component transfers labels from a reference dataset to an query using a boosting method [XGBoost](https://xgboost.readthedocs.io/en/stable/). + +# Input + +- `.h5mu` files for reference and query as specified in [config](./config.vsh.yaml) +- List of targets (labels) to predict from `.obs` slot of the reference. Common choice for HLCA would be `["ann_level_1", "ann_level_2", "ann_level_3", "ann_level_4", "ann_level_5", "ann_finest_level"]` +- models output directory + +For the description of other input arguments, refer to the [config](./config.vsh.yaml). + +# Output + +`.h5mu` file with the query dataset that contains the transferred annotations +- The annotations for each of the `targets` will be stored as new columns in the `.obs` data frame with the suffix `_pred` by default +- The uncertainty of the prediction for each target will be stored as another column in `.obs` with the suffix `_uncertainty` +- `labels_transfer` entry is added to the `.uns` dictionary of the output file, which contains information about the method used and its parameters. Keys of this dictionary are labels from `targets` list. Values are dictionaries with the following keys: + +| Key | Description | +|-------------|------------------------| +| method | Always the string "XGBClassifier" | +| learning_rate | Training parameter of XGBoost classifier used | +| min_split_loss | Training parameter of XGBoost classifier used | +| max_depth | Training parameter of XGBoost classifier used | +| min_child_weight | Training parameter of XGBoost classifier used | +| max_delta_step | Training parameter of XGBoost classifier used| +| subsample | Training parameter of XGBoost classifier used | +| sampling_method | Training parameter of XGBoost classifier used | +| colsample_bytree | Training parameter of XGBoost classifier used | +| colsample_bylevel | Training parameter of XGBoost classifier used | +| colsample_bynode | Training parameter of XGBoost classifier used| +| reg_lambda | Training parameter of XGBoost classifier used | +| reg_alpha | Training parameter of XGBoost classifier used | +| scale_pos_weight | Training parameter of XGBoost classifier used | + +- Models saved in models output directory (`./model` by default). File for each target is named `classifier_.xgb` +- `model_info.json` file saved in models output directory. It has the following format: +1. The first level key is always the string "classifier_info". This is done for the possibility of further development +2. The second level keys are labels from the list of `targets`. The values are dictionaries with the following format: + +| Key | Description | Example | +|-------------|------------------------|----------| +| filename | Name of the file containing classifier in models output directory| classifier_ann_level_1.xgb | +| labels | List of unique labels in the corresponding column in reference | ["Stroma", "Endothelial", "Immune"] | +| obs_column | Output column for the predicted labels in the output dataset | ann_level_1_pred | +| model_params | Dictionary with the training parameters, the same that in `.uns` dictionary of the output dataset | {"learning_rate": 0.3, ...} | + +# Notes + +- If there are suitable files with classifiers in the models output directory, the trained models from these files would be used for the labels transfer +- New models are trained only for targets, for which trained classifiers are not found +- If you want to retrain all the classifier, use the `force_retrain` parameter during running of the component. Be carefut: it may overwrite the files in models output directory diff --git a/src/labels_transfer/xgboost/config.vsh.yaml b/src/labels_transfer/xgboost/config.vsh.yaml new file mode 100644 index 00000000..c022584e --- /dev/null +++ b/src/labels_transfer/xgboost/config.vsh.yaml @@ -0,0 +1,150 @@ +name: xgboost +namespace: "labels_transfer" +scope: "public" +description: "Performs label transfer from reference to query using XGBoost classifier" +info: + method_id: XGBClassifier +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] +__merge__: ../api/common_arguments.yaml +argument_groups: + - name: "Execution arguments" + arguments: + - name: "--force_retrain" + alternatives: ["-f"] + type: boolean_true + description: "Retrain models on the reference even if model_output directory already has trained classifiers. WARNING! It will rewrite existing classifiers for targets in the model_output directory!" + - name: "--use_gpu" + type: boolean + description: "Use GPU during models training and inference (recommended)." + default: false + - name: "--verbosity" + alternatives: ["-v"] + type: integer + description: "The verbosity level for evaluation of the classifier from the range [0,2]" + required: false + default: 1 + - name: "--model_output" + type: file + default: "model" + description: Output directory for model + direction: output + required: false + - name: "--output_uns_parameters" + type: string + default: "xgboost_parameters" + description: The key in `uns` slot of the output AnnData object to store the parameters of the XGBoost classifier. + required: false + - name: "Learning parameters" + arguments: + - name: "--learning_rate" + alternatives: ["--eta"] + type: double + description: "Step size shrinkage used in update to prevents overfitting. Range: [0,1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 0.3 + - name: "--min_split_loss" + alternatives: ["--gamma"] + type: double + description: "Minimum loss reduction required to make a further partition on a leaf node of the tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 0 + - name: "--max_depth" + alternatives: ["-d"] + type: integer + description: "Maximum depth of a tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 6 + - name: "--min_child_weight" + type: integer + description: "Minimum sum of instance weight (hessian) needed in a child. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--max_delta_step" + type: double + description: "Maximum delta step we allow each leaf output to be. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 0 + - name: "--subsample" + type: double + description: "Subsample ratio of the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--sampling_method" + type: string + choices: [uniform, gradient_based] + description: "The method to use to sample the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: "uniform" + - name: "--colsample_bytree" + type: double + description: "Fraction of columns to be subsampled. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--colsample_bylevel" + type: double + description: "Subsample ratio of columns for each level. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--colsample_bynode" + type: double + description: "Subsample ratio of columns for each node (split). Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--reg_lambda" + alternatives: ["--lambda"] + type: double + description: "L2 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 + - name: "--reg_alpha" + alternatives: ["--alpha"] + type: double + description: "L1 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 0 + - name: "--scale_pos_weight" + type: double + description: "Control the balance of positive and negative weights, useful for unbalanced classes. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference" + required: false + default: 1 +resources: + - type: python_script + path: script.py + - path: ../utils/helper.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/ + - path: /resources_test/pbmc_1k_protein_v3/ +engines: + - type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - git + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + - type: apt + packages: + - libopenblas-dev + - liblapack-dev + - gfortran + - type: python + __merge__: [/src/base/requirements/scanpy.yaml, .] + packages: + - xgboost~=2.1.3 + - scikit-learn<1.6 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, highcpu, gpu] diff --git a/src/labels_transfer/xgboost/script.py b/src/labels_transfer/xgboost/script.py new file mode 100644 index 00000000..ca21c020 --- /dev/null +++ b/src/labels_transfer/xgboost/script.py @@ -0,0 +1,438 @@ +import sys +import json +import os +from typing import Optional +import yaml +from pathlib import Path + +import mudata +import numpy as np +import pandas as pd +import xgboost as xgb +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.preprocessing import LabelEncoder + + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_obsm_features": "X_integrated_scanvi", + "reference": "https://zenodo.org/record/6337966/files/HLCA_emb_and_metadata.h5ad", + "reference_obsm_features": "X_integrated_scanvi", + "reference_obs_targets": [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level", + ], + "output": "foo.h5mu", + "output_obs_predictions": None, + "output_obs_probability": None, + "output_uns_parameters": "labels_transfer", + "force_retrain": False, + "use_gpu": True, + "verbosity": 1, + "learning_rate": 0.3, + "min_split_loss": 0, + "max_depth": 6, + "min_child_weight": 1, + "max_delta_step": 0, + "subsample": 1, + "sampling_method": "uniform", + "colsample_bytree": 1, + "colsample_bylevel": 1, + "colsample_bynode": 1, + "reg_lambda": 1, + "reg_alpha": 0, + "scale_pos_weight": 1, +} +meta = { + "resources_dir": "src/labels_transfer/utils", + "config": "src/labels_transfer/xgboost/config.vsh.yaml", +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +# look for training params for method +argument_groups = {grp["name"]: grp["arguments"] for grp in config["argument_groups"]} +training_arg_names = [ + arg["name"].replace("--", "") for arg in argument_groups["Learning parameters"] +] +training_params = {arg_name: par[arg_name] for arg_name in training_arg_names} + + +def encode_labels(y): + labels_encoder = LabelEncoder() + labels_encoder.fit(y) + + return labels_encoder.transform(y), labels_encoder + + +def get_model_eval(xgb_model, X_test, y_test, labels_encoder): + preds = xgb_model.predict(X_test) + + cr = classification_report( + labels_encoder.inverse_transform(y_test), + labels_encoder.inverse_transform(preds), + output_dict=True, + ) + cr_df = pd.DataFrame(cr).transpose() + + return cr_df + + +def train_test_split_adata(adata, labels): + train_data = pd.DataFrame(data=adata.X, index=adata.obs_names) + + X_train, X_test, y_train, y_test = train_test_split( + train_data, labels, test_size=0.2, random_state=42, stratify=labels + ) + + return X_train, X_test, y_train, y_test + + +def train_xgb_model(X_train, y_train, gpu=True) -> xgb.XGBClassifier: + n_classes = len(np.unique(y_train)) + objective = "binary:logistic" if n_classes == 2 else "multi:softprob" + + tree_method = "gpu_hist" if gpu else "hist" + xgbc = xgb.XGBClassifier( + tree_method=tree_method, objective=objective, **training_params + ) + xgbc.fit(X_train, y_train) + + return xgbc + + +def build_classifier( + X, y, labels_encoder, label_key, eval_verbosity: Optional[int] = 1, gpu=True +) -> xgb.XGBClassifier: + # Adata prep + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, stratify=y + ) + # Note: Do we need a new train-test split for each classifier? + + # Model training + xgb_model = train_xgb_model(X_train, y_train, gpu=gpu) + + # Model eval + if eval_verbosity != 0: + cr_df = get_model_eval(xgb_model, X_test, y_test, labels_encoder) + + if eval_verbosity == 2: + print(cr_df) + + else: + overall_accuracy = cr_df["support"]["accuracy"] + low_prec_key = cr_df.precision.idxmin() + low_prec_val = cr_df.precision.min() + low_rec_key = cr_df.recall.idxmin() + low_rec_val = cr_df.recall.min() + low_f1_key = cr_df["f1-score"].idxmin() + low_f1_val = cr_df["f1-score"].min() + + print("") + print(f"Summary stats for {label_key} model:") + print(f"Overall accuracy: {overall_accuracy}") + print(f"Min. precision: {low_prec_key}: {low_prec_val}") + print(f"Min. Recall: {low_rec_key}: {low_rec_val}") + print(f"Min. F1-score: {low_f1_key}: {low_f1_val}") + print("") + + return xgb_model + + +def build_ref_classifiers( + adata_reference, + targets, + model_path, + eval_verbosity: Optional[int] = 1, + gpu: Optional[bool] = True, +) -> None: + """ + This function builds xgboost classifiers on a reference embedding for a designated number of + adata_reference.obs columns. Classifier .xgb files and a model_info.json file is written to the `model_path` + directory. Model evaluation is printed to stdout. + + Inputs: + * `adata_reference`: The AnnData object that was used to train the reference model + * `model_path`: The reference model directory where the classifiers will also be stored + * `eval_verbosity`: The verbosity level for evaluation of the classifier from the range [0;2]. + * `gpu`: Boolean indicating whether a gpu is available for classifier training + + + Example: + ``` + >>> adata + AnnData object with n_obs x n_vars = 700 x 765 + obs: "ann_finest_level", "ann_level_1" + + >>> os.listdir("/path/to/model") + model_params.pt* + + >>> build_ref_classifiers(adata, "path/to/model", eval_verbosity=1, gpu=True) + >>> os.listdir("/path/to/model") + classifier_ann_finest_level.xgb* model_info.json* + classifier_ann_level_1.xgb* model_params.pt* + ``` + """ + + # Check inputs + if not isinstance(eval_verbosity, int): + raise TypeError("`eval_verbosity` should be an integer between 0 and 2.") + + if eval_verbosity < 0 or eval_verbosity > 2: + raise ValueError("`eval_verbosity` should be an integer between 0 and 2.") + + train_data = get_reference_features(adata_reference, par, logger) + + if not os.path.exists(model_path): + os.makedirs(model_path, exist_ok=True) + + # Map from name of classifier to file names + classifiers = dict() + + for label, obs_pred in zip(targets, par["output_obs_predictions"]): + if label not in adata_reference.obs: + raise ValueError(f"{label} is not in the `adata` object passed!") + + filename = "classifier_" + label + ".xgb" + + labels, labels_encoder = encode_labels(adata_reference.obs[label]) + logger.info(f"Classes: {labels_encoder.classes_}") + + logger.info(f"Building classifier for {label}...") + xgb_model = build_classifier( + X=train_data, + y=labels, + labels_encoder=labels_encoder, + label_key=label, + eval_verbosity=eval_verbosity, + gpu=gpu, + ) + + # Save classifier + logger.info("Saving model") + xgb_model.save_model(os.path.join(model_path, filename)) + + # Store classifier info + classifiers[label] = { + "filename": filename, + "labels": labels_encoder.classes_.tolist(), + "obs_column": obs_pred, + "model_params": training_params, + } + + # Store model_info.json file + model_info = {"classifier_info": classifiers} + + logger.info("Writing model_info to the file") + # Read previous file if it exists + if os.path.exists(model_path + "/model_info.json"): + logger.info("Old model_info file found, updating") + with open(model_path + "/model_info.json", "r") as f: + old_model_info = json.loads(f.read()) + + for key in old_model_info: + if key in model_info: + old_model_info[key].update(model_info[key]) + json_string = json.dumps(old_model_info, indent=4) + + else: + logger.info("Creating a new file") + json_string = json.dumps(model_info, indent=4) + + with open(model_path + "/model_info.json", "w") as f: + f.write(json_string) + + +def project_labels( + query_dataset, + cell_type_classifier_model: xgb.XGBClassifier, + annotation_column_name="label_pred", + probability_column_name="label_probability", + probability_thresh=None, # Note: currently not passed to predict function +): + """ + A function that projects predicted labels onto the query dataset, along with probability estimations. + Performs in-place update of the adata object, adding columns to the `obs` DataFrame. + + Input: + * `query_dataset`: The query `AnnData` object + * `model_file`: Path to the classification model file + * `prediction_key`: Column name in `adata.obs` where to store the predicted labels + * `probability_key`: Column name in `adata.obs` where to store the labels probabilities + * `probability_thresh`: The probability threshold below which we call a cell 'Unknown' + + Output: + Nothing is output, the passed anndata is modified inplace + + """ + + if (probability_thresh is not None) and ( + probability_thresh < 0 or probability_thresh > 1 + ): + raise ValueError("`probability_thresh` must be `None` or between 0 and 1.") + + query_data = get_query_features(query_dataset, par, logger) + + # Predict labels and probabilities + query_dataset.obs[annotation_column_name] = cell_type_classifier_model.predict( + query_data + ) + + logger.info("Predicting probabilities") + probs = cell_type_classifier_model.predict_proba(query_data) + + # Format probabilities + df_probs = pd.DataFrame( + probs, + columns=cell_type_classifier_model.classes_, + index=query_dataset.obs_names, + ) + query_dataset.obs[probability_column_name] = df_probs.max(1) + + # Note: this is here in case we want to propose a set of values for the user to accept to seed the + # manual curation of predicted labels + if probability_thresh is not None: + logger.info("Marking uncertain predictions") + query_dataset.obs[annotation_column_name + "_filtered"] = [ + val + if query_dataset.obs[probability_column_name][i] >= probability_thresh + else "Unknown" + for i, val in enumerate(query_dataset.obs[annotation_column_name]) + ] + + return query_dataset + + +def predict( + query_dataset, + cell_type_classifier_model_path, + annotation_column_name: str, + prediction_column_name: str, + probability_column_name: str, + models_info, + use_gpu: bool = False, +) -> pd.DataFrame: + """ + Returns `obs` DataFrame with prediction columns appended + """ + + tree_method = "gpu_hist" if use_gpu else "hist" + + labels = models_info["classifier_info"][annotation_column_name]["labels"] + + objective = "binary:logistic" if len(labels) == 2 else "multi:softprob" + cell_type_classifier_model = xgb.XGBClassifier( + tree_method=tree_method, objective=objective + ) + + logger.info("Loading model") + cell_type_classifier_model.load_model(fname=cell_type_classifier_model_path) + + logger.info("Predicting labels") + project_labels( + query_dataset, + cell_type_classifier_model, + annotation_column_name=prediction_column_name, + probability_column_name=probability_column_name, + ) + + logger.info("Converting labels from numbers to classes") + labels_encoder = LabelEncoder() + labels_encoder.classes_ = np.array(labels) + query_dataset.obs[prediction_column_name] = labels_encoder.inverse_transform( + query_dataset.obs[prediction_column_name] + ) + + return query_dataset + + +def main(par): + logger.info("Checking arguments") + par = check_arguments(par) + + adata_query = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_reference = mudata.read_h5ad(par["reference"], mod=par["modality"]) + + # If classifiers for targets are in the model_output directory, simply open them and run (unless `retrain` != True) + # If some classifiers are missing, train and save them first + # Predict and save the query data + + targets_to_train = [] + + for obs_target in par["reference_obs_targets"]: + if ( + not os.path.exists(par["model_output"]) + or f"classifier_{obs_target}.xgb" not in os.listdir(par["model_output"]) + or par["force_retrain"] + ): + logger.info(f"Classifier for {obs_target} added to a training schedule") + targets_to_train.append(obs_target) + else: + logger.info(f"Found classifier for {obs_target}, no retraining required") + + build_ref_classifiers( + adata_reference, + targets_to_train, + model_path=par["model_output"], + gpu=par["use_gpu"], + eval_verbosity=par["verbosity"], + ) + + output_uns_parameters = adata_query.uns.get(par["output_uns_parameters"], {}) + + with open(par["model_output"] + "/model_info.json", "r") as f: + models_info = json.loads(f.read()) + + for obs_target, obs_pred, obs_unc in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], + ): + logger.info(f"Predicting {obs_target}") + + adata_query = predict( + query_dataset=adata_query, + cell_type_classifier_model_path=os.path.join( + par["model_output"], "classifier_" + obs_target + ".xgb" + ), + annotation_column_name=obs_target, + prediction_column_name=obs_pred, + probability_column_name=obs_unc, + models_info=models_info, + use_gpu=par["use_gpu"], + ) + + if obs_target in targets_to_train: + # Save information about the transfer to .uns + output_uns_parameters[obs_target] = { + "method": "XGBClassifier", + **training_params, + } + + adata_query.uns[par["output_uns_parameters"]] = output_uns_parameters + + logger.info("Writing output to %s", par["output"]) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata_query, None + ) + + +if __name__ == "__main__": + main(par) diff --git a/src/labels_transfer/xgboost/test.py b/src/labels_transfer/xgboost/test.py new file mode 100644 index 00000000..51dc7936 --- /dev/null +++ b/src/labels_transfer/xgboost/test.py @@ -0,0 +1,252 @@ +import sys +import pytest +from pathlib import Path +import anndata +import mudata +import numpy as np + +## VIASH START +meta = { + "executable": "./target/executable/labels_transfer/xgboost/xgboost", + "resources_dir": "./resources_test/", +} +## VIASH END + +reference_h5ad_file = ( + f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +) +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def test_args(tmp_path, request): + obsm_features, obs_targets, output_uns_parameters = request.param + + tempfile_reference_file = tmp_path / "reference.h5mu" + tempfile_input_file = tmp_path / "input.h5mu" + + # read reference + reference_adata = anndata.read_h5ad(reference_h5ad_file) + + # generate reference obs targets + for i, target in enumerate(obs_targets): + class_names = [ + str(idx) for idx in range(i + 2) + ] # e.g. ["0", "1", "2"], the higher the level, the more the classes + reference_adata.obs[target] = np.random.choice( + class_names, size=reference_adata.n_obs + ) + + # read input query + input_mudata = mudata.read_h5mu(input_file) + input_rna_adata = input_mudata.mod["rna"] + + # generate features + reference_adata.obsm[obsm_features] = np.random.normal( + size=(reference_adata.n_obs, 30) + ) + input_rna_adata.obsm[obsm_features] = np.random.normal( + size=(input_rna_adata.n_obs, 30) + ) + + reference_mdata = mudata.MuData({"rna": reference_adata}) + # write files + reference_mdata.write_h5mu(str(tempfile_reference_file)) + input_mudata.write_h5mu(str(tempfile_input_file)) + + return ( + tempfile_reference_file, + reference_adata, + tempfile_input_file, + input_rna_adata, + obsm_features, + obs_targets, + output_uns_parameters, + ) + + +@pytest.mark.parametrize( + "test_args", + [ + ("X_integrated_scvi", ["celltype"], None), + ("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran"), + ], + indirect=True, +) +def test_label_transfer(run_component, test_args): + ( + tempfile_reference_file, + _, + tempfile_input_file, + _, + obsm_features, + obs_targets, + output_uns_parameters, + ) = test_args + + args = [ + "--input", + str(tempfile_input_file), + "--modality", + "rna", + "--input_obsm_features", + obsm_features, + "--reference", + str(tempfile_reference_file), + "--reference_obsm_features", + obsm_features, + "--reference_obs_targets", + ";".join(obs_targets), + "--output", + "output.h5mu", + "--model_output", + "model_one_class", + "--use_gpu", + "false", + "--max_depth", + "6", + ] + + if output_uns_parameters is not None: + args.extend(["--output_uns_parameters", output_uns_parameters]) + + run_component(args) + + assert Path("output.h5mu").is_file() + + output_data = mudata.read_h5mu("output.h5mu") + + exp_uns = ( + "xgboost_parameters" if output_uns_parameters is None else output_uns_parameters + ) + + for target in obs_targets: + assert f"{target}_pred" in output_data.mod["rna"].obs, ( + f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert f"{target}_probability" in output_data.mod["rna"].obs, ( + f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert exp_uns in output_data.mod["rna"].uns, ( + f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + ) + assert target in output_data.mod["rna"].uns[exp_uns], ( + f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + ) + assert ( + output_data.mod["rna"].uns[exp_uns][target].get("method") == "XGBClassifier" + ), f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" + assert output_data.mod["rna"].uns[exp_uns][target].get("max_depth") == 6, ( + f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + ) + + +@pytest.mark.parametrize( + "test_args", + [("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran")], + indirect=True, +) +def test_retraining(run_component, test_args, tmp_path): + output_model = tmp_path / "model_retraining" + output_path = tmp_path / "output.h5mu" + output2_path = tmp_path / "output2.h5mu" + ( + tempfile_reference_file, + _, + tempfile_input_file, + _, + obsm_features, + obs_targets, + output_uns_parameters, + ) = test_args + + # Train first 2 targets + args = [ + "--modality", + "rna", + "--input_obsm_features", + obsm_features, + "--reference", + str(tempfile_reference_file), + "--reference_obsm_features", + obsm_features, + "--model_output", + str(output_model), + ] + + if output_uns_parameters is not None: + args.extend(["--output_uns_parameters", output_uns_parameters]) + + args1 = args + [ + "--input", + str(tempfile_input_file), + "--output", + str(output_path), + "--reference_obs_targets", + ";".join(obs_targets[:2]), + "--max_depth", + "6", + ] + run_component(args1) + + assert output_path.is_file() + + # Add more targets + # Now the code should use 2 previously trained models, + # and train only the remaining targets + args2 = args + [ + "--input", + str(output_path), + "--output", + str(output2_path), + "--reference_obs_targets", + ";".join(obs_targets), + "--max_depth", + "4", + ] + run_component(args2) + + assert output2_path.is_file() + + output_data = mudata.read_h5mu(output2_path) + + for target in obs_targets: + assert f"{target}_pred" in output_data.mod["rna"].obs, ( + f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert f"{target}_probability" in output_data.mod["rna"].obs, ( + f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" + ) + assert output_uns_parameters in output_data.mod["rna"].uns, ( + f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + ) + assert target in output_data.mod["rna"].uns[output_uns_parameters], ( + f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + ) + assert ( + output_data.mod["rna"].uns[output_uns_parameters][target].get("method") + == "XGBClassifier" + ), f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" + + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[0]] + .get("max_depth") + == 6 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[1]] + .get("max_depth") + == 6 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[2]] + .get("max_depth") + == 4 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/mapping/bd_rhapsody/config.vsh.yaml b/src/mapping/bd_rhapsody/config.vsh.yaml new file mode 100644 index 00000000..cef89704 --- /dev/null +++ b/src/mapping/bd_rhapsody/config.vsh.yaml @@ -0,0 +1,656 @@ +name: "bd_rhapsody" +namespace: "mapping" +scope: "public" +description: | + BD Rhapsody Sequence Analysis CWL pipeline v2.2.1 + + This pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported + sequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome + mRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and + ATAC-Seq + + The CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML. +info: + keywords: [rna-seq, single-cell, multiomic, atac-seq, targeted, abseq, tcr, bcr] + links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1 + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com + license: Unknown +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: +- name: Inputs + arguments: + - name: "--reads" + type: file + description: | + Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: + + - WTA mRNA + - Targeted mRNA + - AbSeq + - Sample Multiplexing + - VDJ + + You may specify as many R1/R2 read pairs as you want. + required: false + multiple: true + example: + - WTALibrary_S1_L001_R1_001.fastq.gz + - WTALibrary_S1_L001_R2_001.fastq.gz + info: + config_key: Reads + - name: "--reads_atac" + type: file + description: | + Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. + You may specify as many R1/R2/I2 files as you want. + required: false + multiple: true + example: + - ATACLibrary_S2_L001_R1_001.fastq.gz + - ATACLibrary_S2_L001_R2_001.fastq.gz + - ATACLibrary_S2_L001_I2_001.fastq.gz + info: + config_key: Reads_ATAC +- name: References + description: | + Assay type will be inferred from the provided reference(s). + Do not provide both reference_archive and targeted_reference at the same time. + + Valid reference input combinations: + - reference_archive: WTA only + - reference_archive & abseq_reference: WTA + AbSeq + - reference_archive & supplemental_reference: WTA + extra transgenes + - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes + - reference_archive: WTA + ATAC or ATAC only + - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes + - targeted_reference: Targeted only + - targeted_reference & abseq_reference: Targeted + AbSeq + - abseq_reference: AbSeq only + + The reference_archive can be generated with the bd_rhapsody_make_reference component. + Alternatively, BD also provides standard references which can be downloaded from these locations: + + - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz + - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz + arguments: + - name: "--reference_archive" + type: file + description: | + Path to Rhapsody WTA Reference in the tar.gz format. + + Structure of the reference archive: + + - `BD_Rhapsody_Reference_Files/`: top level folder + - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate` + - GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf" + example: "RhapRef_Human_WTA_2023-02.tar.gz" + required: false + info: + config_key: Reference_Archive + - name: "--targeted_reference" + type: file + description: | + Path to the targeted reference file in FASTA format. + example: "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + multiple: true + info: + config_key: Targeted_Reference + - name: "--abseq_reference" + type: file + description: Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. + example: "AbSeq_reference.fasta" + multiple: true + info: + config_key: AbSeq_Reference + - name: "--supplemental_reference" + type: file + alternatives: [-s] + description: Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment. + example: "supplemental_reference.fasta" + multiple: true + info: + config_key: Supplemental_Reference +- name: Outputs + description: Outputs for all pipeline runs + arguments: + - name: "--output_dir" + type: file + direction: output + alternatives: [-o] + description: "The unprocessed output directory containing all the outputs from the pipeline." + required: true + example: output_dir/ + - name: "--output_seurat" + type: file + direction: output + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing RSEC molecules data table and all cell annotation metadata." + example: output_seurat.rds + required: false + info: + template: "sample_Seurat.rds" + - name: "--output_mudata" + type: file + direction: output + example: output_mudata.h5mu + required: false + info: + template: "sample.h5mu" + - name: "--metrics_summary" + type: file + direction: output + description: "Metrics Summary. Report containing sequencing, molecules, and cell metrics." + example: metrics_summary.csv + required: false + info: + template: "sample_Metrics_Summary.csv" + - name: "--pipeline_report" + type: file + direction: output + description: "Pipeline Report. Summary report containing the results from the sequencing analysis pipeline run." + example: pipeline_report.html + required: false + info: + template: "sample_Pipeline_Report.html" + - name: "--rsec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell bassed on RSEC" + example: RSEC_MolsPerCell_MEX.zip + required: false + info: + template: "sample_RSEC_MolsPerCell_MEX.zip" + - name: "--dbec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell bassed on DBEC. DBEC data table is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + example: DBEC_MolsPerCell_MEX.zip + required: false + info: + template: "sample_DBEC_MolsPerCell_MEX.zip" + - name: "--rsec_mols_per_cell_unfiltered" + type: file + direction: output + description: "Unfiltered tables containing all cell labels with 10 reads." + example: RSEC_MolsPerCell_Unfiltered_MEX.zip + required: false + info: + template: "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip" + - name: "--bam" + type: file + direction: output + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + example: BioProduct.bam + required: false + info: + template: "sample_Bioproduct.bam" + - name: "--bam_index" + type: file + direction: output + description: "Index file for the alignment file." + example: BioProduct.bam.bai + required: false + info: + template: "sample_Bioproduct.bam.bai" + - name: "--bioproduct_stats" + type: file + direction: output + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier adjustment algorithms on a per-bioproduct basis." + example: Bioproduct_Stats.csv + required: false + info: + template: "sample_Bioproduct_Stats.csv" + - name: "--dimred_tsne" + type: file + direction: output + description: "t-SNE dimensionality reduction coordinates per cell index" + example: tSNE_coordinates.csv + required: false + info: + template: "sample_assay_tSNE_coordinates.csv" + - name: "--dimred_umap" + type: file + direction: output + description: "UMAP dimensionality reduction coordinates per cell index" + example: UMAP_coordinates.csv + required: false + info: + template: "sample_assay_UMAP_coordinates.csv" + - name: "--immune_cell_classification" + type: file + direction: output + description: "Immune Cell Classification. Cell type classification based on the expression of immune cell markers." + example: Immune_Cell_Classification.csv + required: false + info: + template: "sample_assay_cell_type_experimental.csv" +- name: Multiplex outputs + description: Outputs when multiplex option is selected + arguments: + - name: "--sample_tag_metrics" + type: file + direction: output + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + example: Sample_Tag_Metrics.csv + required: false + info: + template: "sample_Sample_Tag_Metrics.csv" + - name: "--sample_tag_calls" + type: file + direction: output + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + example: Sample_Tag_Calls.csv + required: false + info: + template: "sample_Sample_Tag_Calls.csv" + - name: "--sample_tag_counts" + type: file + direction: output + description: "Sample Tag Counts. Separate data tables and metric summary for cells assigned to each sample tag. Note: For putative cells that could not be assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." + example: Sample_Tag1.zip + required: false + multiple: true + info: + template: "sample_Sample_Tag.zip" + - name: "--sample_tag_counts_unassigned" + type: file + direction: output + description: "Sample Tag Counts Unassigned. Data table and metric summary for cells that could not be assigned a specific Sample Tag." + example: Multiplet_and_Undetermined.zip + required: false + info: + template: "sample_Multiplet_and_Undetermined.zip" +- name: VDJ Outputs + description: Outputs when VDJ option selected + arguments: + - name: "--vdj_metrics" + type: file + direction: output + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + example: VDJ_Metrics.csv + required: false + info: + template: "sample_VDJ_Metrics.csv" + - name: "--vdj_per_cell" + type: file + direction: output + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell.csv + required: false + info: + template: "sample_VDJ_perCell.csv" + - name: "--vdj_per_cell_uncorrected" + type: file + direction: output + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell_uncorrected.csv + required: false + info: + template: "sample_VDJ_perCell_uncorrected.csv" + - name: "--vdj_dominant_contigs" + type: file + direction: output + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain type combination (putative cells only)." + example: VDJ_Dominant_Contigs_AIRR.csv + required: false + info: + template: "sample_VDJ_Dominant_Contigs_AIRR.csv" + - name: "--vdj_unfiltered_contigs" + type: file + direction: output + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated successfully (all cells)." + example: VDJ_Unfiltered_Contigs_AIRR.csv + required: false + info: + template: "sample_VDJ_Unfiltered_Contigs_AIRR.csv" +- name: "ATAC-Seq outputs" + description: Outputs when ATAC-Seq option selected + arguments: + - name: "--atac_metrics" + type: file + direction: output + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + example: ATAC_Metrics.csv + required: false + info: + template: "sample_ATAC_Metrics.csv" + - name: "--atac_metrics_json" + type: file + direction: output + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON format." + example: ATAC_Metrics.json + required: false + info: + template: "sample_ATAC_Metrics.json" + - name: "--atac_fragments" + type: file + direction: output + description: "ATAC Fragments. Chromosomal location, cell index, and read support for each fragment detected" + example: ATAC_Fragments.bed.gz + required: false + info: + template: "sample_ATAC_Fragments.bed.gz" + - name: "--atac_fragments_index" + type: file + direction: output + description: "Index of ATAC Fragments." + example: ATAC_Fragments.bed.gz.tbi + required: false + info: + template: "sample_ATAC_Fragments.bed.gz.tbi" + - name: "--atac_transposase_sites" + type: file + direction: output + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read support for each transposase site detected" + example: ATAC_Transposase_Sites.bed.gz + required: false + info: + template: "sample_ATAC_Transposase_Sites.bed.gz" + - name: "--atac_transposase_sites_index" + type: file + direction: output + description: "Index of ATAC Transposase Sites." + example: ATAC_Transposase_Sites.bed.gz.tbi + required: false + info: + template: "sample_ATAC_Transposase_Sites.bed.gz.tbi" + - name: "--atac_peaks" + type: file + direction: output + description: "ATAC Peaks. Peak regions of transposase activity" + example: ATAC_Peaks.bed.gz + required: false + info: + template: "sample_ATAC_Peaks.bed.gz" + - name: "--atac_peaks_index" + type: file + direction: output + description: "Index of ATAC Peaks." + example: ATAC_Peaks.bed.gz.tbi + required: false + info: + template: "sample_ATAC_Peaks.bed.gz.tbi" + - name: "--atac_peak_annotation" + type: file + direction: output + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + example: peak_annotation.tsv.gz + required: false + info: + template: "sample_peak_annotation.tsv.gz" + - name: "--atac_cell_by_peak" + type: file + direction: output + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + example: ATAC_Cell_by_Peak_MEX.zip + required: false + info: + template: "sample_ATAC_Cell_by_Peak_MEX.zip" + - name: "--atac_cell_by_peak_unfiltered" + type: file + direction: output + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels with >=1 transposase sites in peaks." + example: ATAC_Cell_by_Peak_Unfiltered_MEX.zip + required: false + info: + template: "sample_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + - name: "--atac_bam" + type: file + direction: output + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations for ATAC-Seq. Only output if the BAM generation flag is set to true." + example: ATAC.bam + required: false + info: + template: "sample_ATAC.bam" + - name: "--atac_bam_index" + type: file + direction: output + description: "Index of ATAC BAM." + example: ATAC.bam.bai + required: false + info: + template: "sample_ATAC.bam.bai" +- name: AbSeq Cell Calling outputs + description: Outputs when Cell Calling Abseq is selected + arguments: + - name: "--protein_aggregates_experimental" + type: file + direction: output + description: "Protein Aggregates Experimental" + example: Protein_Aggregates_Experimental.csv + required: false + info: + template: "sample_Protein_Aggregates_Experimental.csv" +- name: Putative Cell Calling Settings + arguments: + - name: "--cell_calling_data" + type: string + description: | + Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC + + For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. + + For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. + + The default data for putative cell calling, will be determined the following way: + + - If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC + - If only ATAC Reads exist: ATAC + - Otherwise: mRNA + choices: [mRNA, AbSeq, ATAC, mRNA_and_ATAC] + example: mRNA + info: + config_key: Cell_Calling_Data + - name: "--cell_calling_bioproduct_algorithm" + type: string + description: | + Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_Bioproduct_Algorithm + - name: "--cell_calling_atac_algorithm" + type: string + description: | + Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_ATAC_Algorithm + - name: "--exact_cell_count" + type: integer + description: | + Set a specific number of cells as putative, based on those with the highest error-corrected read count + example: 10000 + min: 1 + info: + config_key: Exact_Cell_Count + - name: "--expected_cell_count" + type: integer + description: | + Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. + example: 20000 + min: 1 + info: + config_key: Expected_Cell_Count +- name: Intronic Reads Settings + arguments: + - name: --exclude_intronic_reads + type: boolean + description: | + By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. + The value can be true or false. + example: false + info: + config_key: Exclude_Intronic_Reads +- name: Multiplex Settings + arguments: + - name: "--sample_tags_version" + type: string + description: | + Specify the version of the Sample Tags used in the run: + + * If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only + * If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. + choices: [human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only] + example: human + info: + config_key: Sample_Tags_Version + - name: "--tag_names" + type: string + description: | + Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv + Do not use the special characters. + multiple: true + example: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + info: + config_key: Tag_Names +- name: VDJ arguments + arguments: + - name: "--vdj_version" + type: string + description: | + If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR + choices: [human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR] + example: human + info: + config_key: VDJ_Version +- name: ATAC options + arguments: + - name: "--predefined_atac_peaks" + type: file + description: An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. + example: predefined_peaks.bed + info: + config_key: Predefined_ATAC_Peaks +- name: Additional options + arguments: + - name: "--run_name" + type: string + description: | + Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. + default: sample + info: + config_key: Run_Name + - name: "--generate_bam" + type: boolean + description: | + Specify whether to create the BAM file output + default: false + info: + config_key: Generate_Bam + - name: "--long_reads" + type: boolean + description: | + Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. + info: + config_key: Long_Reads +- name: Advanced options + description: | + NOTE: Only change these if you are really sure about what you are doing + arguments: + - name: "--custom_star_params" + type: string + description: | + Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. + For reference this is the default that is used: + + Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000` + Long Reads: Same as Short Reads + `--seedPerReadNmax 10000` + + This applies to fastqs provided in the Reads user input + Do NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc. + We use STAR version 2.7.10b + example: "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + info: + config_key: Custom_STAR_Params + - name: "--custom_bwa_mem2_params" + type: string + description: | + Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline + The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used + This applies to fastqs provided in the Reads_ATAC user input + Do NOT set any non-mapping related params like `-C`, `-t`, etc. + We use bwa-mem2 version 2.2.1 + example: "-k 16 -w 200 -r" + info: + config_key: Custom_bwa_mem2_Params +- name: CWL-runner arguments + arguments: + - name: "--parallel" + type: boolean + description: "Run jobs in parallel." + default: true + - name: "--timestamps" + type: boolean_true + description: "Add timestamps to the errors, warnings, and notifications." +- name: Undocumented arguments + arguments: + - name: --abseq_umi + type: integer + multiple: false + info: + config_key: AbSeq_UMI + - name: --target_analysis + type: boolean + multiple: false + info: + config_key: Target_analysis + - name: --vdj_jgene_evalue + type: double + description: | + e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_JGene_Evalue + - name: --vdj_vgene_evalue + type: double + description: | + e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_VGene_Evalue + - name: --write_filtered_reads + description: + Output processed FASTQ reads. + type: boolean + multiple: false + info: + config_key: Write_Filtered_Reads + +resources: + - type: python_script + path: script.py + - path: rhapsody_pipeline_2.2.1_nodocker.cwl +test_resources: + - type: python_script + path: test.py + - path: rhapsody_cell_label.py + - path: /resources_test/reference_gencodev41_chr1/reference.fa.gz + - path: /resources_test/reference_gencodev41_chr1/reference.gtf.gz + - path: /resources_test/reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz + - path: /resources_test/bdrhap_5kjrt/raw + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps] + - type: python + packages: [cwlref-runner, cwl-runner, ruamel.yaml, biopython, gffutils] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/bd_rhapsody/make_rhap_reference_2.2.1_nodocker.cwl b/src/mapping/bd_rhapsody/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/mapping/bd_rhapsody/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/src/mapping/bd_rhapsody/rhapsody_cell_label.py b/src/mapping/bd_rhapsody/rhapsody_cell_label.py new file mode 100644 index 00000000..2bc1f81f --- /dev/null +++ b/src/mapping/bd_rhapsody/rhapsody_cell_label.py @@ -0,0 +1,1695 @@ +#!/usr/bin/env python + +# copied from https://bd-rhapsody-public.s3.amazonaws.com/CellLabel/rhapsody_cell_label.py.txt +# documented at https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/steps_cell_label.html + +""" +Rhapsody cell label structure +Information on the cell label is captured by the combination of bases in three cell label sections (CLS1, CLS2, CLS3). +Two common linker sequences (L1, L2) separate the three CLS. + +--CLS1---|-L1-|--CLS2---|-L2-|--CL3---|--UMI---|-CaptureSequence- + + +Each cell label section has a whitelist of 96 or 384 possible 9 base sequences. +All the capture oligos from a single bead will have the same cell label. + +---------------- + +V1 beads: + +[A96_cell_key1] + [v1_linker1] + [A96_cell_key2] + [v1_linker2] + [A96_cell_key3] + [8 random base UMI] + [18 base polyT capture] + + +---------------- + +Enhanced beads: +Enhanced beads contain two different capture oligo types, polyT and 5prime. On any one bead, the two different capture oligo types have the same cell label sequences. +Compared to the V1 bead, enhanced beads have shorter linker sequences, longer polyT, and 0-3 diversity insert bases at the beginning of the sequence. +The cell label sections use the same 3 sequence whitelists as V1 beads. + +polyT capture oligo: +[Enh_insert 0-3 bases] + [A96_cell_key1] + [Enh_linker1] + [A96_cell_key2] + [Enh_linker2] + [A96_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [A96_cell_key1] + [Enh_5p_linker1] + [A96_cell_key2] + [Enh_5p_linker2] + [A96_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +---------------- + +Enhanced V2/V3 beads: +Enhanced V2/V3 beads have the same structure as Enhanced beads, but the cell label sections have been updated with increased diversity + + +polyT capture oligo: +[Enh_insert 0-3 bases] + [B384_cell_key1] + [Enh_linker1] + [B384_cell_key2] + [Enh_linker2] + [B384_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [B384_cell_key1] + [Enh_5p_linker1] + [B384_cell_key2] + [Enh_5p_linker2] + [B384_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +The only difference between Enh V2 and Enh V3 beads is a different Tso_capture_seq. + +---------------- + +The Rhapsody Sequence Analysis Pipeline will convert each cell label into a single integer representing a unique cell label sequence - which is used in the output files as the 'Cell_index'. +This cell index integer is deterministic and derived from the 3 part cell label as follows: + +- Get the 1-based index for each cell label section from the python sets of sequences below +- Apply this equation: + (CLS1index - 1) * 384 * 384 + (CLS2index - 1) * 384 + CLS3index + +(See label_sections_to_index() function below) + + +Example: Enhanced bead sequence: +ACACATTGCAGTGAAGATAGTTCGACACTCAAGACA + +Each part identified: +A CACATTGCA GTGA AGATAGTTC GACA CTCAAGACA +DiversityInsert A96_cell_key1-33 Linker1 A96_cell_key2-78 Linker2 A96_cell_key3-21 + +33-78-21 +(33 - 1) * 384 * 384 + (78 - 1) * 384 + 21 +=4748181 + + +The original sequences of cell label can be determined from the cell index integer by reversing this conversion. +See index_to_label_sections() and index_to_sequence() functions below. + +""" + +v1_linker1 = "ACTGGCCTGCGA" +v1_linker2 = "GGTAGCGGTGACA" + +Enh_linker1 = "GTGA" +Enh_linker2 = "GACA" + +Enh_5p_primer = "ACAGGAAACTCATGGTGCGT" + +Enh_5p_linker1 = "AATG" +Enh_5p_linker2 = "CCAC" + +Enh_inserts = ["", "A", "GT", "TCA"] + +Tso_capture_seq_Enh_EnhV2 = "TATGCGTAGTAGGTATG" +Tso_capture_seq_EnhV3 = "GTGGAGTCGTGATTATA" + +A96_cell_key1 = ( + "GTCGCTATA", + "CTTGTACTA", + "CTTCACATA", + "ACACGCCGG", + "CGGTCCAGG", + "AATCGAATG", + "CCTAGTATA", + "ATTGGCTAA", + "AAGACATGC", + "AAGGCGATC", + "GTGTCCTTA", + "GGATTAGGA", + "ATGGATCCA", + "ACATAAGCG", + "AACTGTATT", + "ACCTTGCGG", + "CAGGTGTAG", + "AGGAGATTA", + "GCGATTACA", + "ACCGGATAG", + "CCACTTGGA", + "AGAGAAGTT", + "TAAGTTCGA", + "ACGGATATT", + "TGGCTCAGA", + "GAATCTGTA", + "ACCAAGGAC", + "AGTATCTGT", + "CACACACTA", + "ATTAAGTGC", + "AAGTAACCC", + "AAATCCTGT", + "CACATTGCA", + "GCACTGTCA", + "ATACTTAGG", + "GCAATCCGA", + "ACGCAATCA", + "GAGTATTAG", + "GACGGATTA", + "CAGCTGACA", + "CAACATATT", + "AACTTCTCC", + "CTATGAAAT", + "ATTATTACC", + "TACCGAGCA", + "TCTCTTCAA", + "TAAGCGTTA", + "GCCTTACAA", + "AGCACACAG", + "ACAGTTCCG", + "AGTAAAGCC", + "CAGTTTCAC", + "CGTTACTAA", + "TTGTTCCAA", + "AGAAGCACT", + "CAGCAAGAT", + "CAAACCGCC", + "CTAACTCGC", + "AATATTGGG", + "AGAACTTCC", + "CAAAGGCAC", + "AAGCTCAAC", + "TCCAGTCGA", + "AGCCATCAC", + "AACGAGAAG", + "CTACAGAAC", + "AGAGCTATG", + "GAGGATGGA", + "TGTACCTTA", + "ACACACAAA", + "TCAGGAGGA", + "GAGGTGCTA", + "ACCCTGACC", + "ACAAGGATC", + "ATCCCGGAG", + "TATGTGGCA", + "GCTGCCAAT", + "ATCAGAGCT", + "TCGAAGTGA", + "ATAGACGAG", + "AGCCCAATC", + "CAGAATCGT", + "ATCTCCACA", + "ACGAAAGGT", + "TAGCTTGTA", + "ACACGAGAT", + "AACCGCCTC", + "ATTTAGATG", + "CAAGCAAGC", + "CAAAGTGTG", + "GGCAAGCAA", + "GAGCCAATA", + "ATGTAATGG", + "CCTGAGCAA", + "GAGTACATT", + "TGCGATCTA", +) + +A96_cell_key2 = ( + "TACAGGATA", + "CACCAGGTA", + "TGTGAAGAA", + "GATTCATCA", + "CACCCAAAG", + "CACAAAGGC", + "GTGTGTCGA", + "CTAGGTCCT", + "ACAGTGGTA", + "TCGTTAGCA", + "AGCGACACC", + "AAGCTACTT", + "TGTTCTCCA", + "ACGCGAAGC", + "CAGAAATCG", + "ACCAAAATG", + "AGTGTTGTC", + "TAGGGATAC", + "AGGGCTGGT", + "TCATCCTAA", + "AATCCTGAA", + "ATCCTAGGA", + "ACGACCACC", + "TTCCATTGA", + "TAGTCTTGA", + "ACTGTTAGA", + "ATTCATCGT", + "ACTTCGAGC", + "TTGCGTACA", + "CAGTGCCCG", + "GACACTTAA", + "AGGAGGCGC", + "GCCTGTTCA", + "GTACATCTA", + "AATCAGTTT", + "ACGATGAAT", + "TGACAGACA", + "ATTAGGCAT", + "GGAGTCTAA", + "TAGAACACA", + "AAATAAATA", + "CCGACAAGA", + "CACCTACCC", + "AAGAGTAGA", + "TCATTGAGA", + "GACCTTAGA", + "CAAGACCTA", + "GGAATGATA", + "AAACGTACC", + "ACTATCCTC", + "CCGTATCTA", + "ACACATGTC", + "TTGGTATGA", + "GTGCAGTAA", + "AGGATTCAA", + "AGAATGGAG", + "CTCTCTCAA", + "GCTAACTCA", + "ATCAACCGA", + "ATGAGTTAC", + "ACTTGATGA", + "ACTTTAACT", + "TTGGAGGTA", + "GCCAATGTA", + "ATCCAACCG", + "GATGAACTG", + "CCATGCACA", + "TAGTGACTA", + "AAACTGCGC", + "ATTACCAAG", + "CACTCGAGA", + "AACTCATTG", + "CTTGCTTCA", + "ACCTGAGTC", + "AGGTTCGCT", + "AAGGACTAT", + "CGTTCGGTA", + "AGATAGTTC", + "CAATTGATC", + "GCATGGCTA", + "ACCAGGTGT", + "AGCTGCCGT", + "TATAGCCCT", + "AGAGGACCA", + "ACAATATGG", + "CAGCACTTC", + "CACTTATGT", + "AGTGAAAGG", + "AACCCTCGG", + "AGGCAGCTA", + "AACCAAAGT", + "GAGTGCGAA", + "CGCTAAGCA", + "AATTATAAC", + "TACTAGTCA", + "CAACAACGG", +) + +A96_cell_key3 = ( + "AAGCCTTCT", + "ATCATTCTG", + "CACAAGTAT", + "ACACCTTAG", + "GAACGACAA", + "AGTCTGTAC", + "AAATTACAG", + "GGCTACAGA", + "AATGTATCG", + "CAAGTAGAA", + "GATCTCTTA", + "AACAACGCG", + "GGTGAGTTA", + "CAGGGAGGG", + "TCCGTCTTA", + "TGCATAGTA", + "ACTTACGAT", + "TGTATGCGA", + "GCTCCTTGA", + "GGCACAACA", + "CTCAAGACA", + "ACGCTGTTG", + "ATATTGTAA", + "AAGTTTACG", + "CAGCCTGGC", + "CTATTAGCC", + "CAAACGTGG", + "AAAGTCATT", + "GTCTTGGCA", + "GATCAGCGA", + "ACATTCGGC", + "AGTAATTAG", + "TGAAGCCAA", + "TCTACGACA", + "CATAACGTT", + "ATGGGACTC", + "GATAGAGGA", + "CTACATGCG", + "CAACGATCT", + "GTTAGCCTA", + "AGTTGCATC", + "AAGGGAACT", + "ACTACATAT", + "CTAAGCTTC", + "ACGAACCAG", + "TACTTCGGA", + "AACATCCAT", + "AGCCTGGTT", + "CAAGTTTCC", + "CAGGCATTT", + "ACGTGGGAG", + "TCTCACGGA", + "GCAACATTA", + "ATGGTCCGT", + "CTATCATGA", + "CAATACAAG", + "AAAGAGGCC", + "GTAGAAGCA", + "GCTATGGAA", + "ACTCCAGGG", + "ACAAGTGCA", + "GATGGTCCA", + "TCCTCAATA", + "AATAAACAA", + "CTGTACGGA", + "CTAGATAGA", + "AGCTATGTG", + "AAATGGAGG", + "AGCCGCAAG", + "ACAGTAAAC", + "AACGTGTGA", + "ACTGAATTC", + "AAGGGTCAG", + "TGTCTATCA", + "TCAGATTCA", + "CACGATCCG", + "AACAGAAAC", + "CATGAATGA", + "CGTACTACG", + "TTCAGCTCA", + "AAGGCCGCA", + "GGTTGGACA", + "CGTCTAGGT", + "AATTCGGCG", + "CAACCTCCA", + "CAATAGGGT", + "ACAGGCTCC", + "ACAACTAGT", + "AGTTGTTCT", + "AATTACCGG", + "ACAAACTTT", + "TCTCGGTTA", + "ACTAGACCG", + "ACTCATACG", + "ATCGAGTCT", + "CATAGGTCA", +) + +B384_cell_key1 = ( + "TGTGTTCGC", + "TGTGGCGCC", + "TGTCTAGCG", + "TGGTTGTCC", + "TGGTTCCTC", + "TGGTGTGCT", + "TGGCGACCG", + "TGCTGTGGC", + "TGCTGGCAC", + "TGCTCTTCC", + "TGCCTCACC", + "TGCCATTAT", + "TGATGTCTC", + "TGATGGCCT", + "TGATGCTTG", + "TGAAGGACC", + "TCTGTCTCC", + "TCTGATTAT", + "TCTGAGGTT", + "TCTCGTTCT", + "TCTCATCCG", + "TCCTGGATT", + "TCAGCATTC", + "TCACGCCTT", + "TATGTGCAC", + "TATGCGGCC", + "TATGACGAG", + "TATCTCGTG", + "TATATGACC", + "TAGGCTGTG", + "TACTGCGTT", + "TACGTGTCC", + "TAATCACAT", + "GTTGTGTTG", + "GTTGTGGCT", + "GTTGTCTGT", + "GTTGTCGAG", + "GTTGTCCTC", + "GTTGTATCC", + "GTTGGTTCT", + "GTTGGCGTT", + "GTTGGAGCG", + "GTTGCTGCC", + "GTTGCGCAT", + "GTTGCAGGT", + "GTTGCACTG", + "GTTGATGAT", + "GTTGATACG", + "GTTGAAGTC", + "GTTCTGTGC", + "GTTCTCTCG", + "GTTCTATAT", + "GTTCGTATG", + "GTTCGGCCT", + "GTTCGCGGC", + "GTTCGATTC", + "GTTCCGGTT", + "GTTCCGACG", + "GTTCACGCT", + "GTTATCACC", + "GTTAGTCCG", + "GTTAGGTGT", + "GTTAGAGAC", + "GTTAGACTT", + "GTTACCTCT", + "GTTAATTCC", + "GTTAAGCGC", + "GTGTTGCTT", + "GTGTTCGGT", + "GTGTTCCAG", + "GTGTTCATC", + "GTGTCACAC", + "GTGTCAAGT", + "GTGTACTGC", + "GTGGTTAGT", + "GTGGTACCG", + "GTGGCGATC", + "GTGCTTCTG", + "GTGCGTTCC", + "GTGCGGTAT", + "GTGCGCCTT", + "GTGCGAACT", + "GTGCAGCCG", + "GTGCAATTG", + "GTGCAAGGC", + "GTCTTGCGC", + "GTCTGGCCG", + "GTCTGAGGC", + "GTCTCAGAT", + "GTCTCAACC", + "GTCTATCGT", + "GTCGGTGTG", + "GTCGGAATC", + "GTCGCTCCG", + "GTCCTCGCC", + "GTCCTACCT", + "GTCCGCTTG", + "GTCCATTCT", + "GTCCAATAC", + "GTCATGTAT", + "GTCAGTGGT", + "GTCAGATAG", + "GTATTAACT", + "GTATCAGTC", + "GTATAGCCT", + "GTATACTTG", + "GTATAAGGT", + "GTAGCATCG", + "GTACCGTCC", + "GTACACCTC", + "GTAAGTGCC", + "GTAACAGAG", + "GGTTGTGTC", + "GGTTGGCTG", + "GGTTGACGC", + "GGTTCGTCG", + "GGTTCAGTT", + "GGTTATATT", + "GGTTAATAC", + "GGTGTACGT", + "GGTGCCGCT", + "GGTGCATGC", + "GGTCGTTGC", + "GGTCGAGGT", + "GGTAGGCAC", + "GGTAGCTTG", + "GGTACATAG", + "GGTAATCTG", + "GGCTTGGCC", + "GGCTTCACG", + "GGCTTATGT", + "GGCTTACTC", + "GGCTGTCTT", + "GGCTCTGTG", + "GGCTCCGGT", + "GGCTCACCT", + "GGCGTTGAG", + "GGCGTGTAC", + "GGCGTGCTG", + "GGCGTATCG", + "GGCGCTCGT", + "GGCGCTACC", + "GGCGAGCCT", + "GGCGAGATC", + "GGCGACTTG", + "GGCCTCTTC", + "GGCCTACAG", + "GGCCAGCGC", + "GGCCAACTT", + "GGCATTCCT", + "GGCATCCGC", + "GGCATAACC", + "GGCAACGAT", + "GGATGTCCG", + "GGATGAGAG", + "GGATCTGGC", + "GGATCCATG", + "GGATAGGTT", + "GGAGTCGTG", + "GGAGAAGGC", + "GGACTCCTT", + "GGACTAGTC", + "GGACCGTTG", + "GGAATTAGT", + "GGAATCTCT", + "GGAATCGAC", + "GGAAGCCTC", + "GCTTGTAGC", + "GCTTGACCG", + "GCTTCGGAC", + "GCTTCACAT", + "GCTTAGTCT", + "GCTGGATAT", + "GCTGGAACC", + "GCTGCGATG", + "GCTGATCAG", + "GCTGAGCGT", + "GCTCTTGTC", + "GCTCTCCTG", + "GCTCGGTCC", + "GCTCCAATT", + "GCTATTCGC", + "GCTATGAGT", + "GCTAGTGTT", + "GCTAGGATC", + "GCTAGCACT", + "GCTACGTAT", + "GCTAACCTT", + "GCGTTCCGC", + "GCGTGTGCC", + "GCGTGCATT", + "GCGTCGGTT", + "GCGTATGTG", + "GCGTATACT", + "GCGGTTCAC", + "GCGGTCTTG", + "GCGGCGTCG", + "GCGGCACCT", + "GCGCTGGAC", + "GCGCTCTCC", + "GCGCGGCAG", + "GCGCGATAC", + "GCGCCGACC", + "GCGAGCGAG", + "GCGAGAGGT", + "GCGAATTAC", + "GCCTTGCAT", + "GCCTGCGCT", + "GCCTAACTG", + "GCCGTCCGT", + "GCCGCTGTC", + "GCCATGCCG", + "GCCAGCTAT", + "GCCAACCAG", + "GCATGGTTG", + "GCATCGACG", + "GCAGGCTAG", + "GCAGGACGC", + "GCAGCCATC", + "GCAGATACC", + "GCAGACGTT", + "GCACTATGT", + "GCACACGAG", + "GATTGTCAT", + "GATTGGTAG", + "GATTGCACC", + "GATTCTACT", + "GATTCGCTT", + "GATTAGGCC", + "GATTACGGT", + "GATGTTGGC", + "GATGTTATG", + "GATGGCCAG", + "GATCGTTCG", + "GATCGGAGC", + "GATCGCCTC", + "GATCCTCTG", + "GATCCAGCG", + "GATACACGC", + "GAGTTACCT", + "GAGTCGTAT", + "GAGTCGCCG", + "GAGGTGTAG", + "GAGGCATTG", + "GAGCGGACG", + "GAGCCTGAG", + "GAGATCTGT", + "GAGATAATT", + "GAGACGGCT", + "GACTTCGTG", + "GACTGTTCT", + "GACTCTTAG", + "GACCGCATT", + "GAATTGAGC", + "GAATATTGC", + "GAAGGCTCT", + "GAAGAGACT", + "GAACTGCCG", + "GAACGCGTG", + "CTTGTGTAT", + "CTTGTGCGC", + "CTTGTCATG", + "CTTGGTCTT", + "CTTGGTACC", + "CTTGGATGT", + "CTTGCTCAC", + "CTTGCAATC", + "CTTGAGGCC", + "CTTGACGGT", + "CTTCTGATC", + "CTTCTCGTT", + "CTTCTAGGC", + "CTTCGTTAG", + "CTTATGTCC", + "CTTATGCTT", + "CTTATATAG", + "CTTAGGTTG", + "CTTAGGAGC", + "CTTACTTAT", + "CTGTTCTCG", + "CTGTGCCTC", + "CTGTCGCAT", + "CTGTCGAGC", + "CTGTAGCTG", + "CTGTACGTT", + "CTGCTTGCC", + "CTGCGTAGT", + "CTGCACACC", + "CTGATGGAT", + "CTGAGTCAT", + "CTGACGCCG", + "CTGAACGAG", + "CTCTTGTAG", + "CTCTTAGTT", + "CTCTTACCG", + "CTCTGCACC", + "CTCTCGTCC", + "CTCGTATTG", + "CTCGACTAT", + "CTCCTGACG", + "CTCACTAGC", + "CTATACGGC", + "CGTTCGCTC", + "CGTTCACCG", + "CGTATAGTT", + "CGGTGTTCC", + "CGGTGTCAG", + "CGGTCCTGC", + "CGGCGACTC", + "CGGCACGGT", + "CGGATAGCC", + "CGGAGAGAT", + "CGCTAATAG", + "CGCGTTGGC", + "CGCGCAGAG", + "CGCACTGCC", + "CCTTGTCTC", + "CCTTGGCGT", + "CCTTCTGAG", + "CCTTCTCCT", + "CCTTCGACC", + "CCTTACTTG", + "CCTGTTCGT", + "CCTGTATGC", + "CCTCGGCCG", + "CCGTTAATT", + "CCATGTGCG", + "CCAGTGGTT", + "CCAGGCATT", + "CCAGGATCC", + "CCAGCGTTG", + "CATTCCGAT", + "CATTATACC", + "CATGTTGAG", + "ATTGCGTGT", + "ATTGCGGAC", + "ATTGCGCCG", + "ATTGACTTG", + "ATTCGGCTG", + "ATTCGCGAG", + "ATTCCAAGT", + "ATTATCTTC", + "ATTACTGTT", + "ATTACACTC", + "ATGTTCTAT", + "ATGTTACGC", + "ATGTGTATC", + "ATGTGGCAG", + "ATGTCTGTG", + "ATGGTGCAT", + "ATGCTTACT", + "ATGCTGTCC", + "ATGCTCGGC", + "ATGAGGTTC", + "ATGAGAGTG", + "ATCTTGGCT", + "ATCTGTGCG", + "ATCGGTTCC", + "ATCATGCTC", + "ATCATCACT", + "ATATCTTAT", + "ATAGGCGCC", + "AGTTGGTAT", + "AGTTGAGCC", + "AGTGCGACC", + "AGGTGCTAC", + "AGGCTTGCG", + "AGGCCTTCC", + "AGGCACCTT", + "AGGAATATG", + "AGCGGCCAG", + "AGCCTGGTC", + "AGCCTGACT", + "AGCAATCCG", + "AGAGATGTT", + "AGAGAATTC", + "ACTCGCTTG", + "ACTCGACCT", + "ACGTACACC", + "ACGGATGGT", + "ACCAGTCTG", + "ACATTCGGC", + "ACATGAGGT", + "ACACTAATT", +) + +B384_cell_key2 = ( + "TTGTGTTGT", + "TTGTGGTAG", + "TTGTGCGGA", + "TTGTCTGTT", + "TTGTCTAAG", + "TTGTCATAT", + "TTGTCACGA", + "TTGTATGAA", + "TTGTACAGT", + "TTGGTTAAT", + "TTGGTGCAA", + "TTGGTCGAG", + "TTGGTATTA", + "TTGGCACAG", + "TTGGATACA", + "TTGGAAGTG", + "TTGCGGTTA", + "TTGCCATTG", + "TTGCACGCG", + "TTGCAAGGT", + "TTGATGTAT", + "TTGATAATT", + "TTGAGACGT", + "TTGACTACT", + "TTGACCGAA", + "TTCTGGTCT", + "TTCTGCACA", + "TTCTCCTTA", + "TTCTCCGCT", + "TTCTAGGTA", + "TTCTAATCG", + "TTCGTCGTA", + "TTCGTAGAT", + "TTCGGCTTG", + "TTCGGAATA", + "TTCGCCAGA", + "TTCGATTGT", + "TTCGATCAG", + "TTCCTCGGT", + "TTCCGGCAG", + "TTCCGCATT", + "TTCCAATTA", + "TTCATTGAA", + "TTCATGCTG", + "TTCAGGAGT", + "TTCACTATA", + "TTCAACTCT", + "TTCAACGTG", + "TTATGCGTT", + "TTATGATTG", + "TTATCCTGT", + "TTATCCGAG", + "TTATATTAT", + "TTAGGCGCG", + "TTACTGGAA", + "TTACTAGTT", + "TTACGTGGT", + "TTACGATAT", + "TTACCTAGA", + "TTACATGAG", + "TTACAGCGT", + "TTACACGGA", + "TTACACACT", + "TTAATCAGT", + "TTAATAGGA", + "TTAAGTGTG", + "TTAACCTTG", + "TTAACACAA", + "TGTTCACTT", + "TGTTCAAGA", + "TGTTAAGTG", + "TGTGTTATG", + "TGTGTCCAA", + "TGTGGAGCG", + "TGTCAGTTA", + "TGTCAGAAG", + "TGGTTAGTT", + "TGGTTACAA", + "TGGCGTTAT", + "TGGCGCCAA", + "TGGAGTCTT", + "TGCGTATTG", + "TGATAGAGA", + "TGAGGTATT", + "TGAGAATCT", + "TCTTGGTAA", + "TCTTCATAG", + "TCTGTCCTT", + "TCTGGAATT", + "TCTACCGCG", + "TCGTTCGAA", + "TCGTCAGTG", + "TCGACGAGA", + "TCATGGCTT", + "TCACACTTA", + "TATTCCGAA", + "TATTATGGT", + "TATGCTATT", + "TATCAAGGA", + "TAGTTCAAT", + "TAGCTGCTT", + "TAGAGGAAG", + "TACCTGTTA", + "TACACCTGT", + "GTTGTGCGT", + "GTTGGCTAT", + "GTTGCCAAG", + "GTTGACCTT", + "GTTCTGCTA", + "GTTCTGAAT", + "GTTCTATCA", + "GTTCGCGTG", + "GTTCCTTAT", + "GTTAGCAGT", + "GTTACTGTG", + "GTTACTCAA", + "GTTAAGAGA", + "GTTAACTTA", + "GTGTCGGCA", + "GTGTCCATT", + "GTGCTTGAG", + "GTGCTCGTT", + "GTGCTCACA", + "GTGCCTGGA", + "GTCTTGTCG", + "GTCTTGATT", + "GTCTTCCGT", + "GTCTTAAGA", + "GTCTCATCT", + "GTCTACGAG", + "GTCGTTGCT", + "GTCGTGTTA", + "GTCGGTAAT", + "GTCGGATGT", + "GTCGAGCTG", + "GTCCGGACT", + "GTCCAACAT", + "GTCAGACGA", + "GTCAGAATT", + "GTCACTCTT", + "GTCAAGGAA", + "GTATGTCTT", + "GTATGTACA", + "GTATCGGTT", + "GTATATGTA", + "GTATACAAT", + "GTAGTTAAG", + "GTAGTCGAT", + "GTAGCCTTA", + "GTAGATACT", + "GTACGATTA", + "GTACAGTCT", + "GTAATTCGT", + "GCTTGGCAG", + "GCTTGCTTG", + "GCTTGAGGA", + "GCTTCATTA", + "GCTTATGCG", + "GCTGTGTAG", + "GCTGTCATG", + "GCTGGTTGT", + "GCTGGACTG", + "GCTGCCTAA", + "GCTGATATT", + "GCTCTTAGT", + "GCTCTATTG", + "GCTCGCCGT", + "GCTCCGCTG", + "GCTATTCTG", + "GCTATACGA", + "GCTACTAAG", + "GCTACATGT", + "GCTAACTCT", + "GCGTTGTAA", + "GCGTTCTCT", + "GCGTGCGTA", + "GCGTCTTGA", + "GCGTCCGAT", + "GCGTAAGAG", + "GCGCTTACG", + "GCGCGGATT", + "GCGCCATAT", + "GCGCATGAA", + "GCGATCAAT", + "GCGAGCCTT", + "GCGAGATTG", + "GCGAGAACA", + "GCCTTGGTA", + "GCCTTCTAG", + "GCCTTCACA", + "GCCTGAGTG", + "GCCTCACGT", + "GCCGGCGAA", + "GCCGCACAA", + "GCCATGCTT", + "GCCATATAT", + "GCCAATTCG", + "GCATTCGTT", + "GCATGATGT", + "GCAGTTGGA", + "GCAGTGTCT", + "GCACTTGTG", + "GCAATCTGT", + "GCAACACTT", + "GATTGTATT", + "GATTGCGAG", + "GATTCCAGT", + "GATTCATAT", + "GATTATCAG", + "GATTAGGTT", + "GATGTTGCG", + "GATGGATCT", + "GATGCTGAT", + "GATGCCTTG", + "GATCTCCTT", + "GATCGCTTA", + "GATATTGAA", + "GATATTACT", + "GAGTGTTAT", + "GAGCTCAGT", + "GAGCGTGCT", + "GAGCGTCGA", + "GAGCGGTTG", + "GAGCGACTT", + "GAGCCGAAT", + "GAGATAGAT", + "GAGACCTAT", + "GACGGTCGT", + "GACGCAGGT", + "GACGATATG", + "GACCTATCT", + "GAATTAGGA", + "GAATCAGCT", + "GAAGTTCAT", + "GAAGTGGTT", + "GAAGTATTG", + "GAAGGCATT", + "GAACGCTGT", + "CTTGTCCAG", + "CTTGGATTG", + "CTTGCTGAA", + "CTTGCCGTG", + "CTTGATTCT", + "CTTCTGTCG", + "CTTCGGCGT", + "CTTATGAGT", + "CTTACCGAT", + "CTGTTAGGT", + "CTGTCGTCT", + "CTGTATAAT", + "CTGGCTCAT", + "CTGGATGCG", + "CTGCGTGTG", + "CTGCGCGGT", + "CTGCCGATT", + "CTGCATTGT", + "CTGATTAAG", + "CTGAGATAT", + "CTGACCTGT", + "CTCGTATCT", + "CTCGGCAAG", + "CTCGCAATT", + "CTCCTGCTT", + "CTCCTAAGT", + "CTCCGGATG", + "CTCCGAGCG", + "CTCACAGGT", + "CTATTCTAT", + "CTATTAGTG", + "CTATGAATT", + "CTACATATT", + "CGTGGCATT", + "CGTCTTAAT", + "CGTCTGGTT", + "CGTCACTGT", + "CGTAGGTCT", + "CGGTTCGAG", + "CGGTTCATT", + "CGGTGCTCT", + "CGGTAATTG", + "CGGCCTGAT", + "CGGATATAG", + "CGGAATATT", + "CGCTCCAAT", + "CGCGTTCGT", + "CGCAGGTTG", + "CGAGGATGT", + "CGAGCTGTT", + "CGACGGCTT", + "CCTTGTGTG", + "CCTGTCTCA", + "CCTGACTAT", + "CCTACCTTG", + "CCGTAGATT", + "CCGGCTGGT", + "CATCGGACG", + "CATCGATAA", + "CATCCTTCT", + "CAGTTCTGT", + "CAGTGCCAG", + "CAGGCACTG", + "CAGCCTCTT", + "CACTTATAT", + "CACTGGTCG", + "CACTGCATG", + "CACGCGTTG", + "CACGATGTT", + "CACCATCTG", + "CACAGGCGT", + "ATTGTACAA", + "ATTGGTATG", + "ATTGCTAAT", + "ATTGCATAG", + "ATTGCAGTT", + "ATTCTGCAG", + "ATTCTACGT", + "ATTCGGATT", + "ATTCCGTTG", + "ATTCATCAA", + "ATTCAAGAG", + "ATTAGCCTT", + "ATTAATATT", + "ATGTTAGAG", + "ATGTTAACT", + "ATGTAGTCG", + "ATGGTGTAG", + "ATGGATTAT", + "ATCTTGAAG", + "ATCTGATAT", + "ATCTCAGAA", + "ATCGCTCAA", + "ATCGCGTCG", + "ATCCATGGT", + "ATCATGAGA", + "ATCATAGTT", + "ATCAGCGAG", + "ATCACCATT", + "ATAGTAATT", + "ATAGCTGTG", + "ATACTCTCG", + "ATACCTCAT", + "AGTTGCGCG", + "AGTTGAATT", + "AGTTATGAT", + "AGTGTCCGT", + "AGTGGCTTG", + "AGTGCTTCT", + "AGTATCATT", + "AGTACACAA", + "AGGTATGCG", + "AGGTATAGT", + "AGGCTACTT", + "AGGCCAGGT", + "AGGAGCGAT", + "AGCTTATAG", + "AGCTCTAGA", + "AGCGTGTAT", + "AGCGTCACA", + "AGCCTTCAT", + "AGCCTGTCG", + "AGCCTCGAG", + "AGCACTGAA", + "AGATGTACG", + "AGAGTTAAT", + "AGACCTCTG", + "ACTTCTATA", + "ACTGTCGAG", + "ACTGTATGT", + "ACTCTGTAA", + "ACTCGCGAA", + "ACTAGATCT", + "ACTAACGTT", + "ACGTTACTG", + "ACGTGGAAT", + "ACGGACTCT", + "ACGCCTAAT", + "ACGCCGTTA", + "ACGACGTGT", + "ACCTCGCAT", + "ACCATCATA", + "ACATATATT", + "ACAGGCACA", + "ACACCTGAG", + "ACACATTCT", +) + +B384_cell_key3 = ( + "TTGTGGCTG", + "TTGTGGAGT", + "TTGTGCGAC", + "TTGTCTTCA", + "TTGTAAGAT", + "TTGGTTCTG", + "TTGGTGCGT", + "TTGGTCTAC", + "TTGGTAACT", + "TTGGCGTGC", + "TTGGATTAG", + "TTGGAGACG", + "TTGGAATCA", + "TTGCGGCGA", + "TTGCGCTCG", + "TTGCCTTAC", + "TTGCCGGAT", + "TTGCATGCT", + "TTGCACGTC", + "TTGCACCAT", + "TTGAACCTG", + "TTCTCGCGT", + "TTCTCAACT", + "TTCTACTCA", + "TTCGTCCAT", + "TTCGGATAC", + "TTCGGACGT", + "TTCGCAATC", + "TTCCGGTGC", + "TTCCGACTG", + "TTCATTATG", + "TTCATGGAT", + "TTCAGCGCA", + "TTCACCTCG", + "TTCAAGCAG", + "TTCAACTAC", + "TTATGCCAG", + "TTATGCATC", + "TTATCGTAC", + "TTATACCTA", + "TTATAATAG", + "TTATAAGTC", + "TTAGTTAGC", + "TTAGCTCAT", + "TTAGCACTA", + "TTAGATATG", + "TTACTACGA", + "TTACCGTCA", + "TTACAGAGC", + "TTAATTGCA", + "TTAACAGAT", + "TGTTGGCTA", + "TGTTGATGA", + "TGTTAAGCT", + "TGTGGCCGA", + "TGTGCTAGC", + "TGTGCGTCA", + "TGTCGCAGT", + "TGTCGAGCA", + "TGTACAACG", + "TGGTTCCGA", + "TGGTTCACT", + "TGGTCAAGT", + "TGGCTTGTA", + "TGGCTGTCG", + "TGGCGTATG", + "TGGCGCGCT", + "TGGATGTAC", + "TGGACTTGC", + "TGGAATACT", + "TGCTAGCGA", + "TGCGTTGCT", + "TGCGGTCTG", + "TGCGCTTAG", + "TGCGCGACG", + "TGCCTGCAT", + "TGCCTAGAC", + "TGCACGAGT", + "TGAGTGTGC", + "TGAGGCTCG", + "TCTTCCGTC", + "TCTTATAGT", + "TCTTACCAT", + "TCTGTTGTC", + "TCTGTTACT", + "TCTGGCTAG", + "TCTCAGATC", + "TCTAGTTGA", + "TCTAGTACG", + "TCGTACTAC", + "TCGGTGTAG", + "TCGGCTGCT", + "TCGCTACTG", + "TCGATCACG", + "TCGAGGCAT", + "TCCGGCGTC", + "TCCGGAGCT", + "TCCGCTCGT", + "TCCGAGTAC", + "TCCATTCAT", + "TCCATGGTC", + "TCCAAGTCG", + "TCATTACGT", + "TCATGCACT", + "TCAGGTTGC", + "TCAGACCGT", + "TCACTCAGT", + "TCAAGCTCA", + "TATTGCGCA", + "TATTCGGCT", + "TATTCCAGC", + "TATTCATCA", + "TATGTTCAG", + "TATGGTATG", + "TATGCAAGT", + "TATCTGGTC", + "TATCTGACT", + "TATCCAGAT", + "TATCAGTCG", + "TATCACGCT", + "TAGGCGCGA", + "TAGGCACAT", + "TAGGATCGT", + "TAGCATTGC", + "TAGAGTTAC", + "TAGACTGAT", + "TACTTGTCG", + "TACGTCCGA", + "TACCGTACT", + "TACCGCGAT", + "TACCAGGAC", + "TACAGAAGT", + "TAAGTGCAT", + "TAAGCTACT", + "GTTGACCGA", + "GTTCTCGAC", + "GTTCCTGCT", + "GTTATGATG", + "GTGCTTGCA", + "GTGCCGCGT", + "GTATTGCTG", + "GTATTCCGA", + "GTATTAAGC", + "GTATGACGT", + "GTAGTTGTC", + "GTAGTACAT", + "GTAGCTCGA", + "GGTTGCTCA", + "GGTTGAGTA", + "GGTTAACGT", + "GGTGTGGCA", + "GGTCTTCAG", + "GGTCGTCTA", + "GGTCGGCGT", + "GGTCCGACT", + "GGTCATGTC", + "GGTCACATG", + "GGTAGTGCT", + "GGTAGCGTC", + "GGTACCAGT", + "GGTAAGGAT", + "GGCTTGTGC", + "GGCTTGACT", + "GGCTTACGA", + "GGCTGTAGT", + "GGCTGGCAG", + "GGCTCCATC", + "GGCGTGGAT", + "GGCGTAATC", + "GGCGCAAGT", + "GGCGAGTAG", + "GGCGACCGT", + "GGCCTGTCA", + "GGCCATTGC", + "GGCACTCTG", + "GGATGTCAT", + "GGAGTAACT", + "GGAGAACGA", + "GGACTGGCT", + "GGACGTTCA", + "GGAACGTGC", + "GCTGTCCAT", + "GCTGGTTCA", + "GCTGCAACT", + "GCTCGTTAC", + "GCTATAGAT", + "GCTAGTCGT", + "GCTACCATG", + "GCGTTCTGA", + "GCGTGTTAG", + "GCGGTATCG", + "GCGGAGCAT", + "GCGCGGTGC", + "GCGCCTAGT", + "GCGCCGGCT", + "GCCTTCATG", + "GCCATACTG", + "GCATGTTGA", + "GCATGCTAC", + "GCAGTATAC", + "GCAGGTACT", + "GCAGCGCGT", + "GCACCTCAT", + "GCAATTCGA", + "GATTGCCGT", + "GATGAACAT", + "GATCTTCGA", + "GATCTGCAT", + "GAGTGGCAT", + "GAGTCGGAC", + "GAGTATGAT", + "GAGGCGAGT", + "GAGGCAACG", + "GAGCGCACT", + "GAATAGGCT", + "ATTGTCACT", + "ATTGTATCA", + "ATTGGTCAG", + "ATTGGCGAT", + "ATTGATCGT", + "ATTCGTAGT", + "ATTCATACG", + "ATTCAGGAC", + "ATTACTTCA", + "ATTAATTAG", + "ATTAAGCAT", + "ATGTCTCTA", + "ATGTAGCGT", + "ATGGCATAC", + "ATGGAGATC", + "ATGGACTCG", + "ATGGAACGA", + "ATGCTTCAT", + "ATGCTCGCT", + "ATGCGACGT", + "ATGCCGTAG", + "ATGAGTTCG", + "ATGACTATC", + "ATGACCGAC", + "ATCTTATGC", + "ATCTTACTA", + "ATCTATCAG", + "ATCGTGTAC", + "ATCGTCTGA", + "ATCGGCATG", + "ATCGCGAGC", + "ATCGCAACG", + "ATCGATGCT", + "ATCGAATAG", + "ATCCTTCTG", + "ATCCTGCGT", + "ATCCGCACT", + "ATCCATTAC", + "ATCCAAGCA", + "ATCAGATCA", + "ATCACACAT", + "ATCAACGTC", + "ATCAACCGA", + "ATATTGAGT", + "ATATTCGTC", + "ATATTACAG", + "ATATCTTGA", + "ATATCGCAT", + "ATATCAATC", + "ATAGTCCTG", + "ATAGGTCTA", + "ATAGCTGAC", + "ATAGCGGTA", + "AGTTCGCTG", + "AGTTACAGC", + "AGTTAACTA", + "AGTGCAATC", + "AGTCTGGTA", + "AGTCTGAGC", + "AGTCTACAT", + "AGTCGAACT", + "AGTCCATCG", + "AGTCATTCA", + "AGTATCCAG", + "AGTAGACTG", + "AGTAATCGA", + "AGTAAGTGC", + "AGGTTGGCT", + "AGGTTCTAG", + "AGGTGTTCA", + "AGGTGCCAT", + "AGGTCTGAT", + "AGGTCGTAC", + "AGGTCAGCA", + "AGGCTTATC", + "AGGCTATGA", + "AGGCCGACG", + "AGGCCAAGC", + "AGGCAGGTC", + "AGGCAAGAT", + "AGGAGCAGT", + "AGGACCGCT", + "AGGAATTAC", + "AGCTTGGAC", + "AGCTTAAGT", + "AGCTACACG", + "AGCGTTACG", + "AGCGGTGCA", + "AGCGGAGTC", + "AGCGGACGA", + "AGCGCGCTA", + "AGCGATAGC", + "AGCGACTCA", + "AGCCTCTAC", + "AGCCGTCGT", + "AGCATGATC", + "AGCACTTCG", + "AGCACGGCA", + "AGATTCTGA", + "AGATTAGAT", + "AGATGATAG", + "AGATATGTA", + "AGATACCGT", + "AGAGTGCGT", + "AGAGCCGAT", + "AGACTCACT", + "ACTTGCCTA", + "ACTTGAGCA", + "ACTTCTAGC", + "ACTTCGACT", + "ACTTAGTAC", + "ACTGTTGAT", + "ACTGTAACG", + "ACTGGTATC", + "ACTGACGTC", + "ACTGAAGCT", + "ACTCTGATG", + "ACTCCTGAC", + "ACTCCGCTA", + "ACTCAACTG", + "ACTATTGCA", + "ACTAGGCAG", + "ACTACGCGT", + "ACTAATACT", + "ACGTTCGTA", + "ACGTGTGCT", + "ACGTGTATG", + "ACGTGGAGC", + "ACGTCTTCG", + "ACGTCAGTC", + "ACGGTCTCA", + "ACGGTCCGT", + "ACGGTACAG", + "ACGGCGCTG", + "ACGCTGCGA", + "ACGCGTGTA", + "ACGCGCCAG", + "ACGATGTCG", + "ACGATGGAT", + "ACGATCTAC", + "ACGAGCTGA", + "ACGAGCATC", + "ACGAATCGT", + "ACGAACGCA", + "ACCTTGTAG", + "ACCTGTTGC", + "ACCTGTCAT", + "ACCTCGATC", + "ACCTAGGTA", + "ACCTACTGA", + "ACCTAATCG", + "ACCGTAGCA", + "ACCGGTAGT", + "ACCGGCTAC", + "ACCGCTTCA", + "ACATTGTGC", + "ACATTCTCG", + "ACATGGCTG", + "ACATGACGA", + "ACATATGAT", + "ACATATACG", + "ACAGCGTAC", + "ACACTTGCT", + "ACACTATCA", + "ACACGCATG", + "ACACCAGTA", + "ACACCAACT", + "ACACATAGT", + "ACACACCTA", +) + + +def label_sections_to_index(label): + """ + Return the cell_index integer based on input 3 part cell label string + + """ + + cl1, cl2, cl3 = [int(n) for n in label.split("-")] + return (cl1 - 1) * 384 * 384 + (cl2 - 1) * 384 + (cl3 - 1) + 1 + + +# print(label_sections_to_index('1-1-1')) +# print(label_sections_to_index('33-78-21')) +# print(label_sections_to_index('43-12-77')) +# print(label_sections_to_index('96-96-96')) +# print(label_sections_to_index('135-43-344')) +# print(label_sections_to_index('384-384-384')) +# print('-') + +# ---------------------------------- + + +def index_to_label_sections(index): + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + return f"{cl1}-{cl2}-{cl3}" + + +# print(index_to_label_sections(1)) +# print(index_to_label_sections(4748181)) +# print(index_to_label_sections(6197453)) +# print(index_to_label_sections(14044896)) +# print(index_to_label_sections(19775576)) +# print(index_to_label_sections(56623104)) +# print('-') +# ---------------------------------- + + +def index_to_sequence(index, bead_version): + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + if bead_version == "v1": + cls1_sequence = A96_cell_key1[cl1 - 1] + cls2_sequence = A96_cell_key2[cl2 - 1] + cls3_sequence = A96_cell_key3[cl3 - 1] + + return f"{cls1_sequence}{v1_linker1}{cls2_sequence}{v1_linker2}{cls3_sequence}" + + elif bead_version == "Enh": + diversityInsert = "" + + if 1 <= cl1 <= 24: + diversityInsert = "" + elif 25 <= cl1 <= 48: + diversityInsert = "A" + elif 49 <= cl1 <= 72: + diversityInsert = "GT" + else: # 73 <= cl1 <= 96: + diversityInsert = "TCA" + + cls1_sequence = A96_cell_key1[cl1 - 1] + cls2_sequence = A96_cell_key2[cl2 - 1] + cls3_sequence = A96_cell_key3[cl3 - 1] + + return f"{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}" + + elif bead_version == "EnhV2": + diversityInsert = "" + subIndex = ((cl1 - 1) % 96) + 1 + + if 1 <= subIndex <= 24: + diversityInsert = "" + elif 25 <= subIndex <= 48: + diversityInsert = "A" + elif 49 <= subIndex <= 72: + diversityInsert = "GT" + else: # 73 <= subIndex <= 96: + diversityInsert = "TCA" + + cls1_sequence = B384_cell_key1[cl1 - 1] + cls2_sequence = B384_cell_key2[cl2 - 1] + cls3_sequence = B384_cell_key3[cl3 - 1] + + return f"{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}" + + +# print(index_to_sequence(4748181, 'Enh')) +# print(index_to_sequence(52923177, 'EnhV2')) + +# ---------------------------------- + + +def create_cell_index_fasta_V1(): + with open("Rhapsody_cellBarcodeV1_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 96 + 1): + for cl2 in range(1, 96 + 1): + for cl3 in range(1, 96 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "v1") + f.write(f">{index}\n") + f.write(f"{sequence}\n") + + +# create_cell_index_fasta_V1() + + +def create_cell_index_fasta_Enh(): + with open("Rhapsody_cellBarcodeEnh_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 96 + 1): + for cl2 in range(1, 96 + 1): + for cl3 in range(1, 96 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "Enh") + f.write(f">{index}\n") + f.write(f"{sequence}\n") + + +# create_cell_index_fasta_Enh() + + +def create_cell_index_fasta_EnhV2(): + with open("Rhapsody_cellBarcodeEnhV2_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 384 + 1): + for cl2 in range(1, 384 + 1): + for cl3 in range(1, 384 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "EnhV2") + f.write(f">{index}\n") + f.write(f"{sequence}\n") + + +# create_cell_index_fasta_EnhV2() diff --git a/src/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl b/src/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl new file mode 100755 index 00000000..30009f05 --- /dev/null +++ b/src/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl @@ -0,0 +1,6146 @@ +#!/usr/bin/env cwl-runner +{ + "$graph": [ + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "baseCommand": "ATAC_Cell_by_Peak.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Fragments" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#ATAC_Cell_by_Peak.cwl/GTF" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Peaks" + }, + { + "type": "File", + "loadContents": true, + "id": "#ATAC_Cell_by_Peak.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--transposase-sites" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Transposase_Sites" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "id": "#ATAC_Cell_by_Peak.cwl", + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Cell_Calling_Data.csv" + }, + "id": "#ATAC_Cell_by_Peak.cwl/ATAC_Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Initial_Seurat.rds" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Initial_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Total_Fragment_Metrics.json" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#ATAC_Cell_by_Peak.cwl/output" + } + ] + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + } + ], + "baseCommand": "ATAC_Compile_Results.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--biop-putative-data-table" + }, + "id": "#ATAC_Compile_Results.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order-subsampled" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Compile_Results.cwl/Fragments" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--initial-seurat" + }, + "id": "#ATAC_Compile_Results.cwl/Initial_Seurat_RDS" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-metrics-tar" + }, + "id": "#ATAC_Compile_Results.cwl/Input_Metrics_tar" + }, + { + "type": "string", + "inputBinding": { + "prefix": "--genome-size" + }, + "id": "#ATAC_Compile_Results.cwl/Reference_Genome_Size" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#ATAC_Compile_Results.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--total-fragment-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--unified-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Unified_Metrics" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*/*_coordinates.csv" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Seurat.rds" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#ATAC_Compile_Results.cwl/Metrics_tar" + }, + { + "type": "File", + "outputBinding": { + "glob": "mist_atac_compile_results.log" + }, + "id": "#ATAC_Compile_Results.cwl/output" + } + ], + "id": "#ATAC_Compile_Results.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if (inputs.Assay == 'ATAC') { return 2; } else { return 4; } }", + "ramMin": "${ if (inputs.Assay == 'ATAC') { return 4000; } else { return 32000; } }" + } + ], + "baseCommand": [ + "mist_add_to_bam.py" + ], + "inputs": [ + { + "type": "string", + "inputBinding": { + "prefix": "--assay" + }, + "id": "#AddtoBam.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-bam" + }, + "id": "#AddtoBam.cwl/Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--cell-order-json" + }, + "id": "#AddtoBam.cwl/Cell_Order" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--corrected-mols-list", + "itemSeparator": "," + }, + "id": "#AddtoBam.cwl/Corrected_Mols" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#AddtoBam.cwl/Generate_Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata-json" + }, + "id": "#AddtoBam.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#AddtoBam.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--target-gene-mapping" + }, + "id": "#AddtoBam.cwl/Target_Gene_Mapping" + } + ], + "arguments": [ + { + "prefix": "--bamIO-threads", + "valueFrom": "$(runtime.cores)" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*.bam" + }, + "id": "#AddtoBam.cwl/Annotated_Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AddtoBam.cwl/output" + } + ], + "id": "#AddtoBam.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "AlignmentAnalysis stage of the Rain pipeline annotates aligned reads and collects a myriad of metrics on the aligned reads. Additional annotation is performed to the reads\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 24000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "AlignmentAnalysisAndCountCB.sh" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--assay" + }, + "type": [ + "null", + "string" + ], + "id": "#AlignmentAnalysis.cwl/Assay" + }, + { + "inputBinding": { + "prefix": "--exclude-intronic-reads" + }, + "type": [ + "null", + "boolean" + ], + "id": "#AlignmentAnalysis.cwl/Exclude_Intronic_Reads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Extra_Seqs" + }, + { + "inputBinding": { + "prefix": "--gtf" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/GTF" + }, + { + "inputBinding": { + "prefix": "--threads" + }, + "type": [ + "null", + "int" + ], + "id": "#AlignmentAnalysis.cwl/Maximum_Threads" + }, + { + "inputBinding": { + "prefix": "--r2-bam", + "itemSeparator": "," + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#AlignmentAnalysis.cwl/R2_BAM" + }, + { + "inputBinding": { + "prefix": "--quality-metrics" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/ReadQualityMetrics" + }, + { + "inputBinding": { + "prefix": "--run-metadata" + }, + "type": "File", + "id": "#AlignmentAnalysis.cwl/Run_Metadata" + }, + { + "inputBinding": { + "prefix": "--transcript-length" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Transcript_Length" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*.annotated.*.bam" + }, + "id": "#AlignmentAnalysis.cwl/Annotated_Bam_Files" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Logs" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_SeqMetrics.csv" + }, + "id": "#AlignmentAnalysis.cwl/Seq_Metrics" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*Sorted_Valid_Reads.csv.*" + }, + "id": "#AlignmentAnalysis.cwl/Sorted_Valid_Reads_CSV" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 30000; } return parseInt(JSON.parse(self[0].contents).num_bioproducts); }" + }, + "id": "#AlignmentAnalysis.cwl/num_bioproducts" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 10000; } return parseInt(JSON.parse(self[0].contents).num_cell_estimate); }" + }, + "id": "#AlignmentAnalysis.cwl/num_cell_estimate" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).BCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_ig_reads" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).TCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_tcr_reads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_BCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validIgReads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_TCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validTcrReads" + } + ], + "id": "#AlignmentAnalysis.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_annotate_molecules.py" + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--umi-option" + }, + "id": "#AnnotateMolecules.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "string" + ], + "id": "#AnnotateMolecules.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#AnnotateMolecules.cwl/Run_Metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--valid-annot" + }, + "id": "#AnnotateMolecules.cwl/Valids" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*_bioproduct_stats*.json" + }, + "id": "#AnnotateMolecules.cwl/Bioproduct_Stats_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_CellBiopSummary.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Cell_Biop_Summary_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_Annotation_Molecule_corrected.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Corrected_Mols_List" + }, + { + "type": "int", + "outputBinding": { + "glob": "stats.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_molecules)\n" + }, + "id": "#AnnotateMolecules.cwl/Total_Molecules" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AnnotateMolecules.cwl/output" + } + ], + "id": "#AnnotateMolecules.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "boolean", + "id": "#Assay_Settings.cwl/AbSeq_Reference_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_RNA_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reference_Archive_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Targeted_Reference_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_ATAC" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_RNA" + } + ], + "expression": "${\n var assay_rna = null;\n var assay_atac = null;\n\n if (!inputs.Reads_ATAC_Present && !inputs.Reads_RNA_Present)\n {\n throw new Error('Invalid pipeline inputs: Please provide Reads for at least 1 of RNA or ATAC analysis.')\n }\n if (inputs.Targeted_Reference_Present && inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Do not provide both Targeted Reference and Reference Archive.')\n }\n if (!inputs.Targeted_Reference_Present && !inputs.AbSeq_Reference_Present && !inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Please provide either a Reference Archive or a Targeted Reference or an AbSeq Reference.')\n }\n\n if ( inputs.Reads_ATAC_Present )\n {\n assay_atac = \"ATAC\"\n }\n\n if (inputs.Reads_RNA_Present && inputs.Reference_Archive_Present) {\n assay_rna = \"WTA\"\n }\n else if (inputs.Reads_RNA_Present && (inputs.Targeted_Reference_Present || inputs.AbSeq_Reference_Present)) {\n assay_rna = \"Targeted\"\n }\n\n return ({Assay_RNA: assay_rna, Assay_ATAC: assay_atac})\n}", + "id": "#Assay_Settings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/_Generate_Bam" + } + ], + "outputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/Generate_Bam" + } + ], + "expression": "${\n // the create bam flag defaults to false\n var generateBam = false;\n // the user can set this flag to true, to enable creation of the bam file.\n if (inputs._Generate_Bam != null) {\n generateBam = inputs._Generate_Bam;\n }\n return ({\n Generate_Bam: generateBam,\n });\n}", + "id": "#BamSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#BundleLogs.cwl/log_files" + } + ], + "outputs": [ + { + "type": "Directory", + "id": "#BundleLogs.cwl/logs_dir" + } + ], + "expression": "${\n /* shamelly cribbed from https://gist.github.com/jcxplorer/823878 */\n function uuid() {\n var uuid = \"\", i, random;\n for (i = 0; i < 32; i++) {\n random = Math.random() * 16 | 0;\n if (i == 8 || i == 12 || i == 16 || i == 20) {\n uuid += \"-\";\n }\n uuid += (i == 12 ? 4 : (i == 16 ? (random & 3 | 8) : random)).toString(16);\n }\n return uuid;\n }\n var listing = [];\n for (var i = 0; i < inputs.log_files.length; i++) {\n var log_file = inputs.log_files[i];\n /*\n Checking here for null in case a Node was skipped because of conditional execution.\n For e.g. Generate_Bam is used to skip the AddToBam, MergeBam and IndexBam nodes\n */\n if (log_file != null) {\n log_file.basename = uuid() + \"-\" + log_file.basename;\n listing.push(log_file);\n }\n }\n return ({\n logs_dir: {\n class: \"Directory\",\n basename: \"Logs\",\n listing: listing\n }\n });\n}", + "id": "#BundleLogs.cwl" + }, + { + "requirements": [ + { + "listing": [ + "$(inputs.Reference_Archive)" + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_check_references.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--abseq-reference" + }, + "id": "#CheckReference.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "inputBinding": { + "itemSeparator": ",", + "prefix": "--assay" + }, + "id": "#CheckReference.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--putative-cell-call" + }, + "id": "#CheckReference.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "id": "#CheckReference.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "inputBinding": { + "prefix": "--reference-archive" + }, + "id": "#CheckReference.cwl/Reference_Archive" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--sample-tags-version" + }, + "id": "#CheckReference.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--supplemental-reference" + }, + "id": "#CheckReference.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--targeted-reference" + }, + "id": "#CheckReference.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#CheckReference.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks_Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "combined_extra_seq.fasta" + }, + "id": "#CheckReference.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "full-gene-list.json" + }, + "id": "#CheckReference.cwl/Full_Genes" + }, + { + "type": "File", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files/*.gtf" + }, + "id": "#CheckReference.cwl/GTF" + }, + { + "type": "Directory", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files" + }, + "id": "#CheckReference.cwl/Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "target-gene.json" + }, + "id": "#CheckReference.cwl/Target_Gene_Mapping" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "transcript_length.json" + }, + "id": "#CheckReference.cwl/Transcript_Length" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#CheckReference.cwl/output" + } + ], + "id": "#CheckReference.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_generate_H5MU.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-cell-by-peak" + }, + "id": "#GenerateH5MU.cwl/Atac_Datatables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-metrics" + }, + "id": "#GenerateH5MU.cwl/Atac_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateH5MU.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateH5MU.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateH5MU.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateH5MU.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--metrics-summary" + }, + "id": "#GenerateH5MU.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peak_annotation" + }, + "id": "#GenerateH5MU.cwl/Peak_Annotation" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateH5MU.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateH5MU.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GenerateH5MU.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-files" + }, + "id": "#GenerateH5MU.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateH5MU.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateH5MU.cwl/VDJ_Per_Cell" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.h5mu" + }, + "id": "#GenerateH5MU.cwl/H5MU" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateH5MU.cwl/output" + } + ], + "id": "#GenerateH5MU.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [], + "baseCommand": "GenerateSeurat.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-seurat-rds" + }, + "id": "#GenerateSeurat.cwl/ATAC_Seurat" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateSeurat.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateSeurat.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateSeurat.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateSeurat.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateSeurat.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateSeurat.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "loadContents": true, + "id": "#GenerateSeurat.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-csvs" + }, + "id": "#GenerateSeurat.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateSeurat.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateSeurat.cwl/VDJ_Per_Cell" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.rds" + }, + "id": "#GenerateSeurat.cwl/SeuratRDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateSeurat.cwl/output" + } + ], + "id": "#GenerateSeurat.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_get_datatables.py" + ], + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-cell-calling-data-file" + }, + "id": "#GetDataTable.cwl/ATAC_Cell_Calling_Input" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--bioproduct-stats-list" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats_List" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--cell-biop-summary-list" + }, + "id": "#GetDataTable.cwl/Cell_Biop_Summary_List" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--full-gene-list" + }, + "id": "#GetDataTable.cwl/Full_Genes" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GetDataTable.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--seq-metrics" + }, + "id": "#GetDataTable.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "int" + } + ], + "id": "#GetDataTable.cwl/Total_Molecules" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_bioproducts" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_cell_estimate" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_RSEC_MolsPerCell_MEX.zip" + }, + "id": "#GetDataTable.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Bioproduct_Stats.csv" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats" + }, + { + "type": "File", + "outputBinding": { + "glob": "cell_order.json" + }, + "id": "#GetDataTable.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cell_order_subsampled.json" + }, + "id": "#GetDataTable.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#GetDataTable.cwl/Cell_Type_Predictions" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#GetDataTable.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_coordinates.csv" + }, + "id": "#GetDataTable.cwl/Dim_Reduction_Coord" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#GetDataTable.cwl/Metrics_tar" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Protein_Agg/*_Protein_Aggregates_Experimental.csv" + }, + "id": "#GetDataTable.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Cell_Label_Filtering/*_Putative_Cells_Origin.csv" + }, + "id": "#GetDataTable.cwl/Putative_Cells_Origin" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTag/*csv" + }, + "id": "#GetDataTable.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "SampleTag/*_Sample_Tag_Calls.csv" + }, + "id": "#GetDataTable.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTagArchives/*zip" + }, + "id": "#GetDataTable.cwl/SampleTag_perTagZips" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GetDataTable.cwl/output" + } + ], + "id": "#GetDataTable.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "GetMachineResources gets available resources (current only the number of cpus) on the machine that is running the local deployment of the MIST pipeline.\n", + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_machine_resources.py" + ], + "inputs": [], + "outputs": [ + { + "type": "int", + "outputBinding": { + "glob": "resources.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_cpus_avail)" + }, + "id": "#GetMachineResources.cwl/Total_CPUs_Avail" + } + ], + "id": "#GetMachineResources.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/_NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/_Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_VGene_Evalue" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_VGene_Evalue" + } + ], + "expression": "${\n var internalInputs = [\n '_AbSeq_UMI',\n '_MinChunkSize',\n '_NumRecordsPerSplit',\n '_Target_analysis',\n '_Subsample_Sample_Tags',\n '_VDJ_VGene_Evalue',\n '_VDJ_JGene_Evalue',\n ];\n var internalOutputs = {}\n for (var i = 0; i < internalInputs.length; i++) {\n var internalInput = internalInputs[i];\n var internalOutput = internalInput.slice(1); // remove leading underscore\n if (inputs.hasOwnProperty(internalInput)) {\n internalOutputs[internalOutput] = inputs[internalInput]; // if input specified, redirect to output\n } else {\n internalOutputs[internalOutput] = null; // if input not specified, provide a null\n }\n }\n return internalOutputs;\n}", + "id": "#InternalSettings.cwl" + }, + { + "class": "Workflow", + "label": "BD Rhapsody™ Sequence Analysis Pipeline", + "doc": "The BD Rhapsody™ assays are used to create sequencing libraries from single cell transcriptomes.\n\nAfter sequencing, the analysis pipeline takes the FASTQ files and a reference file for gene alignment. The pipeline generates molecular counts per cell, read counts per cell, metrics, and an alignment file.", + "requirements": [ + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "AbSeq Reference", + "id": "#main/AbSeq_Reference" + }, + { + "type": [ + "null", + "int" + ], + "id": "#main/AbSeq_UMI" + }, + { + "label": "Cell Calling ATAC Algorithm", + "doc": "Specify the ATAC algorithm to be used for ATAC putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm", + "symbols": [ + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Basic", + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_ATAC_Algorithm" + }, + { + "label": "Cell Calling Bioproduct Algorithm", + "doc": "Specify the bioproduct algorithm to be used for mRNA/AbSeq putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm", + "symbols": [ + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Basic", + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_Bioproduct_Algorithm" + }, + { + "label": "Cell Calling Data", + "doc": "Specify the data to be used for putative cell calling.\nThe default data for putative cell calling will be determined the following way:\n - If mRNA and ATAC Reads exist, mRNA_and_ATAC is the default.\n - If only ATAC Reads exist, ATAC is the default.\n - Otherwise, mRNA is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Data/Cell_Calling_Data", + "symbols": [ + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA", + "#main/Cell_Calling_Data/Cell_Calling_Data/AbSeq", + "#main/Cell_Calling_Data/Cell_Calling_Data/ATAC", + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA_and_ATAC" + ] + } + ], + "id": "#main/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom STAR Params", + "doc": "Allows you to specify custom STAR aligner mapping parameters. Only the mapping parameters you provide here will be used with STAR, meaning that you must provide the complete list of parameters that you want to take effect. For reference, the parameters used by default in the pipeline are:\n\n 1. Short Reads: --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000\n 2. Long Reads: Same options as short reads + --seedPerReadNmax 10000\n\n\nExample input: --alignIntronMax 500000 --outFilterScoreMinOverLread 0 --limitOutSJcollapsed 2000000\n\nImportant:\n 1. This applies to fastqs provided in the Reads user input\n 2. Please do not specify any non-mapping related params like: --runThreadN, --genomeDir --outSAMtype, etc.\n 3. Please only use params supported by STAR version 2.7.10b\n", + "id": "#main/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom bwa-mem2 Params", + "doc": "Allows you to specify custom bwa-mem2 mapping parameters. Only the mapping parameters you provide here will be used with bwa-mem2, meaning that you must provide the complete list of parameters that you want to take effect. The pipeline uses program default mapping parameters.\n\nExample input: -k 15 -w 200 -r 2\n\nImportant:\n 1. This applies to fastqs provided in the Reads_ATAC user input\n 2. Please do not specify any non-mapping related params like: -C, -t, etc.\n 3. Please only use params supported by bwa-mem2 version 2.2.1\n", + "id": "#main/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "label": "Exact Cell Count", + "doc": "Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count", + "id": "#main/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Exclude Intronic Reads", + "doc": "By default, reads aligned to exons and introns are considered and represented in molecule counts. Including intronic reads may increase sensitivity, resulting in an increase in molecule counts and the number of genes per cell for both cellular and nuclei samples. Intronic reads may indicate unspliced mRNAs and are also useful, for example, in the study of nuclei and RNA velocity. When set to true, intronic reads will be excluded.", + "id": "#main/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Expected Cell Count", + "doc": "Optional. Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected.", + "id": "#main/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Generate Bam Output", + "doc": "Default: false. A Bam read alignment file contains reads from all the input libraries, but creating it can consume a lot of compute and disk resources. By setting this field to true, the Bam file will be created. This option is shared for both Bioproduct and ATAC libraries.\n", + "id": "#main/Generate_Bam" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Long Reads (>=650bp)", + "doc": "By default, we detect if there are any reads longer than 650bp and then flag QualCLAlign to use STARlong instead of STAR. This flag can be explicitly set if it is known in advance that there are reads longer than 650bp.\n", + "id": "#main/Long_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Maximum Number of Threads", + "doc": "The maximum number of threads to use in the pipeline. By default, all available cores are used.", + "id": "#main/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "label": "ATAC Predefined Peak Regions", + "doc": "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. Only applies to ATAC assays.", + "id": "#main/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads", + "doc": "FASTQ files from libraries that may include WTA mRNA, Targeted mRNA, AbSeq, Sample Multiplexing, and related technologies", + "id": "#main/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads-ATAC", + "doc": "FASTQ files from libraries generated using the ATAC assay protocol. Each lane of a library is expected to have 3 FASTQs - R1, R2 and I1/I2, where the index read contains the Cell Barcode and UMI sequence. Only applies to ATAC assays.", + "id": "#main/Reads_ATAC" + }, + { + "type": [ + "null", + "File" + ], + "label": "Reference Files Archive", + "id": "#main/Reference_Archive" + }, + { + "label": "Run Name", + "type": [ + "null", + "string" + ], + "doc": "This is a name for output files, for example Experiment1_Metrics_Summary.csv. Default if left empty is to name run based on a library. Any non-alpha numeric characters will be changed to a hyphen.", + "id": "#main/Run_Name" + }, + { + "label": "Sample Tags Version", + "doc": "The sample multiplexing kit version. This option should only be set for a multiplexed experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Sample_Tags_Version/Sample_Tags_Version", + "symbols": [ + "#main/Sample_Tags_Version/Sample_Tags_Version/human", + "#main/Sample_Tags_Version/Sample_Tags_Version/hs", + "#main/Sample_Tags_Version/Sample_Tags_Version/mouse", + "#main/Sample_Tags_Version/Sample_Tags_Version/mm", + "#main/Sample_Tags_Version/Sample_Tags_Version/flex", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_includes_mrna", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_atac_only", + "#main/Sample_Tags_Version/Sample_Tags_Version/custom" + ] + } + ], + "id": "#main/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Supplemental Reference", + "id": "#main/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "label": "Sample Tag Names", + "doc": "Specify the Sample Tag number followed by - (hyphen) and a sample name to appear in the output files. For example: 4-Ramos. Should be alpha numeric, with + - and _ allowed. Any special characters: &, (), [], {}, <>, ?, | will be corrected to underscores. \n", + "id": "#main/Tag_Names" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Target_analysis" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Targeted Reference", + "id": "#main/Targeted_Reference" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for J gene", + "doc": "The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for V gene", + "doc": "The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_VGene_Evalue" + }, + { + "label": "VDJ Species Version", + "doc": "The VDJ species and chain types. This option should only be set for VDJ experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/VDJ_Version/VDJ_Version", + "symbols": [ + "#main/VDJ_Version/VDJ_Version/human", + "#main/VDJ_Version/VDJ_Version/hs", + "#main/VDJ_Version/VDJ_Version/mouse", + "#main/VDJ_Version/VDJ_Version/mm", + "#main/VDJ_Version/VDJ_Version/humanBCR", + "#main/VDJ_Version/VDJ_Version/humanTCR", + "#main/VDJ_Version/VDJ_Version/mouseBCR", + "#main/VDJ_Version/VDJ_Version/mouseTCR" + ] + } + ], + "id": "#main/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Write_Filtered_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeATAC/ATAC_out", + "id": "#main/ATAC" + }, + { + "label": "BAM files and indices", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Bam" + }, + { + "label": "Bioproduct Statistics", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/Bioproduct_Stats" + }, + { + "label": "Data Tables", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/GetDataTable/Data_Tables", + "id": "#main/Data_Tables" + }, + { + "label": "Dimensionality Reduction Coordinates", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_ATAC" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_RNA/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_RNA" + }, + { + "label": "Scanpy-Muon H5MU File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateH5MU/H5MU", + "id": "#main/H5MU" + }, + { + "label": "Immune Cell Classification (Experimental)", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Immune_Cell_Classification(Experimental)" + }, + { + "label": "Pipeline Logs", + "type": "Directory", + "outputSource": "#main/BundleLogs/logs_dir", + "id": "#main/Logs" + }, + { + "label": "Metrics Summary", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Metrics_Summary", + "id": "#main/Metrics_Summary" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeMultiplex/Multiplex_out", + "id": "#main/Multiplexing" + }, + { + "label": "Pipeline Report HTML", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Pipeline_Report_HTML", + "id": "#main/Pipeline_Report_HTML" + }, + { + "label": "Protein Aggregates (Experimental)", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/Protein_Aggregates_Experimental" + }, + { + "label": "Seurat RDS File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateSeurat/SeuratRDS", + "id": "#main/Seurat" + }, + { + "label": "vdjCellsDatatable", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/vdjCellsDatatable" + }, + { + "label": "vdjCellsDatatableUncorrected", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "id": "#main/vdjCellsDatatableUncorrected" + }, + { + "label": "vdjDbecFilterImages", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDbecFilterImages", + "id": "#main/vdjDbecFilterImages" + }, + { + "label": "vdjDominantContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "id": "#main/vdjDominantContigsAIRR" + }, + { + "label": "vdjMetricsCsv", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjMetricsCsv", + "id": "#main/vdjMetricsCsv" + }, + { + "label": "vdjUnfilteredContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "id": "#main/vdjUnfilteredContigsAIRR" + } + ], + "steps": [ + { + "run": "#ATAC_Cell_by_Peak.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Cell_by_Peak/Assay" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Cell_by_Peak/Fragments" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/ATAC_Cell_by_Peak/GTF" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/ATAC_Cell_by_Peak/Peaks" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Cell_by_Peak/Run_Metadata" + }, + { + "source": "#main/QualCLAlign_ATAC/Transposase_Sites", + "id": "#main/ATAC_Cell_by_Peak/Transposase_Sites" + } + ], + "out": [ + "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "#main/ATAC_Cell_by_Peak/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Cell_by_Peak" + }, + { + "run": "#ATAC_Compile_Results.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Compile_Results/Assay" + }, + { + "source": "#main/GetDataTable/Biop_putative_data_table", + "id": "#main/ATAC_Compile_Results/Biop_putative_data_table" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/ATAC_Compile_Results/Cell_Order" + }, + { + "source": "#main/GetDataTable/Cell_Order_Subsampled", + "id": "#main/ATAC_Compile_Results/Cell_Order_Subsampled" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Compile_Results/Fragments" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "id": "#main/ATAC_Compile_Results/Initial_Seurat_RDS" + }, + { + "source": "#main/GetDataTable/Metrics_tar", + "id": "#main/ATAC_Compile_Results/Input_Metrics_tar" + }, + { + "source": "#main/QualCLAlign_ATAC/Reference_Genome_Size", + "id": "#main/ATAC_Compile_Results/Reference_Genome_Size" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Compile_Results/Run_Metadata" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "id": "#main/ATAC_Compile_Results/Total_Fragment_Metrics" + }, + { + "source": "#main/QualCLAlign_ATAC/UnifiedMetrics", + "id": "#main/ATAC_Compile_Results/Unified_Metrics" + } + ], + "out": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "#main/ATAC_Compile_Results/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Compile_Results" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/AddtoBam_ATAC/Assay" + }, + { + "source": [ + "#main/QualCLAlign_ATAC/BAMFiles" + ], + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_ATAC/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_ATAC/Cell_Order" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_ATAC/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_ATAC/SampleTag_Calls" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/AddtoBam_ATAC/Annotated_Bam", + "#main/AddtoBam_ATAC/output" + ], + "scatter": [ + "#main/AddtoBam_ATAC/Bam" + ], + "id": "#main/AddtoBam_ATAC" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AddtoBam_RNA/Assay" + }, + { + "source": "#main/AlignmentAnalysis/Annotated_Bam_Files", + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_RNA/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_RNA/Cell_Order" + }, + { + "source": "#main/AnnotateMolecules/Corrected_Mols_List", + "id": "#main/AddtoBam_RNA/Corrected_Mols" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_RNA/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_RNA/SampleTag_Calls" + }, + { + "source": "#main/CheckReference/Target_Gene_Mapping", + "id": "#main/AddtoBam_RNA/Target_Gene_Mapping" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/AddtoBam_RNA/Annotated_Bam", + "#main/AddtoBam_RNA/output" + ], + "scatter": [ + "#main/AddtoBam_RNA/Bam" + ], + "id": "#main/AddtoBam_RNA" + }, + { + "run": "#AlignmentAnalysis.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AlignmentAnalysis/Assay" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/AlignmentAnalysis/Exclude_Intronic_Reads" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/AlignmentAnalysis/Extra_Seqs" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/AlignmentAnalysis/GTF" + }, + { + "source": "#main/Maximum_Threads", + "id": "#main/AlignmentAnalysis/Maximum_Threads" + }, + { + "source": "#main/QualCLAlign_RNA/BAMFiles", + "id": "#main/AlignmentAnalysis/R2_BAM" + }, + { + "source": "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "id": "#main/AlignmentAnalysis/ReadQualityMetrics" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AlignmentAnalysis/Run_Metadata" + }, + { + "source": "#main/CheckReference/Transcript_Length", + "id": "#main/AlignmentAnalysis/Transcript_Length" + } + ], + "out": [ + "#main/AlignmentAnalysis/Seq_Metrics", + "#main/AlignmentAnalysis/Annotated_Bam_Files", + "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "#main/AlignmentAnalysis/num_valid_ig_reads", + "#main/AlignmentAnalysis/num_valid_tcr_reads", + "#main/AlignmentAnalysis/validIgReads", + "#main/AlignmentAnalysis/validTcrReads", + "#main/AlignmentAnalysis/num_cell_estimate", + "#main/AlignmentAnalysis/num_bioproducts" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AlignmentAnalysis" + }, + { + "requirements": [ + { + "ramMin": 32000, + "class": "ResourceRequirement" + } + ], + "run": "#AnnotateMolecules.cwl", + "in": [ + { + "source": "#main/Internal_Settings/AbSeq_UMI", + "id": "#main/AnnotateMolecules/AbSeq_UMI" + }, + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AnnotateMolecules/Assay" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AnnotateMolecules/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "default": [ + "does_not_exist" + ], + "id": "#main/AnnotateMolecules/Valids" + } + ], + "out": [ + "#main/AnnotateMolecules/Bioproduct_Stats_List", + "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "#main/AnnotateMolecules/Corrected_Mols_List", + "#main/AnnotateMolecules/Total_Molecules", + "#main/AnnotateMolecules/output" + ], + "scatter": [ + "#main/AnnotateMolecules/Valids" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AnnotateMolecules" + }, + { + "run": "#Assay_Settings.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/AbSeq_Reference_Present" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_RNA_Present" + }, + { + "source": "#main/Reference_Archive", + "valueFrom": "${ if (self){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Reference_Archive_Present" + }, + { + "source": "#main/Targeted_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Targeted_Reference_Present" + } + ], + "out": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "id": "#main/Assay_Settings" + }, + { + "label": "Bam Settings", + "run": "#BamSettings.cwl", + "in": [ + { + "source": "#main/Generate_Bam", + "id": "#main/Bam_Settings/_Generate_Bam" + } + ], + "out": [ + "#main/Bam_Settings/Generate_Bam" + ], + "id": "#main/Bam_Settings" + }, + { + "run": "#BundleLogs.cwl", + "in": [ + { + "source": [ + "#main/CheckReference/output", + "#main/GetDataTable/output", + "#main/ATAC_Cell_by_Peak/output", + "#main/ATAC_Compile_Results/output", + "#main/Metrics/output", + "#main/AddtoBam_RNA/output", + "#main/AddtoBam_ATAC/output", + "#main/AnnotateMolecules/output", + "#main/MergeBAM_RNA/log", + "#main/MergeBAM_ATAC/log", + "#main/GenerateH5MU/output", + "#main/GenerateSeurat/output", + "#main/Peak_Annotation/output" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/BundleLogs/log_files" + } + ], + "out": [ + "#main/BundleLogs/logs_dir" + ], + "id": "#main/BundleLogs" + }, + { + "requirements": [ + { + "ramMin": 10000, + "class": "ResourceRequirement" + } + ], + "run": "#CheckReference.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/CheckReference/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/CheckReference/Assay" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/CheckReference/Cell_Calling_Data" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/CheckReference/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/CheckReference/Reference_Archive" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/CheckReference/Sample_Tags_Version" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/CheckReference/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/CheckReference/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/CheckReference/VDJ_Version" + } + ], + "out": [ + "#main/CheckReference/Index", + "#main/CheckReference/Extra_Seqs", + "#main/CheckReference/Full_Genes", + "#main/CheckReference/output", + "#main/CheckReference/Transcript_Length", + "#main/CheckReference/GTF", + "#main/CheckReference/Target_Gene_Mapping", + "#main/CheckReference/Checked_Predefined_Peaks", + "#main/CheckReference/Checked_Predefined_Peaks_Index" + ], + "id": "#main/CheckReference" + }, + { + "run": "#GenerateH5MU.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "id": "#main/GenerateH5MU/Atac_Datatables" + }, + { + "source": "#main/Metrics/Metrics_ATAC", + "id": "#main/GenerateH5MU/Atac_Metrics" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateH5MU/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateH5MU/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Dim_Reduction_Coord" + }, + { + "source": "#main/Metrics/Metrics_Summary", + "id": "#main/GenerateH5MU/Metrics_Summary" + }, + { + "source": "#main/Peak_Annotation/Peak_Annotation_TSV", + "id": "#main/GenerateH5MU/Peak_Annotation" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateH5MU/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateH5MU/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateH5MU/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateH5MU/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateH5MU/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateH5MU/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateH5MU/H5MU", + "#main/GenerateH5MU/output" + ], + "id": "#main/GenerateH5MU" + }, + { + "run": "#GenerateSeurat.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "id": "#main/GenerateSeurat/ATAC_Seurat" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateSeurat/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateSeurat/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Dim_Reduction_Coord" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateSeurat/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateSeurat/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateSeurat/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateSeurat/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateSeurat/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateSeurat/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateSeurat/SeuratRDS", + "#main/GenerateSeurat/output" + ], + "id": "#main/GenerateSeurat" + }, + { + "run": "#GetDataTable.cwl", + "in": [ + { + "source": "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "id": "#main/GetDataTable/ATAC_Cell_Calling_Input" + }, + { + "source": "#main/AnnotateMolecules/Bioproduct_Stats_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Bioproduct_Stats_List" + }, + { + "source": "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Cell_Biop_Summary_List" + }, + { + "source": "#main/CheckReference/Full_Genes", + "id": "#main/GetDataTable/Full_Genes" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GetDataTable/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/GetDataTable/Seq_Metrics" + }, + { + "source": "#main/AnnotateMolecules/Total_Molecules", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Total_Molecules" + }, + { + "source": "#main/AlignmentAnalysis/num_bioproducts", + "id": "#main/GetDataTable/num_bioproducts" + }, + { + "source": "#main/AlignmentAnalysis/num_cell_estimate", + "id": "#main/GetDataTable/num_cell_estimate" + } + ], + "out": [ + "#main/GetDataTable/Metrics_tar", + "#main/GetDataTable/Bioproduct_Stats", + "#main/GetDataTable/Cell_Order", + "#main/GetDataTable/Cell_Order_Subsampled", + "#main/GetDataTable/Cell_Type_Predictions", + "#main/GetDataTable/Data_Tables", + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/GetDataTable/output", + "#main/GetDataTable/Protein_Aggregates_Experimental", + "#main/GetDataTable/Putative_Cells_Origin", + "#main/GetDataTable/SampleTag_Calls", + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips", + "#main/GetDataTable/Biop_putative_data_table" + ], + "id": "#main/GetDataTable" + }, + { + "label": "Get Machine Resources", + "run": "#GetMachineResources.cwl", + "in": [], + "out": [ + "#main/GetMachineResources/Total_CPUs_Avail" + ], + "id": "#main/GetMachineResources" + }, + { + "label": "Internal Settings", + "run": "#InternalSettings.cwl", + "in": [ + { + "source": "#main/AbSeq_UMI", + "id": "#main/Internal_Settings/_AbSeq_UMI" + }, + { + "source": "#main/Target_analysis", + "id": "#main/Internal_Settings/_Target_analysis" + }, + { + "source": "#main/VDJ_JGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_JGene_Evalue" + }, + { + "source": "#main/VDJ_VGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_VGene_Evalue" + } + ], + "out": [ + "#main/Internal_Settings/AbSeq_UMI", + "#main/Internal_Settings/Target_analysis", + "#main/Internal_Settings/VDJ_VGene_Evalue", + "#main/Internal_Settings/VDJ_JGene_Evalue" + ], + "id": "#main/Internal_Settings" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeATAC/run/ATAC_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeATAC/run/ATAC_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.ATAC_Files.length; i++) {\n var fp = inputs.ATAC_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"ATAC_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Peaks", + "#main/Metrics/Metrics_ATAC", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/Peak_Annotation/Peak_Annotation_TSV" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeATAC/ATAC_Files" + } + ], + "out": [ + "#main/MergeATAC/ATAC_out" + ], + "id": "#main/MergeATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/MergeBAM_ATAC/Assay" + }, + { + "source": "#main/AddtoBam_ATAC/Annotated_Bam", + "id": "#main/MergeBAM_ATAC/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_ATAC/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex", + "#main/MergeBAM_ATAC/log" + ], + "id": "#main/MergeBAM_ATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/MergeBAM_RNA/Assay" + }, + { + "source": "#main/AddtoBam_RNA/Annotated_Bam", + "id": "#main/MergeBAM_RNA/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_RNA/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_RNA/log" + ], + "id": "#main/MergeBAM_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeMultiplex/run/SampleTag_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeMultiplex/run/Multiplex_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.SampleTag_Files.length; i++) {\n var fp = inputs.SampleTag_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"Multiplex_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeMultiplex/SampleTag_Files" + } + ], + "out": [ + "#main/MergeMultiplex/Multiplex_out" + ], + "id": "#main/MergeMultiplex" + }, + { + "run": "#Metadata.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/Metadata_Settings/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Assay" + }, + { + "source": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_ATAC/Bead_Version" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Bead_Version" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/Metadata_Settings/Cell_Calling_Data" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/Metadata_Settings/Custom_STAR_Params" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/Metadata_Settings/Custom_bwa_mem2_Params" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "id": "#main/Metadata_Settings/Exact_Cell_Count" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/Metadata_Settings/Exclude_Intronic_Reads" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count", + "id": "#main/Metadata_Settings/Expected_Cell_Count" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/Metadata_Settings/Generate_Bam" + }, + { + "source": "#main/QualCLAlign_RNA/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries" + }, + { + "source": "#main/QualCLAlign_ATAC/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries_ATAC" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/Metadata_Settings/Long_Reads" + }, + { + "valueFrom": "BD Rhapsody Sequence Analysis Pipeline", + "id": "#main/Metadata_Settings/Pipeline_Name" + }, + { + "source": "#main/Version/version", + "id": "#main/Metadata_Settings/Pipeline_Version" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/Metadata_Settings/Predefined_ATAC_Peaks" + }, + { + "source": "#main/QualCLAlign_RNA/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads" + }, + { + "source": "#main/QualCLAlign_ATAC/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads_ATAC" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/Metadata_Settings/Reference_Archive" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/Metadata_Settings/Run_Base_Name" + }, + { + "source": "#main/Name_Settings/Run_Name", + "id": "#main/Metadata_Settings/Run_Name" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tag_Names", + "id": "#main/Metadata_Settings/Sample_Tag_Names" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/Metadata_Settings/Sample_Tags_Version" + }, + { + "source": "#main/Start_Time/Start_Time", + "id": "#main/Metadata_Settings/Start_Time" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/Metadata_Settings/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/Metadata_Settings/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/Metadata_Settings/VDJ_Version" + } + ], + "out": [ + "#main/Metadata_Settings/Run_Metadata" + ], + "id": "#main/Metadata_Settings" + }, + { + "requirements": [ + { + "ramMin": 4000, + "class": "ResourceRequirement" + } + ], + "run": "#Metrics.cwl", + "in": [ + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metrics/Cell_Type_Predictions" + }, + { + "source": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/GetDataTable/Metrics_tar" + ], + "linkMerge": "merge_flattened", + "pickValue": "first_non_null", + "id": "#main/Metrics/Metrics_tar" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Metrics/Run_Metadata" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/Metrics/vdjCellsDatatable" + }, + { + "source": "#main/VDJ_Compile_Results/vdjMetricsJson", + "id": "#main/Metrics/vdjMetricsJson" + } + ], + "out": [ + "#main/Metrics/Metrics_Summary", + "#main/Metrics/Metrics_Archive", + "#main/Metrics/Metrics_ATAC", + "#main/Metrics/Pipeline_Report_JSON", + "#main/Metrics/Pipeline_Report_HTML", + "#main/Metrics/output" + ], + "id": "#main/Metrics" + }, + { + "label": "Miscellaneous Settings", + "run": "#MiscSettings.cwl", + "in": [ + { + "source": "#main/Custom_STAR_Params", + "id": "#main/Misc_Settings/_Custom_STAR_Params" + }, + { + "source": "#main/Custom_bwa_mem2_Params", + "id": "#main/Misc_Settings/_Custom_bwa_mem2_Params" + }, + { + "source": "#main/Exclude_Intronic_Reads", + "id": "#main/Misc_Settings/_Exclude_Intronic_Reads" + }, + { + "source": "#main/Long_Reads", + "id": "#main/Misc_Settings/_Long_Reads" + } + ], + "out": [ + "#main/Misc_Settings/Exclude_Intronic_Reads", + "#main/Misc_Settings/Long_Reads", + "#main/Misc_Settings/Custom_STAR_Params", + "#main/Misc_Settings/Custom_bwa_mem2_Params" + ], + "id": "#main/Misc_Settings" + }, + { + "label": "Multiplexing Settings", + "run": "#MultiplexingSettings.cwl", + "in": [ + { + "source": "#main/Tag_Names", + "id": "#main/Multiplexing_Settings/_Sample_Tag_Names" + }, + { + "source": "#main/Sample_Tags_Version", + "id": "#main/Multiplexing_Settings/_Sample_Tags_Version" + } + ], + "out": [ + "#main/Multiplexing_Settings/Sample_Tag_Names", + "#main/Multiplexing_Settings/Sample_Tags_Version" + ], + "id": "#main/Multiplexing_Settings" + }, + { + "label": "Name Settings", + "run": "#NameSettings.cwl", + "in": [ + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/ATAC_Fastqs" + }, + { + "source": "#main/Reads", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/Bioproduct_Fastqs" + }, + { + "source": "#main/Run_Name", + "id": "#main/Name_Settings/_Run_Name" + } + ], + "out": [ + "#main/Name_Settings/Run_Name", + "#main/Name_Settings/Run_Base_Name" + ], + "id": "#main/Name_Settings" + }, + { + "run": "#PeakAnnotation.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/Peak_Annotation/Assay" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/Peak_Annotation/Gtf" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/Peak_Annotation/Peaks_bed" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Peak_Annotation/Run_Metadata" + } + ], + "out": [ + "#main/Peak_Annotation/Peak_Annotation_TSV", + "#main/Peak_Annotation/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/Peak_Annotation" + }, + { + "label": "Putative Cell Calling Settings", + "run": "#PutativeCellSettings.cwl", + "in": [ + { + "source": "#main/Cell_Calling_ATAC_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Cell_Calling_Data", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Data" + }, + { + "source": "#main/Exact_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Exact_Cell_Count" + }, + { + "source": "#main/Expected_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Expected_Cell_Count" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_RNA_Present" + } + ], + "out": [ + "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count" + ], + "id": "#main/Putative_Cell_Calling_Settings" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/QualCLAlign_ATAC/Assay" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/QualCLAlign_ATAC/Generate_Bam" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_ATAC/Index" + }, + { + "source": "#main/CheckReference/Checked_Predefined_Peaks", + "id": "#main/QualCLAlign_ATAC/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reads_ATAC", + "id": "#main/QualCLAlign_ATAC/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_ATAC/Run_Base_Name" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_RNA" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_rna] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_rna == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_ATAC/Threads" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_ATAC/Write_Filtered_Reads" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/QualCLAlign_ATAC/bwa_mem2_Params" + } + ], + "out": [ + "#main/QualCLAlign_ATAC/Bead_Version", + "#main/QualCLAlign_ATAC/Libraries", + "#main/QualCLAlign_ATAC/ReadsList", + "#main/QualCLAlign_ATAC/BAMFiles", + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Fragments_Index", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Transposase_Sites_Index", + "#main/QualCLAlign_ATAC/Peaks", + "#main/QualCLAlign_ATAC/Peaks_Index", + "#main/QualCLAlign_ATAC/QualCLAlignMetrics", + "#main/QualCLAlign_ATAC/UnifiedMetrics", + "#main/QualCLAlign_ATAC/Logs", + "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "#main/QualCLAlign_ATAC/Reference_Genome_Size" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/QualCLAlign_ATAC" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/QualCLAlign_RNA/Assay" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/QualCLAlign_RNA/Extra_Seqs" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_RNA/Index" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/QualCLAlign_RNA/Long_Reads" + }, + { + "source": "#main/Reads", + "id": "#main/QualCLAlign_RNA/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_RNA/Run_Base_Name" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/QualCLAlign_RNA/STAR_Params" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_ATAC" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_atac] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_atac == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_RNA/Threads" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/QualCLAlign_RNA/VDJ_Version" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_RNA/Write_Filtered_Reads" + } + ], + "out": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_RNA/Libraries", + "#main/QualCLAlign_RNA/ReadsList", + "#main/QualCLAlign_RNA/BAMFiles", + "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "#main/QualCLAlign_RNA/Logs", + "#main/QualCLAlign_RNA/Failed_Reads_CSVs" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/QualCLAlign_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [], + "outputs": [ + { + "type": "string", + "id": "#main/Start_Time/run/Start_Time" + } + ], + "expression": "${\n var today = new Date();\n var date = today.toDateString();\n var time = today.toLocaleTimeString('en-us', {timeZoneName: 'short'});\n return ({Start_Time: date + ' ' + time});\n} " + }, + "in": [], + "out": [ + "#main/Start_Time/Start_Time" + ], + "id": "#main/Start_Time" + }, + { + "run": "#VDJ_Analyze_Reads_IG.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanTCR\" && inputs.VDJ_Version != \"mouseTCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_IG/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_ig_reads", + "id": "#main/VDJ_Analyze_Reads_IG/Num_Valid_Reads_IG" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_IG/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validIgReads", + "id": "#main/VDJ_Analyze_Reads_IG/Valid_Reads_Fastq_IG" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_IG/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_IG" + }, + { + "run": "#VDJ_Analyze_Reads_TCR.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanBCR\" && inputs.VDJ_Version != \"mouseBCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_TCR/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_tcr_reads", + "id": "#main/VDJ_Analyze_Reads_TCR/Num_Valid_Reads_TCR" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_TCR/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validTcrReads", + "id": "#main/VDJ_Analyze_Reads_TCR/Valid_Reads_Fastq_TCR" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_TCR/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_TCR" + }, + { + "run": "#VDJ_Compile_Results.cwl", + "when": "$(inputs.VDJ_Version != null)", + "in": [ + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/VDJ_Compile_Results/Seq_Metrics" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Compile_Results/VDJ_Version" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/VDJ_Compile_Results/cellTypeMapping" + }, + { + "valueFrom": "$([])", + "id": "#main/VDJ_Compile_Results/chainsToIgnore" + }, + { + "source": "#main/Internal_Settings/VDJ_JGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueJgene" + }, + { + "source": "#main/Internal_Settings/VDJ_VGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueVgene" + }, + { + "source": "#main/VDJ_Analyze_Reads_IG/gatheredCalls", + "id": "#main/VDJ_Compile_Results/igCalls" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/VDJ_Compile_Results/metadata" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/VDJ_Compile_Results/putativeCells" + }, + { + "source": "#main/VDJ_Analyze_Reads_TCR/gatheredCalls", + "id": "#main/VDJ_Compile_Results/tcrCalls" + } + ], + "out": [ + "#main/VDJ_Compile_Results/vdjCellsDatatable", + "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "#main/VDJ_Compile_Results/vdjMetricsJson", + "#main/VDJ_Compile_Results/vdjMetricsCsv", + "#main/VDJ_Compile_Results/vdjDbecFilterImages" + ], + "id": "#main/VDJ_Compile_Results" + }, + { + "label": "VDJ Settings", + "run": "#VDJ_Settings.cwl", + "in": [ + { + "source": "#main/VDJ_Version", + "id": "#main/VDJ_Settings/_VDJ_Version" + } + ], + "out": [ + "#main/VDJ_Settings/VDJ_Version" + ], + "id": "#main/VDJ_Settings" + }, + { + "run": "#Version.cwl", + "in": [], + "out": [ + "#main/Version/version" + ], + "id": "#main/Version" + } + ], + "id": "#main" + }, + { + "requirements": [ + { + "listing": [ + { + "entryname": "bam_files.txt", + "entry": "${\n function getBamsInSortedOrder(inputs) {\n // Create an associative array to hold the mapping from basename to full path\n var fileMap = {};\n\n // Extract basenames and map them to their full paths\n for (var i = 0; i < inputs.BamFiles.length; i++) {\n var file = inputs.BamFiles[i].path;\n var basename = file.split('/').pop();\n fileMap[basename] = file;\n }\n\n // Sort the basenames numerically\n // This works because all bams share the same prefix and have a numerical id :\n // i.e. foobar.0001.tagged.bam, foobar.0002.tagged.bam, foobar.0010.tagged.bam, etc.\n // so will be sorted by numerical ids\n var sortedBasenames = Object.keys(fileMap).sort(function(a, b) {\n return a.localeCompare(b, undefined, { numeric: true });\n });\n\n // Reconstruct the sorted full paths\n var sortedBamFiles = sortedBasenames.map(function(basename) {\n return fileMap[basename];\n });\n\n // Create a file of file names - 1 per line\n return sortedBamFiles.join('\\n');\n }\n\n // For ATAC we cat the bams so need them in a particular order\n if (inputs.Assay == \"ATAC\") {\n return getBamsInSortedOrder(inputs);\n }\n else {\n return inputs.BamFiles.map(function(file) {\n return file.path;\n }).join('\\n');\n }\n}", + "writable": false + } + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 16000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "MergeBam.sh" + ], + "stderr": "merge_bam.log", + "inputs": [ + { + "type": "string", + "inputBinding": { + "position": 1 + }, + "id": "#MergeBAM.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#MergeBAM.cwl/BamFiles" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MergeBAM.cwl/Generate_Bam" + }, + { + "type": "File", + "loadContents": true, + "id": "#MergeBAM.cwl/Run_Metadata" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "${\n var st_version = JSON.parse(inputs.Run_Metadata.contents).Sample_Tags_Version\n if (st_version)\n {\n return st_version\n } else\n {\n return \"None\"\n }\n}" + }, + { + "position": 3, + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + }, + { + "position": 4, + "valueFrom": "$(runtime.cores)" + }, + { + "position": 5, + "valueFrom": "bam_files.txt" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam\" }" + }, + "id": "#MergeBAM.cwl/Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam.bai\" }" + }, + "id": "#MergeBAM.cwl/BamIndex" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#MergeBAM.cwl/log" + } + ], + "id": "#MergeBAM.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "id": "#Metadata.cwl/Assay" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#Metadata.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#Metadata.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "id": "#Metadata.cwl/Bead_Version" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Generate_Bam" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries_ATAC" + }, + { + "type": "string", + "id": "#Metadata.cwl/Long_Reads" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Name" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#Metadata.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads_ATAC" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "id": "#Metadata.cwl/Reference_Archive" + }, + { + "type": "string", + "id": "#Metadata.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Run_Name" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Start_Time" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "run_metadata.json" + }, + "id": "#Metadata.cwl/Run_Metadata" + } + ], + "stdout": "run_metadata.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var metadata = inputs;\n var all_bv = {};\n var customer_bv = \"Original (V1)\";\n var detected_bv = \"V1\";\n for (var i = 0; i < inputs.Bead_Version.length; i++) {\n var BeadVer = inputs.Bead_Version[i];\n var Library = BeadVer[\"Library\"];\n var bead_version = BeadVer[\"bead_version\"];\n all_bv[Library] = bead_version \n var short_bv = bead_version.substring(0, 5);\n if (short_bv == \"Enh\") {\n customer_bv = \"Enhanced\";\n detected_bv = \"Enh\";\n }\n else if (short_bv == \"EnhV2\") {\n customer_bv = \"Enhanced V2/V3\";\n detected_bv = \"EnhV2\";\n }\n }\n metadata[\"Bead_Version\"] = all_bv;\n metadata[\"Bead_Version_Detected\"] = detected_bv;\n\n var pipeline_name = inputs.Pipeline_Name;\n var version = inputs.Pipeline_Version;\n var time = inputs.Start_Time;\n var libraries = inputs.Libraries;\n if(libraries == null){\n libraries = [\"None\"];\n }\n var libraries_atac = inputs.Libraries_ATAC\n if(libraries_atac == null){\n libraries_atac = [\"None\"];\n }\n\n var i = 0;\n var refs_mrna_inputs = [];\n var mrna_name = \"mRNA Reference\";\n if (inputs.Targeted_Reference != null) {\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Targeted_Reference);\n mrna_name = \"Targeted Reference\";\n }\n if(inputs.Reference_Archive != null){\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Reference_Archive);\n mrna_name = \"Reference Archive\";\n }\n var refs_mrna = [];\n if (refs_mrna_inputs.length > 0) {\n for (i = 0; i < refs_mrna_inputs.length; i++) {\n if (refs_mrna_inputs[i] != null) {\n refs_mrna.push(refs_mrna_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_mrna = [\"None\"];\n }\n\n var refs_abseq_inputs = [];\n if (inputs.AbSeq_Reference != null) {\n refs_abseq_inputs = refs_abseq_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_abseq = [];\n if (refs_abseq_inputs.length > 0) {\n for (i = 0; i < refs_abseq_inputs.length; i++) {\n if (refs_abseq_inputs[i] != null) {\n refs_abseq.push(refs_abseq_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_abseq = [\"None\"];\n }\n\n var refs_supp_inputs = [];\n if (inputs.Supplemental_Reference != null) {\n refs_supp_inputs = refs_supp_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_supp = [];\n if (refs_supp_inputs.length > 0) {\n for (i = 0; i < refs_supp_inputs.length; i++) {\n if (refs_supp_inputs[i] != null) {\n refs_supp.push(refs_supp_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_supp = [\"None\"];\n }\n\n if (inputs.Predefined_ATAC_Peaks != null) {\n var predef_atac_peaks = inputs.Predefined_ATAC_Peaks[\"basename\"];\n } else {\n var predef_atac_peaks = \"None\";\n }\n\n var parameters = [];\n if(inputs.Sample_Tags_Version != null){\n var tags = \"Sample Tag Version: \" + inputs.Sample_Tags_Version;\n } else{ \n var tags = \"Sample Tag Version: None\";\n }\n parameters.push(tags);\n\n if(inputs.Sample_Tag_Names != null){\n var tag_names = inputs.Sample_Tag_Names.join(\"; \")\n var tag_list = \"Sample Tag Names: \" + tag_names;\n } else{\n var tag_list = \"Sample Tag Names: None\";\n }\n parameters.push(tag_list);\n \n if(inputs.VDJ_Version != null){\n var vdj = \"VDJ Version: \" + inputs.VDJ_Version;\n } else{ \n var vdj = \"VDJ Version: None\";\n }\n parameters.push(vdj)\n\n if (inputs.Cell_Calling_Data == 0) {\n var call = \"Putative Cell Calling Data: mRNA\";\n } else if (inputs.Cell_Calling_Data == 1) {\n var call = \"Putative Cell Calling Data: AbSeq\";\n } else if (inputs.Cell_Calling_Data == 2) {\n var call = \"Putative Cell Calling Data: mRNA_and_AbSeq\";\n } else if (inputs.Cell_Calling_Data == 3) {\n var call = \"Putative Cell Calling Data: mRNA_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 4) {\n var call = \"Putative Cell Calling Data: AbSeq_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 5) {\n var call = \"Putative Cell Calling Data: ATAC\";\n } else {\n var call = \"Putative Cell Calling Data: None\";\n }\n parameters.push(call)\n\n if (inputs.Cell_Calling_Bioproduct_Algorithm != null) {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: \" + inputs.Cell_Calling_Bioproduct_Algorithm;\n } else {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: None\";\n }\n parameters.push(bioproduct_alg)\n\n if (inputs.Cell_Calling_ATAC_Algorithm != null) {\n var atac_alg = \"ATAC Cell Calling Algorithm: \" + inputs.Cell_Calling_ATAC_Algorithm;\n } else {\n var atac_alg = \"ATAC Cell Calling Algorithm: None\";\n }\n parameters.push(atac_alg)\n\n if(inputs.Exclude_Intronic_Reads){\n var introns = \"Exclude Intronic Reads: On\";\n } else{\n var introns = \"Exclude Intronic Reads: Off\";\n }\n parameters.push(introns)\n\n if(inputs.Generate_Bam){\n var generateBam = \"Generate Bam: On\";\n } else{\n var generateBam = \"Generate Bam: Off\";\n }\n parameters.push(generateBam)\n\n if(inputs.Exact_Cell_Count != null){\n var exactCells = \"Exact Cell Count: \" + inputs.Exact_Cell_Count;\n } else{\n var exactCells = \"Exact Cell Count: None\";\n }\n parameters.push(exactCells)\n\n if(inputs.Expected_Cell_Count != null){\n var expectedCells = \"Expected Cell Count: \" + inputs.Expected_Cell_Count;\n } else{\n var expectedCells = \"Expected Cell Count: None\";\n }\n parameters.push(expectedCells);\n\n var longReads = \"Long Reads: \" + inputs.Long_Reads;\n parameters.push(longReads);\n\n if (inputs.Custom_STAR_Params != null)\n {\n var starParams = \"Custom STAR Params: \" + inputs.Custom_STAR_Params;\n } else {\n var starParams = \"Custom STAR Params: None\"; \n }\n parameters.push(starParams);\n\n if (inputs.Custom_bwa_mem2_Params != null)\n {\n var bwaParams = \"Custom bwa-mem2 Params: \" + inputs.Custom_bwa_mem2_Params;\n } else {\n var bwaParams = \"Custom bwa-mem2 Params: None\";\n }\n parameters.push(bwaParams);\n\n var run_name = inputs.Run_Name;\n var run_base_name = inputs.Run_Base_Name;\n\n var header = [\"####################\"];\n header.push(\"## \" + pipeline_name + \" Version \" + version);\n header.push(\"## Analysis Date - \" + time);\n header.push(\"## Libraries - Bioproduct Libraries: \" + libraries.join('; ') + \" | ATAC Libraries: \" + libraries_atac.join('; ') + \" | Bead version detected: \" + customer_bv);\n header.push(\"## References - \" + mrna_name + \": \" + refs_mrna.join('; ') + \" | AbSeq Reference: \" + refs_abseq.join('; ') + \" | Supplemental Reference: \" + refs_supp.join('; ') + \" | ATAC Predefined Peak Regions: \" + predef_atac_peaks);\n header.push(\"## Parameters - \" + parameters.join(' | '));\n header.push(\"####################\");\n metadata[\"Output_Header\"] = header;\n metadata[\"Run_Name\"] = run_name \n metadata[\"Run_Base_Name\"] = run_base_name;\n\n var metadata_json = JSON.stringify(metadata, null, 2);\n\n return metadata_json;\n}\n" + } + ], + "id": "#Metadata.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_metrics.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-file", + "itemSeparator": "," + }, + "id": "#Metrics.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--metrics-tar" + }, + "id": "#Metrics.cwl/Metrics_tar" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#Metrics.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell-file" + }, + "id": "#Metrics.cwl/vdjCellsDatatable" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-metrics-fp" + }, + "id": "#Metrics.cwl/vdjMetricsJson" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": [ + "*_ATAC_Metrics.json", + "*_ATAC_Metrics.csv" + ] + }, + "id": "#Metrics.cwl/Metrics_ATAC" + }, + { + "type": "File", + "outputBinding": { + "glob": "internal-metrics-archive.tar.gz" + }, + "id": "#Metrics.cwl/Metrics_Archive" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Metrics_Summary.csv" + }, + "id": "#Metrics.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.html" + }, + "id": "#Metrics.cwl/Pipeline_Report_HTML" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.json" + }, + "id": "#Metrics.cwl/Pipeline_Report_JSON" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#Metrics.cwl/output" + } + ], + "id": "#Metrics.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Long_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Long_Reads" + } + ], + "expression": "${\n // the exclude intronic reads flag defaults to false\n var excludeIntronicReads = false;\n // the user can set the flag to exclude intronic reads\n if (inputs._Exclude_Intronic_Reads) {\n excludeIntronicReads = inputs._Exclude_Intronic_Reads;\n }\n\n // Use_Long_Reads default is autodetect, which happens in CheckFastqs\n // User can set this explicitly true or false\n // Convert boolean results to string. null -> \"auto\", true -> \"true\", false -> \"false\"\n var longReads = \"auto\";\n if (inputs._Long_Reads !== null) {\n if (inputs._Long_Reads) {\n longReads = \"true\";\n }\n else {\n longReads = \"false\";\n }\n }\n\n return ({\n Exclude_Intronic_Reads: excludeIntronicReads,\n Long_Reads: longReads,\n Custom_STAR_Params: inputs._Custom_STAR_Params,\n Custom_bwa_mem2_Params: inputs._Custom_bwa_mem2_Params\n });\n\n\n}", + "id": "#MiscSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "string", + "default": "Targeted", + "id": "#MultiplexingSettings.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tag_Names" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tags_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MultiplexingSettings.cwl/Sample_Tags_Version" + } + ], + "expression": "${\n var enumifiedSampleTagsVersion = null;\n if (inputs._Sample_Tags_Version) {\n var _Sample_Tags_Version = inputs._Sample_Tags_Version.toLowerCase();\n\n if (_Sample_Tags_Version.indexOf('human') >= 0 || _Sample_Tags_Version === 'hs')\n {\n enumifiedSampleTagsVersion = 'hs';\n }\n else if (_Sample_Tags_Version.indexOf('mouse') >= 0 || _Sample_Tags_Version === 'mm')\n {\n enumifiedSampleTagsVersion = 'mm';\n }\n else if (_Sample_Tags_Version.indexOf('flex') >= 0)\n {\n enumifiedSampleTagsVersion = 'flex';\n }\n else if (_Sample_Tags_Version.indexOf('nuclei') >= 0)\n {\n if (_Sample_Tags_Version.indexOf('atac') >= 0)\n {\n enumifiedSampleTagsVersion = 'nuclei_atac_only';\n }\n else\n {\n enumifiedSampleTagsVersion = 'nuclei_includes_mrna';\n }\n }\n else if (_Sample_Tags_Version === 'no multiplexing')\n {\n enumifiedSampleTagsVersion = null;\n }\n else\n {\n throw new Error(\"Cannot parse Sample Tag Version: \" + inputs._Sample_Tags_Version);\n }\n }\n var newTagNames = null;\n if (inputs._Sample_Tag_Names) {\n var listTagNames = inputs._Sample_Tag_Names\n var newTagNames = []\n for (var num in listTagNames) {\n var tag = listTagNames[num].replace(/[^A-Za-z0-9-+]/g,\"_\");\n newTagNames.push(tag);\n }\n } \n return ({\n Sample_Tag_Names: newTagNames,\n Sample_Tags_Version: enumifiedSampleTagsVersion\n });\n}", + "id": "#MultiplexingSettings.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "NameSettings sets the Run_Name variable that is used as a common prefix to name output files. If the user has specified a Run_Name, it is cleaned up or one is set based on the Bioproduct/ATAC fastq filenames.\n", + "hints": [], + "baseCommand": [ + "python", + "-c" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/ATAC_Fastqs" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/Bioproduct_Fastqs" + }, + { + "type": [ + "null", + "string" + ], + "default": "", + "id": "#NameSettings.cwl/_Run_Name" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "import sys\nfrom mist.apps import CheckFastqs\nCheckFastqs.write_run_name_json(sys.argv[1], sys.argv[2], sys.argv[3])\n", + "shellQuote": true + }, + { + "position": 3, + "valueFrom": "$(inputs._Run_Name)", + "shellQuote": true + }, + { + "position": 4, + "valueFrom": "$(inputs.Bioproduct_Fastqs.join(\",\"))", + "shellQuote": true + }, + { + "position": 5, + "valueFrom": "$(inputs.ATAC_Fastqs.join(\",\"))", + "shellQuote": true + } + ], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "run_base_name.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents)['Run_Base_Name'])" + }, + "id": "#NameSettings.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "outputBinding": { + "outputEval": "$(inputs._Run_Name)" + }, + "id": "#NameSettings.cwl/Run_Name" + } + ], + "id": "#NameSettings.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 16000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_peak_annotation.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#PeakAnnotation.cwl/Gtf" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks_bed_file" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#PeakAnnotation.cwl/Peaks_bed" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#PeakAnnotation.cwl/Run_Metadata" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.tsv.gz" + }, + "id": "#PeakAnnotation.cwl/Peak_Annotation_TSV" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#PeakAnnotation.cwl/output" + } + ], + "id": "#PeakAnnotation.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Expected_Cell_Count" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_RNA_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_ATAC_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Bioproduct_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": "int", + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Data; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Exact_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Expected_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Expected_Cell_Count" + }, + { + "type": "File", + "outputBinding": { + "glob": "putative_cell_settings.json" + }, + "id": "#PutativeCellSettings.cwl/PutativeCellSettings" + } + ], + "stdout": "putative_cell_settings.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var settings = inputs;\n var errorMessage = \"No error\";\n\n var cellCallingATACAlg = null;\n // the default cell calling algorithm for ATAC is basic\n if (inputs._Reads_ATAC_Present) {\n cellCallingATACAlg = \"Basic\";\n }\n // the user can choose the ATAC cell calling algorithm\n if (inputs._Cell_Calling_ATAC_Algorithm) {\n cellCallingATACAlg = inputs._Cell_Calling_ATAC_Algorithm;\n }\n\n var cellCallingBioproductAlg = null;\n // the default cell calling algorithm for bioproducts is basic\n if (inputs._Reads_RNA_Present) {\n cellCallingBioproductAlg = \"Basic\";\n }\n // the user can choose the bioproducts cell calling algorithm\n if (inputs._Cell_Calling_Bioproduct_Algorithm) {\n cellCallingBioproductAlg = inputs._Cell_Calling_Bioproduct_Algorithm;\n }\n\n // the default cell calling data depends on the data that is provided\n // the overall default is mRNA data\n var cellCallingDataInt = 0;\n // if mRNA and ATAC reads are present, then default to joint cell calling\n if (inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 3;\n }\n // if no WTA data is present, but ATAC data is, then default to ATAC\n else if (!inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 5;\n }\n\n // convert the Cell_Calling_Data from a string to an integer\n if (inputs._Cell_Calling_Data) {\n if (inputs._Cell_Calling_Data === \"mRNA\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA\" was selected but no mRNA Reads were provided.';\n } else {\n cellCallingDataInt = 0;\n }\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq\" was selected but no AbSeq Reads were provided.';\n }\n cellCallingDataInt = 1;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_AbSeq\" was selected but no mRNA/AbSeq Reads were provided.';\n }\n cellCallingDataInt = 2;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no mRNA Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 3;\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no AbSeq Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 4;\n }\n else if (inputs._Cell_Calling_Data === \"ATAC\") {\n if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 5;\n }\n }\n // check the exact cell count\n if (inputs._Exact_Cell_Count) {\n if (inputs._Exact_Cell_Count < 1) {\n errorMessage = \"Exact cell count must be an integer greater than 0, value received: \" + inputs._Exact_Cell_Count;\n }\n }\n // check if there is an error\n if (errorMessage != \"No error\") {\n // If there is an error, force CWL to show it:\n // - \"Cell_Calling_Data\" is a required output\n // - setting it to null will cause a CWL error\n // - the error message will be shown in the json\n cellCallingDataInt = null;\n }\n\n settings[\"Cell_Calling_ATAC_Algorithm\"] = cellCallingATACAlg;\n settings[\"Cell_Calling_Bioproduct_Algorithm\"] = cellCallingBioproductAlg;\n settings[\"Cell_Calling_Data\"] = cellCallingDataInt;\n settings[\"Expected_Cell_Count\"] = inputs._Expected_Cell_Count;\n settings[\"Exact_Cell_Count\"] = inputs._Exact_Cell_Count;\n settings[\"Error\"] = errorMessage;\n\n var settings_json = JSON.stringify(settings, null, 2);\n\n return settings_json;\n }\n" + } + ], + "id": "#PutativeCellSettings.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "CheckFastqs does several quality control routines including: ensuring that read pair file names are formatted correctly and contain a read pair mate; QualCLAlign stage of the Rain pipeline overlaps read pairs and then performs a series of filters and mappings to reduce valid reads into a single FastQ file to be fed into the aligner. The R2 reads are annotated with cell index and UMI information derived from the R1 read.\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if(inputs.Threads){ return inputs.Threads; } else{ return 8; } }", + "ramMin": 48000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "mist_run_qualclalign.py" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--alignment-compression-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Alignment_Compression_threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--assay" + }, + "id": "#QualCLAlign.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--bgzf-threads" + }, + "id": "#QualCLAlign.cwl/BGZF_Threads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--split-atac-bam" + }, + "id": "#QualCLAlign.cwl/Generate_Bam" + }, + { + "inputBinding": { + "prefix": "--index" + }, + "type": "Directory", + "id": "#QualCLAlign.cwl/Index" + }, + { + "inputBinding": { + "prefix": "--use-star-long" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Long_Reads" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#QualCLAlign.cwl/Predefined_ATAC_Peaks" + }, + { + "inputBinding": { + "prefix": "--reader-annotation-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Reader_Annotation_Threads" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--reads", + "itemSeparator": "," + }, + "id": "#QualCLAlign.cwl/Reads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--run-name" + }, + "id": "#QualCLAlign.cwl/Run_Base_Name" + }, + { + "inputBinding": { + "prefix": "--star-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/STAR_Params" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--threads" + }, + "id": "#QualCLAlign.cwl/Threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#QualCLAlign.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--write-filtered-read-pairs" + }, + "id": "#QualCLAlign.cwl/Write_Filtered_Reads" + }, + { + "inputBinding": { + "prefix": "--bwa-mem2-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/bwa_mem2_Params" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*.bam" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/BAMFiles" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#QualCLAlign.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#QualCLAlign.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).BeadVersion)\n" + }, + "id": "#QualCLAlign.cwl/Bead_Version" + }, + { + "outputBinding": { + "glob": "*.failedReads.csv.gz" + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#QualCLAlign.cwl/Failed_Reads_CSVs" + }, + { + "outputBinding": { + "glob": "fastq_read_pairs.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/Fastq_read_pairs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Fragments" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Fragments_Index" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "${\n var obj = JSON.parse(self[0].contents);\n var libraries = [];\n var beadLibs = obj.BeadVersion\n for (var i in beadLibs){\n if (libraries.indexOf(beadLibs[i][\"Library\"]) == -1){ \n libraries.push(beadLibs[i][\"Library\"]);\n }\n }\n libraries.sort();\n return libraries\n}\n" + }, + "id": "#QualCLAlign.cwl/Libraries" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/Logs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Peaks_Index" + }, + { + "outputBinding": { + "glob": "*ReadQualityMetrics.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/QualCLAlignMetrics" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "outputEval": "${ \n var reads = []; \n var files = inputs.Reads\n for (var i in files){\n reads.push(files[i][\"basename\"]);\n }\n reads.sort();\n return(reads)\n}\n" + }, + "id": "#QualCLAlign.cwl/ReadsList" + }, + { + "type": "string", + "outputBinding": { + "glob": "genome_size.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return \"0\"; } return JSON.parse(self[0].contents); }" + }, + "id": "#QualCLAlign.cwl/Reference_Genome_Size" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Transposase_Sites" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Transposase_Sites_Index" + }, + { + "outputBinding": { + "glob": "*_ATAC_UnifiedMetrics.json" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/UnifiedMetrics" + } + ], + "id": "#QualCLAlign.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_valid_reads" + }, + { + "valueFrom": "BCR", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_splits", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG" + } + ], + "id": "#VDJ_Analyze_Reads_IG.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_valid_reads" + }, + { + "valueFrom": "TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_splits", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR" + } + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 1, + "ramMin": 1024 + } + ], + "baseCommand": [ + "AssembleAndAnnotate.sh" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/RSEC_Reads_Fastq" + }, + { + "type": "string", + "inputBinding": { + "position": 2 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/Read_Limit" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "position": 3 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_pruned.csv.gz" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/PyirCall" + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/igCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/tcrCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl" + }, + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "baseCommand": [ + "mist_vdj_compile_results.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--seq-metrics", + "position": 10 + }, + "id": "#VDJ_Compile_Results.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version", + "position": 2 + }, + "id": "#VDJ_Compile_Results.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "position": 0, + "itemSeparator": ",", + "prefix": "--cell-type-mapping-fp" + }, + "id": "#VDJ_Compile_Results.cwl/cellTypeMapping" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "inputBinding": { + "position": 4, + "prefix": "--ignore", + "itemSeparator": "," + }, + "id": "#VDJ_Compile_Results.cwl/chainsToIgnore" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 8, + "prefix": "--e-value-for-j" + }, + "id": "#VDJ_Compile_Results.cwl/evalueJgene" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 7, + "prefix": "--e-value-for-v" + }, + "id": "#VDJ_Compile_Results.cwl/evalueVgene" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 5 + }, + "id": "#VDJ_Compile_Results.cwl/igCalls" + }, + { + "type": "File", + "inputBinding": { + "position": 9, + "prefix": "--metadata-fp" + }, + "id": "#VDJ_Compile_Results.cwl/metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--putative-cells-json-fp", + "position": 3 + }, + "id": "#VDJ_Compile_Results.cwl/putativeCells" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 6 + }, + "id": "#VDJ_Compile_Results.cwl/tcrCalls" + } + ], + "outputs": [ + { + "doc": "VDJ data per cell, with distribution based error correction", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatable" + }, + { + "doc": "VDJ data per cell, including non-putative cells, no error correction applied", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell_uncorrected.csv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatableUncorrected" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_DBEC_images.tar.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDbecFilterImages" + }, + { + "doc": "AIRR compatible output that only reports the Dominant contigs, counts are DBEC corrected", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_Dominant_Contigs_AIRR.tsv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDominantContigsAIRR" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsCsv" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.json" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsJson" + }, + { + "type": [ + "null", + "File" + ], + "doc": "AIRR compatible output that reports all the congits, counts are not DBEC corrected", + "outputBinding": { + "glob": "*_VDJ_Unfiltered_Contigs_AIRR.tsv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjUnfilteredContigsAIRR" + } + ], + "id": "#VDJ_Compile_Results.cwl" + }, + { + "doc": "VDJ_GatherCalls collect the outputs from the multi-processed VDJ step into one file.\n", + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/theCalls" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls", + "id": "#VDJ_GatherCalls.cwl/gatheredCalls" + } + ], + "steps": [ + { + "in": [ + { + "source": "#VDJ_GatherCalls.cwl/theCalls", + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/theCalls" + } + ], + "out": [ + "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls" + ], + "run": { + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR", + "cwlVersion": "v1.2", + "class": "CommandLineTool", + "hints": [], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/theCalls" + } + ], + "arguments": [ + { + "shellQuote": false, + "valueFrom": "${\n if (!inputs.theCalls[0] ) {\n return (\"echo \\\"No outputs from PyIR detected in VDJ_GatherCalls\\\"\")\n }\n var inputFiles = \"\"\n if (!inputs.theCalls[0].path.split(\"_PrunePyIR\")[1]){\n inputFiles = \"zcat\"\n for (var i = 0; i < inputs.theCalls.length; i++) {\n inputFiles += \" \" + inputs.theCalls[i].path\n }\n inputFiles += \" | \"\n } else {\n inputFiles = \"zcat \" + inputs.theCalls[0].path.split(\"VDJ\")[0] + \"*\" + inputs.theCalls[0].path.split(\"_PrunePyIR\")[1].split(\"_Number_\")[0] + \"_Number_*.csv.gz | \"\n }\n var outputFileName = \"\\\"gzip > \" + inputs.theCalls[0].nameroot.split(\"_Number_\")[0] + \"_constant_region_called_pruned.csv.gz\" + \"\\\"\"\n var awkCommand = \"awk \\'NR==1{F=$1;print | \" + outputFileName + \" } $1!=F { print | \" + outputFileName + \" }\\' \"\n var outputCommand = inputFiles + awkCommand\n return (outputCommand)\n}" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_constant_region_called_pruned.csv.gz", + "outputEval": "${\n if (self.size == 0) {\n throw(\"No outputs from PyIR detected in VDJ_GatherCalls!\");\n } else {\n return(self);\n }\n}" + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/gatheredCalls" + } + ] + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls" + } + ], + "id": "#VDJ_GatherCalls.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq" + }, + { + "type": "int", + "id": "#VDJ_Preprocess_Reads.cwl/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/vdj_type" + } + ], + "outputs": [ + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq", + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Preprocess_Reads.cwl/RSEC_Reads_Fastq" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_cores" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_splits" + } + ], + "requirements": [ + { + "envDef": [ + { + "envValue": "8", + "envName": "CORES_ALLOCATED_PER_CWL_PROCESS" + } + ], + "class": "EnvVarRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "steps": [ + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads", + "requirements": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_RSEC_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/Valid_Reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_splits" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_valid_reads" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_Trim_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads_Fastq" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Trim_Report" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/Maximum_Threads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_valid_reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/vdj_type", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/vdj_type" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores" + ], + "run": { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits", + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/Maximum_Threads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/vdj_type" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_cores" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_splits" + } + ], + "expression": "${\n var num_splits = 64;\n var max_threads = parseInt(inputs.Maximum_Threads);\n if (!isNaN(max_threads)) {\n num_splits = parseInt(Math.max(max_threads, 8) * 0.7);\n }\n return ({\"num_splits\": num_splits, \"num_cores\": num_splits});\n}" + } + } + ], + "id": "#VDJ_Preprocess_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "mist_vdj_rsec_reads.py", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_RSEC_Reads.cwl/VDJ_Version" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--vdj-valid-reads", + "itemSeparator": "," + }, + "id": "#VDJ_RSEC_Reads.cwl/Valid_Reads" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--num-splits" + }, + "id": "#VDJ_RSEC_Reads.cwl/num_splits" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_RSEC_Reads.cwl/num_valid_reads" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*RSEC_Reads_Fastq_*.tar.gz" + }, + "id": "#VDJ_RSEC_Reads.cwl/RSEC_Reads_Fastq" + } + ], + "id": "#VDJ_RSEC_Reads.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#VDJ_Settings.cwl/_VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_VGene_Evalue" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Settings.cwl/VDJ_Version" + } + ], + "expression": "${\n var vdjVersion = null;\n if (!inputs._VDJ_Version) {\n vdjVersion = null;}\n else {\n var _VDJ_Version = inputs._VDJ_Version.toLowerCase();\n if (_VDJ_Version === \"human\" || _VDJ_Version === \"hs\" || _VDJ_Version === \"human vdj - bcr and tcr\") {\n vdjVersion = \"human\";\n } else if (_VDJ_Version === \"humanbcr\" || _VDJ_Version === \"human vdj - bcr only\") {\n vdjVersion = \"humanBCR\";\n } else if (_VDJ_Version === \"humantcr\" || _VDJ_Version === \"human vdj - tcr only\") {\n vdjVersion = \"humanTCR\";\n } else if (_VDJ_Version === \"mouse\" || _VDJ_Version === \"mm\" || _VDJ_Version === \"mouse vdj - bcr and tcr\") {\n vdjVersion = \"mouse\";\n } else if (_VDJ_Version === \"mousebcr\" || _VDJ_Version === \"mouse vdj - bcr only\") {\n vdjVersion = \"mouseBCR\";\n } else if (_VDJ_Version === \"mousetcr\" || _VDJ_Version === \"mouse vdj - tcr only\") {\n vdjVersion = \"mouseTCR\";\n } else {\n vdjVersion = inputs._VDJ_Version;\n }\n }\n\n return ({\n VDJ_Version: vdjVersion,\n })\n}", + "id": "#VDJ_Settings.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "VDJ_Trim_Reads.sh", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Trim_Reads.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads_Fastq" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cutadapt.log" + }, + "id": "#VDJ_Trim_Reads.cwl/Trim_Report" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*vdjtxt.gz" + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads" + } + ], + "id": "#VDJ_Trim_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_print_version.py" + ], + "stdout": "output.txt", + "inputs": [], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "output.txt", + "loadContents": true, + "outputEval": "$(self[0].contents)" + }, + "id": "#Version.cwl/version" + } + ], + "id": "#Version.cwl" + } + ], + "cwlVersion": "v1.2" +} diff --git a/src/mapping/bd_rhapsody/script.py b/src/mapping/bd_rhapsody/script.py new file mode 100644 index 00000000..92b8f3ca --- /dev/null +++ b/src/mapping/bd_rhapsody/script.py @@ -0,0 +1,320 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil +import glob + +## VIASH START +par = { + "reads": [ + "resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R1_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R2_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R1_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R2_001_subset.fastq.gz", + ], + "reads_atac": None, + "reference_archive": "reference_gencodev41_chr1.tar.gz", + "targeted_reference": [], + "abseq_reference": [ + "resources_test/bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta" + ], + "supplemental_reference": [], + "cell_calling_data": "mRNA", + "cell_calling_bioproduct_algorithm": None, + "cell_calling_atac_algorithm": None, + "exact_cell_count": 4900, + "expected_cell_count": None, + "exclude_intronic_reads": None, + "sample_tags_version": None, + "tag_names": [], + "vdj_version": None, + "predefined_atac_peaks": None, + "run_name": "sample", + "generate_bam": False, + "alignment_star_params": None, + "alignment_bwa_mem2_params": None, + "parallel": True, + "timestamps": False, + "dryrun": False, + "output_dir": "output_large_op", + "output_seurat": "seurat.rds", + "output_mudata": "mudata.h5mu", + "metrics_summary": "metrics_summary.csv", + "pipeline_report": "pipeline_report.html", + "rsec_mols_per_cell": None, + "dbec_mols_per_cell": None, + "rsec_mols_per_cell_unfiltered": None, + "bam": None, + "bam_index": None, + "bioproduct_stats": None, + "dimred_tsne": None, + "dimred_umap": None, + "immune_cell_classification": None, + "sample_tag_metrics": None, + "sample_tag_calls": None, + "sample_tag_counts": None, + "sample_tag_counts_unassigned": None, + "vdj_metrics": None, + "vdj_per_cell": None, + "vdj_per_cell_uncorrected": None, + "vdj_dominant_contigs": None, + "vdj_unfiltered_contigs": None, + "atac_metrics": None, + "atac_metrics_json": None, + "atac_fragments": None, + "atac_fragments_index": None, + "atac_transposase_sites": None, + "atac_transposase_sites_index": None, + "atac_peaks": None, + "atac_peaks_index": None, + "atac_peak_annotation": None, + "atac_cell_by_peak": None, + "atac_cell_by_peak_unfiltered": None, + "atac_bam": None, + "atac_bam_index": None, + "protein_aggregates_experimental": None, + "long_reads": None, + "custom_star_params": None, + "custom_bwa_mem2_params": None, + "abseq_umi": None, + "target_analysis": None, + "vdj_jgene_evalue": None, + "vdj_vgene_evalue": None, + "write_filtered_reads": None, +} +meta = { + "config": "target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/mapping/bd_rhapsody"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None, +} +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + + +def process_params(par: dict[str, Any], config, temp_dir: str) -> str: + # check input parameters + assert par["reads"] or par["reads_atac"], ( + "Pass at least one set of inputs to --reads or --reads_atac." + ) + + # output to temp dir if output_dir was not passed + if not par["output_dir"]: + par["output_dir"] = os.path.join(temp_dir, "output") + + # checking sample prefix + if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): + print( + "--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", + flush=True, + ) + par["run_name"] = re.sub("[^A-Za-z0-9\\-]", "-", par["run_name"]) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], config) -> str: + content_list = [ + strip_margin("""\ + |#!/usr/bin/env cwl-runner + | + |cwl:tool: rhapsody + |""") + ] + + for argument in config["arguments"]: + arg_info = argument.get("info") or {} + config_key = arg_info.get("config_key") + if par[argument["clean_name"]] and config_key: + if argument["type"] == "file": + str = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(par[argument["clean_name"]], list): + for file in par[argument["clean_name"]]: + str += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\ + | class: File + | location: "{par[argument["clean_name"]]}" + |""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\ + |{config_key}: {par[argument["clean_name"]]} + |""") + ) + + ## Write config to file + return "".join(content_list) + + +def generate_config_file( + par: dict[str, Any], config: dict[str, Any], temp_dir: str +) -> str: + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, config) + with open(config_file, "w") as f: + f.write(config_content) + return config_file + + +def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: + # create cwl file (if need be) + orig_cwl_file = os.path.join( + meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl" + ) + + # Inject computational requirements into pipeline + if meta["memory_mb"] or meta["cpus"]: + cwl_file = os.path.join(dir, "pipeline.cwl") + + # Read in the file + with open(orig_cwl_file, "r") as file: + cwl_data = file.read() + + # Inject computational requirements into pipeline + if meta["memory_mb"]: + memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS + cwl_data = re.sub( + '"ramMin": [^\n]*[^,](,?)\n', f'"ramMin": {memory}\\1\n', cwl_data + ) + if meta["cpus"]: + cwl_data = re.sub( + '"coresMin": [^\n]*[^,](,?)\n', + f'"coresMin": {meta["cpus"]}\\1\n', + cwl_data, + ) + + # Write the file out again + with open(cwl_file, "w") as file: + file.write(cwl_data) + else: + cwl_file = orig_cwl_file + + return cwl_file + + +def copy_outputs(par: dict[str, Any], config: dict[str, Any]): + for arg in config["arguments"]: + par_value = par[arg["clean_name"]] + if par_value and arg["type"] == "file" and arg["direction"] == "output": + # example template: '[sample_name]_(assay)_cell_type_experimental.csv' + template = (arg.get("info") or {}).get("template") + if template: + template_glob = ( + template.replace("sample", par["run_name"]) + .replace("assay", "*") + .replace("number", "*") + ) + files = glob.glob(os.path.join(par["output_dir"], template_glob)) + if len(files) == 0 and arg["required"]: + raise ValueError( + f"Expected output file '{template_glob}' not found." + ) + elif len(files) > 1 and not arg["multiple"]: + raise ValueError( + f"Expected single output file '{template_glob}', but found multiple." + ) + + if not arg["multiple"]: + try: + shutil.copy(files[0], par_value) + print(f"Copied {files[0]} to {par_value}") + except IndexError: + print(f"Unable to copy {template_glob} to {par_value}") + else: + # replace '*' in par_value with index + for i, file in enumerate(files): + shutil.copy(file, par_value.replace("*", str(i))) + + +def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config, temp_dir) + + ## Process parameters + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + par["output_dir"], + ] + + if par["parallel"]: + cmd.append("--parallel") + + if par["timestamps"]: + cmd.append("--timestamps") + + # Create cwl file (if need be) + cwl_file = generate_cwl_file(meta, temp_dir) + cmd.append(cwl_file) + + # Create params file + config_file = generate_config_file(par, config, temp_dir) + cmd.append(config_file) + + # keep environment variables but set TMPDIR to temp_dir + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + # Create output dir if not exists + if not os.path.exists(par["output_dir"]): + os.makedirs(par["output_dir"]) + + # Run command + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + # Copy outputs + copy_outputs(par, config) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"] + ) as temp_dir: + main(par, meta, temp_dir) diff --git a/src/mapping/bd_rhapsody/test.py b/src/mapping/bd_rhapsody/test.py new file mode 100644 index 00000000..385000f6 --- /dev/null +++ b/src/mapping/bd_rhapsody/test.py @@ -0,0 +1,81 @@ +import subprocess +from pathlib import Path +import mudata as md + +## VIASH START +meta = { + "name": "bd_rhapsody", + "executable": "target/docker/mapping/bd_rhapsody/bd_rhapsody", + "resources_dir": "src/mapping/bd_rhapsody", + "cpus": 8, + "memory_mb": 4096, +} + +# bdabseq_panel_fa = "resources_test/bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta" +# reference_file = "resources_test/reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz" +# abc_reads = "resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R1_001_subset.fastq.gz;resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R2_001_subset.fastq.gz" +# wta_reads = "resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R1_001_subset.fastq.gz;resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R2_001_subset.fastq.gz" +## VIASH END + +wta_reads = f"{meta['resources_dir']}/raw/12WTA_S1_L432_R1_001_subset.fastq.gz;{meta['resources_dir']}/raw/12WTA_S1_L432_R2_001_subset.fastq.gz" +abc_reads = f"{meta['resources_dir']}/raw/12ABC_S1_L432_R1_001_subset.fastq.gz;{meta['resources_dir']}/raw/12ABC_S1_L432_R2_001_subset.fastq.gz" +reference_file = f"{meta['resources_dir']}/reference_bd_rhapsody.tar.gz" +bdabseq_panel_fa = f"{meta['resources_dir']}/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta" + +# Run executable +print(f">> Run {meta['name']}", flush=True) +output_dir = Path("output") +subprocess.run( + [ + meta["executable"], + f"--reads={wta_reads}", + f"--reads={abc_reads}", + f"--reference_archive={reference_file}", + f"--abseq_reference={bdabseq_panel_fa}", + "--output_dir=output", + "--exact_cell_count=4900", + "---cpus=2", + "---memory=10gb", + "--output_seurat=seurat.rds", + "--output_mudata=mudata.h5mu", + "--metrics_summary=metrics_summary.csv", + "--pipeline_report=pipeline_report.html", + ] +) + + +# Check if output exists +print(">> Check if output exists", flush=True) +assert (output_dir / "sample_Bioproduct_Stats.csv").exists() +assert (output_dir / "sample_Metrics_Summary.csv").exists() +assert (output_dir / "sample_Pipeline_Report.html").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_MEX.zip").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip").exists() +assert (output_dir / "sample_Seurat.rds").exists() +assert (output_dir / "sample.h5mu").exists() + +# check individual outputs +assert Path("seurat.rds").exists() +assert Path("mudata.h5mu").exists() +assert Path("metrics_summary.csv").exists() +assert Path("pipeline_report.html").exists() + +print(">> Check contents of output", flush=True) +data = md.read_h5mu("mudata.h5mu") + +assert data.n_obs == 4900, "Number of cells is incorrect" +assert "rna" in data.mod, "RNA data is missing" +assert "prot" in data.mod, "RNA data is missing" + + +data_rna = data.mod["rna"] +assert data_rna.n_vars > 1000, "Number of genes is incorrect" +assert data_rna.X.sum(axis=1).min() > 0, "Number of reads per cell is incorrect" +assert data_rna.var.Raw_Reads.sum() > 100000, "Number of reads is incorrect" + +# TODO: add VDJ, SMK, ATAC, and targeted RNA to test + + +######################################################################################### + +print("> Test successful", flush=True) diff --git a/src/mapping/cellranger_atac_count/config.vsh.yaml b/src/mapping/cellranger_atac_count/config.vsh.yaml new file mode 100644 index 00000000..a5d8ee59 --- /dev/null +++ b/src/mapping/cellranger_atac_count/config.vsh.yaml @@ -0,0 +1,78 @@ +name: cellranger_atac_count +namespace: mapping +scope: "public" +description: Align fastq files using Cell Ranger ATAC count. +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + required: true + multiple: true + example: [ "sample_S1_L001_R1_001.fastq.gz", "sample_S1_L001_R2_001.fastq.gz" ] + description: The fastq.gz files to align. Can also be a single directory containing fastq.gz files. + - type: file + name: --reference + required: true + description: The path to Cell Ranger reference tar.gz file. Can also be a directory. + example: reference.tar.gz + - name: Outputs + arguments: + - type: file + name: --output + direction: output + description: The folder to store the alignment results. + example: "/path/to/output" + required: true + - name: Arguments + arguments: + - type: string + name: --description + description: Sample description to embed in output files + default: "" + - type: integer + name: --force_cells + description: "Define the top N barcodes with the most fragments overlapping peaks as cells and override the cell calling algorithm. N must be a positive integer <= 20,000. Use this option if the number of cells estimated by Cell Ranger ATAC is not consistent with the barcode rank plot" + max: 20000 + - type: file + name: --peaks + description: "Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file. The supplied peaks file must be sorted by position and not contain overlapping peaks; comment lines beginning with # are allowed" + - type: string + name: --dim_reduce + description: "Dimensionality reduction mode for clustering" + choices: [ lsa, pca, plsa ] + default: lsa + - type: double + name: --subsample_rate + description: "Downsample to preserve this fraction of reads" + example: 0.1 + - type: string + multiple: true + name: --lanes + description: bcl2fastq option. Semicolon-delimited series of lanes to demultiplex. Use this if you have a sample sheet for an entire flow cell but only want to generate a few lanes for further 10x Genomics analysis. + example: 1,3 +resources: + - type: bash_script + path: script.sh +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_atac_tiny_bcl + - path: /src/utils/setup_logger.py + - path: /resources_test/reference_gencodev41_chr1 +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger_atac:2.1 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update \ + && apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/cellranger_atac_count/script.sh b/src/mapping/cellranger_atac_count/script.sh new file mode 100644 index 00000000..19719b4c --- /dev/null +++ b/src/mapping/cellranger_atac_count/script.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='resources_test/cellranger_atac_tiny_bcl/fastqs/' +par_reference='resources_test/reference_gencodev41_chr1/' +par_output='resources_test/cellranger_atac_tiny_bcl/bam' +## VIASH END + +# just to make sure paths are absolute +par_reference=`realpath $par_reference` +par_output=`realpath $par_output` + +echo "Creating temporary directory" +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +echo "Locating fastqs" +fastq_dir="$tmpdir/fastqs" +mkdir -p "$fastq_dir" +IFS=";" +for var in $par_input; do + unset IFS + abs_path=`realpath $var` + if [ -d "$abs_path" ]; then + find "$abs_path" -name *.fastq.gz -exec ln -s {} "$fastq_dir" \; + else + ln -s "$abs_path" "$fastq_dir" + fi +done + +echo "fastq_dir content: $(ls $fastq_dir)" + +echo "Processing reference" +# process reference +if file $par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="$tmpdir/fastqs" + mkdir -p "$reference_dir" + tar -xvf "$par_reference" -C "$reference_dir" --strip-components=1 + par_reference="$reference_dir" +fi + +# cd into tempdir +cd "$tmpdir" + +if [ ! -z "$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=`python -c "print(int('$meta_memory_gb') - 2)"` +fi + +echo "Running cellranger-atac count" + +id=myoutput +cellranger-atac count \ + --id "$id" \ + --fastqs "$fastq_dir" \ + --reference "$par_reference" \ + --dim-reduce "$par_dim_reduce" \ + --description "$par_description" \ + ${par_lanes:+--lanes=${par_lanes[*]}} \ + ${par_force_cells:+--force-cells=$par_force_cells} \ + ${par_subsample_rate:+--subsample-rate=$par_subsample_rate} \ + ${memory_gb:+--localmem=$memory_gb} \ + ${meta_cpus:+--localcores=$meta_cpus} \ + ${par_lanes:+--lanes=${par_lanes[*]}} + +echo "Copying output" +if [ -d "$id/outs/" ]; then + if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" + fi + mv "$id/outs/"* "$par_output" +fi diff --git a/src/mapping/cellranger_atac_count/test.py b/src/mapping/cellranger_atac_count/test.py new file mode 100644 index 00000000..71c9eb23 --- /dev/null +++ b/src/mapping/cellranger_atac_count/test.py @@ -0,0 +1,75 @@ +import subprocess +from os import path +import sys +from itertools import chain + +## VIASH START +meta = {"name": "cellranger_atac_count", "resources_dir": "resources_test"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("> Running command with folder") +input = ( + meta["resources_dir"] + "/cellranger_atac_tiny_bcl/fastqs/HJN3KBCX2/test_sample/" +) +reference = ( + meta["resources_dir"] + "/reference_gencodev41_chr1/reference_cellranger.tar.gz" +) +output = "test_output" +cmd_pars = [ + meta["executable"], + "--input", + input, + "--reference", + reference, + "--output", + output, +] +if meta.get("cpus"): + cmd_pars.extend(["---cpus", str(meta["cpus"])]) +if meta.get("memory_gb"): + cmd_pars.extend(["---memory", f"{meta['memory_gb']}gb"]) +try: + out = subprocess.check_output(cmd_pars).decode("utf-8") +except subprocess.CalledProcessError as e: + logger.error(e.output) + raise e +logger.info("> Check if file exists") +assert path.exists(output + "/filtered_peak_bc_matrix.h5"), "No output was created." +assert path.exists(output + "/fragments.tsv.gz"), "No fragments file was created." + + +logger.info("> Running command with fastq files") +# test_sample_S1_L001_R2_001.fastq.gz test_sample_S1_L001_R1_001.fastq.gz test_sample_S1_L001_R3_001.fastq.gz +input_files = [ + input + "test_sample_S1_L001_I1_001.fastq.gz", + input + "test_sample_S1_L001_R1_001.fastq.gz", + input + "test_sample_S1_L001_R2_001.fastq.gz", + input + "test_sample_S1_L001_R3_001.fastq.gz", +] +output = "test_output2" + + +cmd_pars = [ + meta["executable"], + *chain.from_iterable([("--input", input_file) for input_file in input_files]), + "--reference", + reference, + "--output", + output, +] +if meta.get("cpus"): + cmd_pars.extend(["---cpus", str(meta["cpus"])]) +if meta.get("memory_gb"): + cmd_pars.extend(["---memory", f"{meta['memory_gb']}gb"]) +out = subprocess.check_output(cmd_pars).decode("utf-8") + +logger.info("> Check if file exists") +assert path.exists(output + "/filtered_peak_bc_matrix.h5"), "No output was created." +assert path.exists(output + "/fragments.tsv.gz"), "No fragments file was created." + +logger.info("> Completed Successfully!") diff --git a/src/mapping/cellranger_count/config.vsh.yaml b/src/mapping/cellranger_count/config.vsh.yaml new file mode 100644 index 00000000..ef2a3eaf --- /dev/null +++ b/src/mapping/cellranger_count/config.vsh.yaml @@ -0,0 +1,155 @@ +name: cellranger_count +namespace: mapping +scope: "public" +description: Align fastq files using Cell Ranger count. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/samuel_d_souza.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + required: true + multiple: true + example: [ "sample_S1_L001_R1_001.fastq.gz", "sample_S1_L001_R2_001.fastq.gz" ] + description: The fastq.gz files to align. Can also be a single directory containing fastq.gz files. + - type: file + name: --reference + required: true + description: The path to Cell Ranger reference tar.gz file. Can also be a directory. + example: reference.tar.gz + - type: file + name: "--feature_reference" + required: false + description: | + Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes + - name: Outputs + arguments: + - type: file + name: --output + direction: output + description: The folder to store the alignment results. + example: "/path/to/output" + required: true + - name: Arguments + arguments: + - type: integer + name: --expect_cells + example: 3000 + description: "Expected number of recovered cells, used as input to cell calling algorithm." + + - type: integer + name: "--force_cells" + example: 3000 + description: "Force pipeline to use this number of cells, bypassing cell calling algorithm." + + - type: string + name: --chemistry + default: auto + description: | + Assay configuration. + - auto: autodetect mode + - threeprime: Single Cell 3' + - fiveprime: Single Cell 5' + - SC3Pv1: Single Cell 3' v1 + NOTE: this mode cannot be auto-detected. It must be set explicitly with this option. + - SC3Pv2: Single Cell 3' v2 + - SC3Pv3: Single Cell 3' v3 + - SC3Pv4: Single Cell 3' v4 + - SC3Pv3LT: Single Cell 3' v3 LT + - SC3Pv3HT: Single Cell 3' v3 HT + - SC5P-R2-v3: Single Cell 5', paired-end/R2-only + - SC5P-PE-v3: Single Cell 5' paired-end v3 (GEM-X) + - SC5P-PE: Single Cell 5' paired-end + - SC5P-R2: Single Cell 5' R2-only + - SC-FB: Single Cell Antibody-only 3' v2 or 5' + - ARC-v1: for analyzing the Gene Expression portion of Multiome data. + NOTE: when the pipeline auto-detects ARC-v1 chemistry, an error is triggered. + See https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information. + choices: [ auto, threeprime, fiveprime, SC3Pv1, SC3Pv2, SC3Pv3, SC3Pv4, SC3Pv3LT, SC3Pv3HT, SC5P-PE-v3, SC5P-PE, SC5P-R2, SC5P-R2-v3, SC-FB, ARC-v1 ] + + - type: file + name: "--tenx_cloud_token_path" + description: The 10x Cloud Analysis user token used to enable cell annotation. + + - type: string + name: "--cell_annotation_model" + description: | + "Cell annotation model to use. If auto, uses the default model for the species. + If not given, does not run cell annotation." + choices: ["auto", "human_pca_v1_beta", "mouse_pca_v1_beta"] + + - type: boolean + name: "--secondary_analysis" + default: false + description: Whether or not to run the secondary analysis e.g. clustering. + + - type: boolean + name: "--generate_bam" + default: true + description: Whether to generate a BAM file. + + - type: boolean + name: "--include_introns" + default: true + description: Include intronic reads in count. + + - type: integer + name: --r1_length + description: "Hard trim the input Read 1 to this length before analysis" + required: false + + - type: integer + name: "--r2_length" + required: false + description: "Hard trim the input Read 2 to this length before analysis" + + - type: integer + multiple: true + name: --lanes + required: false + description: Only use FASTQs from selected lanes. + example: [1,2,3] + + - name: "--library_compatibility_check" + type: boolean + default: true + description: | + Whether to check for barcode compatibility between libraries. + + - name: "--min_crispr_umi" + type: integer + min: 1 + description: | + Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection. + If a lower or higher sensitivity is desired for detection, this value can be customized + according to specific experimental needs. Applicable only to datasets that include a + CRISPR Guide Capture library. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger:9.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/cellranger_count/script.sh b/src/mapping/cellranger_count/script.sh new file mode 100644 index 00000000..81da0418 --- /dev/null +++ b/src/mapping/cellranger_count/script.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/' +par_reference='resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/' +par_output='resources_test/cellranger_tiny_fastq/bam' +par_chemistry="auto" +par_expect_cells="3000" +par_secondary_analysis="false" +## VIASH END + +# just to make sure paths are absolute +par_reference=`realpath $par_reference` +par_output=`realpath $par_output` + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +fastq_dir="$tmpdir/fastqs" +mkdir -p "$fastq_dir" +IFS=";" +for var in $par_input; do + unset IFS + abs_path=`realpath $var` + if [ -d "$abs_path" ]; then + find "$abs_path" -name *.fastq.gz -exec ln -s {} "$fastq_dir" \; + else + ln -s "$abs_path" "$fastq_dir" + fi +done + +# process reference +if file $par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="$tmpdir/fastqs" + mkdir -p "$reference_dir" + tar -xvf "$par_reference" -C "$reference_dir" --strip-components=1 + par_reference="$reference_dir" +fi + +# cd into tempdir +cd "$tmpdir" + +no_secondary_analysis="" +if [ "$par_secondary_analysis" == "false" ]; then + no_secondary_analysis="true" +fi + +IFS="," +id=myoutput +cellranger count \ + --id="$id" \ + --fastqs="$fastq_dir" \ + --transcriptome="$par_reference" \ + --include-introns="$par_include_introns" \ + ${par_feature_reference:+--feature-ref=$par_feature_reference} \ + ${par_cell_annotation_model:+--cell-annotation-model=$par_cell_annotation_model} \ + ${par_tenx_cloud_token_path:+--tenx-cloud-token-path=$par_tenx_cloud_token_path} \ + ${meta_cpus:+--localcores=$meta_cpus} \ + ${meta_memory_gb:+--localmem=$((meta_memory_gb-2))} \ + ${par_expect_cells:+--expect-cells=$par_expect_cells} \ + ${par_force_cells:+--force-cells=$par_force_cells} \ + ${par_chemistry:+--chemistry="$par_chemistry"} \ + ${par_generate_bam:+--create-bam=$par_generate_bam} \ + ${no_secondary_analysis:+--nosecondary} \ + ${par_r1_length:+--r1-length=$par_r1_length} \ + ${par_r2_length:+--r2-length=$par_r2_length} \ + ${par_lanes:+--lanes=${par_lanes[*]}} \ + ${par_library_compatibility_check:+--check-library-compatibility=$par_library_compatibility_check}\ + --disable-ui +unset IFS + +echo "Copying output" +if [ -d "$id/outs/" ]; then + if [ ! -d "$par_output" ]; then + mkdir -p "$par_output" + fi + mv "$id/outs/"* "$par_output" +fi diff --git a/src/mapping/cellranger_count/test.py b/src/mapping/cellranger_count/test.py new file mode 100644 index 00000000..e902e072 --- /dev/null +++ b/src/mapping/cellranger_count/test.py @@ -0,0 +1,161 @@ +import sys +import pytest +from pathlib import Path + +## VIASH START +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} +## VIASH END + +input = Path(meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_fastq/") +reference = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/" + + +def test_cellranger_count_with_folder(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input, + "--reference", + reference, + "--output", + output, + "--lanes", + "1", + ] + ) + + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +def test_cellranger_count_with_fastq_files(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +@pytest.mark.parametrize("chemistry", ["auto", "SC3Pv2"]) +def test_cellranger_chemistry(run_component, random_path, chemistry): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--chemistry", + chemistry, + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +def test_cellranger_no_bam(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--generate_bam", + "false", + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +def test_cellranger_no_secondary_analysis(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--secondary_analysis", + "false", + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +def test_cellranger_exclude_introns(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--include_introns", + "false", + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +def test_cellranger_trim_reads(run_component, random_path): + output = random_path() + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--r1_length", + "100", + "--r2_length", + "100", + ] + ) + assert (output / "filtered_feature_bc_matrix.h5").is_file(), ( + "No output was created." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/mapping/cellranger_count_split/config.vsh.yaml b/src/mapping/cellranger_count_split/config.vsh.yaml new file mode 100644 index 00000000..1db809f6 --- /dev/null +++ b/src/mapping/cellranger_count_split/config.vsh.yaml @@ -0,0 +1,74 @@ +name: cellranger_count_split +namespace: mapping +scope: "public" +description: Split 10x Cell Ranger output directory into separate output fields. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/samuel_d_souza.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + - name: --input + type: file + required: true + example: input_dir + description: Output directory from a Cell Ranger count run. + - name: --filtered_h5 + description: | + Output path for the h5 file storing the filtered counts. + type: file + direction: output + required: false + example: filtered_feature_bc_matrix.h5 + - name: --metrics_summary + description: | + Where to store the 'metrics_summary' CSV file. + type: file + direction: output + required: false + example: metrics_summary.csv + - name: --molecule_info + description: | + Where to store Cell Ranger's 'molecule_info.h5' file. + type: file + direction: output + required: false + example: molecule_info.h5 + - name: --bam + description: | + Location of output BAM files. + type: file + direction: output + required: false + example: possorted_genome_bam.bam + - name: --bai + description: | + Where to store the BAM index files. + type: file + direction: output + required: false + example: possorted_genome_bam.bam.bai + - name: --raw_h5 + description: | + Output path for the h5 file storing the raw counts. + type: file + direction: output + required: false + # description: Counts in AnnData format. + example: raw_feature_bc_matrix.h5 +resources: + - type: bash_script + path: script.sh + +engines: +- type: docker + image: "ubuntu:jammy" + setup: + - type: docker + run: apt update && apt upgrade -y + +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/mapping/cellranger_count_split/script.sh b/src/mapping/cellranger_count_split/script.sh new file mode 100644 index 00000000..b49aae33 --- /dev/null +++ b/src/mapping/cellranger_count_split/script.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input='input_dir' +par_filtered_h5='filtered_feature_bc_matrix.h5' +par_metrics_summary='metrics_summary.csv' +par_molecule_info='molecule_info.h5' +par_bam='possorted_genome_bam.bam' +par_bai='possorted_genome_bam.bam.bai' +par_raw_h5='raw_feature_bc_matrix.h5' +## VIASH END + +filtered_h5="$par_input/filtered_feature_bc_matrix.h5" +if [ -f "$filtered_h5" ] && [ ! -z "$par_filtered_h5" ]; then + echo "+ cp $filtered_h5 $par_filtered_h5" + cp "$filtered_h5" "$par_filtered_h5" +fi + +metrics_summary="$par_input/metrics_summary.csv" +if [ -f "$metrics_summary" ] && [ ! -z "$par_metrics_summary" ]; then + echo "+ cp $metrics_summary $par_metrics_summary" + cp "$metrics_summary" "$par_metrics_summary" +fi + +molecule_info="$par_input/molecule_info.h5" +if [ -f "$molecule_info" ] && [ ! -z "$par_molecule_info" ]; then + echo "+ cp $molecule_info $par_molecule_info" + cp "$molecule_info" "$par_molecule_info" +fi + +bam="$par_input/possorted_genome_bam.bam" +if [ -f "$bam" ] && [ ! -z "$par_bam" ]; then + echo "cp $bam $par_bam" + cp "$bam" "$par_bam" +fi + +raw_h5="$par_input/raw_feature_bc_matrix.h5" +if [ -f "$raw_h5" ] && [ ! -z "$par_raw_h5" ]; then + echo "+ cp $raw_h5 $par_raw_h5" + cp "$raw_h5" "$par_raw_h5" +fi + +bai="$par_input/possorted_genome_bam.bam.bai" +if [ -f "$bai" ] && [ ! -z "$par_bai" ]; then + echo "+ cp $bai $par_bai" + cp "$bai" "$par_bai" +fi \ No newline at end of file diff --git a/src/mapping/cellranger_multi/cellranger_multi.yaml b/src/mapping/cellranger_multi/cellranger_multi.yaml new file mode 100644 index 00000000..242abb07 --- /dev/null +++ b/src/mapping/cellranger_multi/cellranger_multi.yaml @@ -0,0 +1,473 @@ +argument_groups: + - name: Input files + arguments: + - type: file + name: --input + required: false + description: | + The FASTQ files to be analyzed. FASTQ files should conform to the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: [ mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz ] + multiple: true + + - name: Feature type-specific input files + description: | + Helper functionality to allow feature type-specific input files, without the need to specify + library_type or library_id. The library_id will be inferred from the input paths. + arguments: + - type: file + name: --gex_input + required: false + description: | + The FASTQ files to be analyzed for Gene Expression. FASTQ files should conform to the + naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --abc_input + required: false + description: | + The FASTQ files to be analyzed for Antibody Capture. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --cgc_input + required: false + description: | + The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --mux_input + required: false + description: | + The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --vdj_input + required: false + description: | + The FASTQ files to be analyzed for VDJ. FASTQ files should conform to the + naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --vdj_t_input + required: false + description: | + The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform to the naming + conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --vdj_t_gd_input + required: false + description: | + The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --vdj_b_input + required: false + description: | + The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + - type: file + name: --agc_input + required: false + description: | + The FASTQ files to be analyzed for Antigen Capture. FASTQ files should conform to + the naming conventions of bcl2fastq and mkfastq: + `[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz` + example: + [mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz] + multiple: true + multiple_sep: ";" + + - name: Library arguments + arguments: + - type: string + name: --library_id + required: false + description: | + The Illumina sample name to analyze. This must exactly match the 'Sample Name'part + of the FASTQ files specified in the `--input` argument. + example: ["mysample1"] + multiple: true + - type: string + name: --library_type + required: false + description: | + The underlying feature type of the library. + choices: ["Gene Expression", "VDJ", "VDJ-T", "VDJ-B", "VDJ-T-GD", "Antibody Capture", + "CRISPR Guide Capture", "Multiplexing Capture", "Antigen Capture", "Custom"] + example: "Gene Expression" + multiple: true + - type: string + name: --library_subsample + required: false + description: | + The rate at which reads from the provided FASTQ files are sampled. + Must be strictly greater than 0 and less than or equal to 1. + example: "0.5" + multiple: true + - type: string + name: --library_lanes + required: false + description: Lanes associated with this sample. Defaults to using all lanes. + example: "1-4" + multiple: true + - type: string + name: "--library_chemistry" + description: | + Only applicable to FRP. Library-specific assay configuration. By default, + the assay configuration is detected automatically. Typically, users will + not need to specify a chemistry. + + - name: Sample parameters + # Corresponds to the [samples] section + arguments: + - type: string + name: --sample_ids + alternatives: "--cell_multiplex_sample_id" + multiple: true + description: | + A name to identify a multiplexed sample. Must be alphanumeric with hyphens and/or underscores, + and less than 64 characters. Required for Cell Multiplexing libraries. + - type: string + multiple: true + name: --sample_description + alternatives: [--cell_multiplex_description] + description: A description for the sample. + - type: integer + multiple: true + name: --sample_expect_cells + example: 3000 + description: | + Expected number of recovered cells, used as input to cell calling algorithm. + - type: integer + name: "--sample_force_cells" + example: 3000 + multiple: true + required: false + description: | + Force pipeline to use this number of cells, bypassing cell detection. + + - name: "Feature Barcode library specific arguments" + # Corresponds to the [feature] section + arguments: + - name: "--feature_reference" + type: file + description: | + Path to the Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes. + Required only for Antibody Capture or CRISPR Guide Capture libraries. + See https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref for more information." + example: "feature_reference.csv" + required: false + - name: "--feature_r1_length" + type: integer + required: false + description: | + Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, + where N is the user-supplied value. Note that the length includes the Barcode and UMI + sequences so do not set this below 26. + - name: "--feature_r2_length" + type: integer + required: false + description: | + Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, + where N is a user-supplied value. Trimming occurs before sequencing metrics are computed + and therefore, limiting the length of Read 2 may affect Q30 scores. + - name: "--min_crispr_umi" + type: integer + min: 1 + required: false + description: | + Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection. + If a lower or higher sensitivity is desired for detection, this value can be customized + according to specific experimental needs. Applicable only to datasets that include a + CRISPR Guide Capture library. + - name: Gene expression arguments + # Corresponds to the [gene-expression] section + description: Arguments relevant to the analysis of gene expression data. + arguments: + - name: "--gex_reference" + type: file + description: "Genome refence index built by Cell Ranger mkref." + example: "reference_genome.tar.gz" + required: true + - type: boolean + name: "--gex_secondary_analysis" + default: false + description: Whether or not to run the secondary analysis e.g. clustering. + - type: boolean + name: "--gex_generate_bam" + default: false + description: Whether to generate a BAM file. + - type: file + name: "--tenx_cloud_token_path" + description: The 10x Cloud Analysis user token used to enable cell annotation. + - type: string + name: "--cell_annotation_model" + description: | + "Cell annotation model to use. If auto, uses the default model for the species. + If not given, does not run cell annotation." + choices: ["auto", "human_pca_v1_beta", "mouse_pca_v1_beta"] + - type: integer + name: --gex_expect_cells + example: 3000 + description: | + Expected number of recovered cells, used as input to cell calling algorithm. + - type: integer + name: "--gex_force_cells" + example: 3000 + description: | + Force pipeline to use this number of cells, bypassing cell detection. + - type: boolean + name: "--gex_include_introns" + default: true + description: | + Whether or not to include intronic reads in counts. + This option does not apply to Fixed RNA Profiling analysis. + - name: "--gex_r1_length" + type: integer + required: false + description: | + Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, + where N is the user-supplied value. Note that the length includes the Barcode and UMI + sequences so do not set this below 26. + - name: "--gex_r2_length" + type: integer + required: false + description: | + Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, + where N is a user-supplied value. Trimming occurs before sequencing metrics are computed + and therefore, limiting the length of Read 2 may affect Q30 scores. + - type: string + name: --gex_chemistry + default: auto + description: | + Assay configuration. Either specify a single value which will be applied to all libraries, + or a number of values that is equal to the number of libararies. The latter is only applicable + to only applicable to Fixed RNA Profiling. + - auto: Chemistry autodetection (default) + - threeprime: Single Cell 3' + - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single Cell 3' v1, v2, v3, or v4 + - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT + - SC-FB: Single Cell Antibody-only 3' v2 or 5' + - fiveprime: Single Cell 5' + - SC5P-PE: Paired-end Single Cell 5' + - SC5P-PE-v3: Paired-end Single Cell 5' v3 + - SC5P-R2: R2-only Single Cell 5' + - SC5P-R2-v3: R2-only Single Cell 5' v3 + - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X) + - SC5PHT : Single Cell 5' v2 HT + - SFRP: Fixed RNA Profiling (Singleplex) + - MFRP: Fixed RNA Profiling (Multiplex, Probe Barcode on R2) + - MFRP-R1: Fixed RNA Profiling (Multiplex, Probe Barcode on R1) + - MFRP-RNA: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R2) + - MFRP-Ab: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:69) + - MFRP-Ab-R2pos50: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:50) + - MFRP-RNA-R1: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R1) + - MFRP-Ab-R1: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode on R1) + - ARC-v1 for analyzing the Gene Expression portion of Multiome data. If Cell Ranger auto-detects ARC-v1 chemistry, an error is triggered. + See https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information. + choices: [ auto, threeprime, fiveprime, SC3Pv1, SC3Pv2, SC3Pv3, SC3Pv3-polyA, SC3Pv4, SC3Pv4-polyA, SC3Pv3LT, SC3Pv3HT, SC3Pv3HT-polyA, + SC5P-PE, SC5P-PE-v3, SC5P-R2, SC-FB, SC5P-R2-v3, SCP5-PE-v3, SC5PHT, MFRP, MFRP-R1, MFRP-RNA, MFRP-Ab, + SFRP, MFRP-Ab-R2pos50, MFRP-RNA-R1, MFRP-Ab-R1, ARC-v1] + + - name: "VDJ related parameters" + # The [vdj] section + arguments: + - name: "--vdj_reference" + type: file + description: "VDJ refence index built by Cell Ranger mkref." + example: "reference_vdj.tar.gz" + required: false + - name: "--vdj_inner_enrichment_primers" + type: file + description: | + V(D)J Immune Profiling libraries: if inner enrichment primers other than those provided + in the 10x Genomics kits are used, they need to be specified here as a + text file with one primer per line. + example: "enrichment_primers.txt" + required: false + - name: "--vdj_r1_length" + type: integer + required: false + description: | + Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, where N is the user-supplied value. + Note that the length includes the Barcode and UMI sequences so do not set this below 26. + - name: "--vdj_r2_length" + type: integer + required: false + description: | + Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, where N is a user-supplied value. + Trimming occurs before sequencing metrics are computed and therefore, limiting the length of Read 2 may affect Q30 scores + + - name: 3' Cell multiplexing parameters (CellPlex Multiplexing) + # cell_multiplex_oligo_ids adds to [samples] section + # min_assignment_confidence, cmo_set barcode_sample_assignment are added to [gene-expression] + arguments: + - type: string + name: --cell_multiplex_oligo_ids + alternatives: [--cmo_ids] + multiple: true + description: | + The Cell Multiplexing oligo IDs used to multiplex this sample. If multiple CMOs were used for a sample, + separate IDs with a pipe (e.g., CMO301|CMO302). Required for Cell Multiplexing libraries. + + - type: double + name: --min_assignment_confidence + description: | + The minimum estimated likelihood to call a sample as tagged with a Cell Multiplexing Oligo (CMO) instead of "Unassigned". + Users may wish to tolerate a higher rate of mis-assignment in order to obtain more singlets to include in their analysis, + or a lower rate of mis-assignment at the cost of obtaining fewer singlets. + - type: file + direction: input + required: false + name: "--cmo_set" + description: | + Path to a custom CMO set CSV file, declaring CMO constructs and associated barcodes. If the default CMO reference IDs that are built into + the Cell Ranger software are required, this option does not need to be used. + - type: file + direction: input + required: false + name: "--barcode_sample_assignment" + description: | + Path to a barcode-sample assignment CSV file that specifies the barcodes that belong to each sample. + + - name: Hashtag multiplexing parameters + # Is added to [samples] + arguments: + - name: --hashtag_ids + type: string + multiple: true + description: | + The hashtag IDs used to multiplex this sample. If multiple antibody hashtags were used for the same sample, + you can separate IDs with a pipe. + + - name: On-chip multiplexing parameters + # Is added to [samples] + arguments: + - name: --ocm_barcode_ids + type: string + multiple: true + # Note: choices is not an option here because multiple values can be added using pipe + description: | + The OCM barcode IDs used to multiplex this sample. Must be one of OB1, OB2, OB3, OB4. + If multiple OCM Barcodes were used for the same sample, you can separate IDs + with a pipe (e.g., OB1|OB2). + + - name: Flex multiplexing paramaters + # probe_set, filter_probes and emptydrops_minimum_umis end up in [gene-expression] + # probe_barcode_ids ends up in [samples] + arguments: + - type: file + name: "--probe_set" + description: | + A probe set reference CSV file. It specifies the sequences used as a reference for probe alignment and the gene ID associated with each probe. + It must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region and an optional 5th column (probe file format 1.0.1). + - gene_id: The Ensembl gene identifier targeted by the probe. + - probe_seq: The nucleotide sequence of the probe, which is complementary to the transcript sequence. + - probe_id: The probe identifier, whose format is described in Probe identifiers. + - included: A TRUE or FALSE flag specifying whether the probe is included in the filtered counts matrix output or excluded by the probe filter. + See filter-probes option of cellranger multi. All probes of a gene must be marked TRUE in the included column for that gene to be included. + - region: Present only in v1.0.1 probe set reference CSV. The gene boundary targeted by the probe. Accepted values are spliced or unspliced. + + The file also contains a number of required metadata fields in the header in the format #key=value: + - panel_name: The name of the probe set. + - panel_type: Always predesigned for predesigned probe sets. + - reference_genome: The reference genome build used for probe design. + - reference_version: The version of the Cell Ranger reference transcriptome used for probe design. + - probe_set_file_format: The version of the probe set file format specification that this file conforms to. + - type: boolean # Null is also a valid option because passing this argument to cellranger (true or false) requires --probe_set + name: "--filter_probes" + description: | + If 'false', include all non-deprecated probes listed in the probe set reference CSV file. + If 'true' or not set, probes that are predicted to have off-target activity to homologous genes are excluded from analysis. + Not filtering will result in UMI counts from all non-deprecated probes, + including those with predicted off-target activity, to be used in the analysis. + Probes whose ID is prefixed with DEPRECATED are always excluded from the analysis. + + - type: string + name: "--probe_barcode_ids" + multiple: true + description: | + The Fixed RNA Probe Barcode ID used for this sample, and for multiplex GEX + Antibody Capture libraries, + the corresponding Antibody Multiplexing Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001) + when an Antibody Capture library is present. The barcode pair order is BC+AB and they + are separated with a "+" (no spaces). Alternatively, you can specify the Probe Barcode ID alone and + Cell Ranger's barcode pairing auto-detection algorithm will automatically match to the corresponding Antibody + Multiplexing Barcode. + - type: integer + name: --emptydrops_minimum_umis + min: 1 + description: | + For singleplex Flex experiments, use this option to adjust the UMI cutoff during the second step of cell calling. + Cell Ranger will still perform the full cell calling process but will only evaluate barcodes with UMIs above + the threshold you specify. + + - name: Antigen Capture (BEAM) libary arguments + # These end up in the [antigen-specificity] section + description: | + These arguments are recommended if an Antigen Capture (BEAM) library is present. + It is needed to calculate the antigen specificity score. + arguments: + - type: string + name: --control_id + multiple: true + description: | + A user-defined ID for any negative controls used in the T/BCR Antigen Capture assay. Must match id specified in the feature reference CSV. + May only include ASCII characters and must not use whitespace, slash, quote, or comma characters. + Each ID must be unique and must not collide with a gene identifier from the transcriptome. + - type: string + multiple: true + name: --mhc_allele + description: | + The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele name specified in the Feature Reference CSV. + - name: "General arguments" + description: | + These arguments are applicable to all library types. + arguments: + - name: "--check_library_compatibility" + type: boolean + default: true + description: | + Optional. This option allows users to disable the check that evaluates 10x Barcode overlap between + ibraries when multiple libraries are specified (e.g., Gene Expression + Antibody Capture). Setting + this option to false will disable the check across all library combinations. We recommend running + this check (default), however if the pipeline errors out, users can bypass the check to generate + outputs for troubleshooting. + diff --git a/src/mapping/cellranger_multi/config.vsh.yaml b/src/mapping/cellranger_multi/config.vsh.yaml new file mode 100644 index 00000000..97bbd130 --- /dev/null +++ b/src/mapping/cellranger_multi/config.vsh.yaml @@ -0,0 +1,68 @@ +name: cellranger_multi +namespace: mapping +scope: "public" +description: Align fastq files using Cell Ranger multi. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] +__merge__: /src/mapping/cellranger_multi/cellranger_multi.yaml +argument_groups: + - name: Outputs + arguments: + - type: file + name: --output + direction: output + description: The folder to store the alignment results. + example: "/path/to/output" + required: true + + - name: Executor arguments + arguments: + - name: "--dryrun" + type: boolean_true + description: "If true, the output directory will only contain the CWL input files, but the pipeline itself will not be executed." +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/10x_5k_anticmv/raw/ + dest: 10x_5k_anticmv/raw/ + - path: /resources_test/10x_5k_lung_crispr/raw/ + dest: 10x_5k_lung_crispr/raw/ + - path: /resources_test/10x_5k_beam/raw/ + dest: 10x_5k_beam/raw/ + - path: /resources_test/10x_5k_fixed/raw/ + dest: 10x_5k_fixed/raw + - path: /resources_test/10x_5k_beam/raw/ + - path: /resources_test/reference_gencodev41_chr1 +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger:9.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + - type: python + packages: + - pandas + - pyyaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ veryhighmem, highcpu, highdisk ] diff --git a/src/mapping/cellranger_multi/script.py b/src/mapping/cellranger_multi/script.py new file mode 100644 index 00000000..a0e9e43f --- /dev/null +++ b/src/mapping/cellranger_multi/script.py @@ -0,0 +1,565 @@ +from __future__ import annotations + +import sys +import re +import subprocess +from multiprocessing import cpu_count +import tempfile +import pandas as pd +import yaml +from typing import Optional, Any, Union +import tarfile +from pathlib import Path +import shutil +from itertools import chain + +CELL_RANGER_EXEC = shutil.which("cellranger") + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + "output": "./cellranger_test_output", + "input": [ + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz", + ], + "library_id": [ + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "5k_human_antiCMV_T_TBNK_connect_AB_subset", + "5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + ], + "library_type": ["Gene Expression", "Antibody Capture", "VDJ"], + "gex_input": None, + "abc_input": None, + "cgc_input": None, + "mux_input": None, + "vdj_input": None, + "vdj_t_input": None, + "vdj_t_gd_input": None, + "vdj_b_input": None, + "agc_input": None, + "library_lanes": None, + "library_subsample": None, + "gex_expect_cells": None, + "gex_chemistry": "auto", + "gex_secondary_analysis": False, + "gex_generate_bam": False, + "gex_include_introns": False, + "cell_multiplex_sample_id": None, + "cell_multiplex_oligo_ids": None, + "cell_multiplex_description": None, + "dryrun": False, + "library_chemistry": None, + "sample_ids": None, + "sample_description": None, + "probe_barcode_ids": None, + "sample_expect_cells": None, + "sample_force_cells": None, + "hashtag_ids": None, + "ocm_barcode_ids": None, + "gex_force_cells": None, + "gex_r1_length": None, + "gex_r2_length": None, + "min_assignment_confidence": None, + "barcode_sample_assignment": None, + "cmo_set": None, + "probe_set": None, + "check_library_compatibility": True, + "tenx_cloud_token_path": False, + "filter_probes": None, + "cell_annotation_model": None, + "emptydrops_minimum_umis": None, + "feature_r1_length": None, + "feature_r2_length": None, + "min_crispr_umi": None, + "vdj_inner_enrichment_primers": None, + "vdj_r1_length": None, + "vdj_r2_length": None, + "control_id": None, + "mhc_allele": None, + "gex_reference": "resources_test/reference_gencodev41_chr1/reference_cellranger.tar.gz", + "feature_reference": "resources_test/10x_5k_anticmv/raw/feature_reference.csv", + "vdj_reference": "resources_test/10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz", +} +meta = { + "name": "cellranger_multi", + "cpus": 10, + "memory_b": None, + "memory_kb": None, + "memory_mb": None, + "memory_gb": 15, + "memory_tb": None, + "memory_pb": None, + "temp_dir": "/tmp", + "config": "./target/executable/mapping/cellranger_multi/.config.vsh.yaml", + "resources_dir": "./src/utils", +} + +CELL_RANGER_EXEC = "src/cellranger" +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +if not CELL_RANGER_EXEC: + raise RuntimeError( + "Could not find Cell Ranger executable. Please make sure it is installed and available in your PATH." + ) + +# Tested with cellranger 7.0: +# - omitting the lane number is allowed (e.g. `_L001`) +# - lane number should be omitted across all files if omitted in one +# - replacing `.fastq.` for `.fq.` is NOT allowed +# - omitting `.gz` is allowed + +fastq_regex = r"^([A-Za-z0-9\-_\.]+)_S(\d+)_(L(\d+)_)?[RI](\d+)_(\d+)\.fastq(\.gz)?$" +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_L001_R1_001.fastq.gz") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq.gz.txt") is None + +# Invert some parameters. Keep the original ones in the config for compatibility +inverted_params = { + "gex_no_secondary_analysis": "gex_secondary_analysis", +} +for inverted_param, param in inverted_params.items(): + par[inverted_param] = not par[param] if par[param] is not None else None + del par[param] + +GEX_CONFIG_KEYS = { + "gex_reference": "reference", + "gex_expect_cells": "expect-cells", + "gex_force_cells": "force-cells", + "gex_chemistry": "chemistry", + "gex_no_secondary_analysis": "no-secondary", + "gex_generate_bam": "create-bam", + "gex_include_introns": "include-introns", + "min_assignment_confidence": "min-assignment-confidence", + "check_library_compatibility": "check-library-compatibility", + "barcode_sample_assignment": "barcode-sample-assignment", + "cmo_set": "cmo-set", + "probe_set": "probe-set", + "filter_probes": "filter-probes", + "gex_r1_length": "r1-length", + "gex_r2_length": "r2-length", + "tenx_cloud_token_path": "tenx-cloud-token-path", + "cell_annotation_model": "cell-annotation-model", + "emptydrops_minimum_umis": "emptydrops_minimum_umis", +} + +FEATURE_CONFIG_KEYS = { + "feature_reference": "reference", + "feature_r1_length": "r1-length", + "feature_r2_length": "r2-length", + "min_crispr_umi": "min-crispr-umi", +} + +VDJ_CONFIG_KEYS = { + "vdj_reference": "reference", + "vdj_inner_enrichment_primers": "inner-enrichment-primers", + "vdj_r1_length": "r1-length", + "vdj_r2_length": "r2-length", +} + + +ANTIGEN_SPECIFICITY_CONFIG_KEYS = { + "control_id": "control_id", + "mhc_allele": "mhc_allele", +} + + +REFERENCE_SECTIONS = { + "gene-expression": (GEX_CONFIG_KEYS, "index"), + "feature": (FEATURE_CONFIG_KEYS, "index"), + "vdj": (VDJ_CONFIG_KEYS, "index"), + "antigen-specificity": (ANTIGEN_SPECIFICITY_CONFIG_KEYS, "columns"), +} + +LIBRARY_CONFIG_KEYS = { + "library_id": "fastq_id", + "library_type": "feature_types", + "library_subsample": "subsample_rate", + "library_lanes": "lanes", + "library_chemistry": "chemistry", +} + + +SAMPLE_PARAMS_CONFIG_KEYS = { + "sample_ids": "sample_id", + "cell_multiplex_oligo_ids": "cmo_ids", + "sample_description": "description", + "probe_barcode_ids": "probe_barcode_ids", + "sample_expect_cells": "expect_cells", + "sample_force_cells": "force_cells", + "hashtag_ids": "hashtag_ids", + "ocm_barcode_ids": "ocm_barcode_ids", +} + + +# These are derived from the dictionaries above +REFERENCES = tuple( + reference_param + for reference_param, cellranger_param in chain( + GEX_CONFIG_KEYS.items(), FEATURE_CONFIG_KEYS.items(), VDJ_CONFIG_KEYS.items() + ) + if cellranger_param == "reference" +) +LIBRARY_PARAMS = tuple(LIBRARY_CONFIG_KEYS.keys()) +SAMPLE_PARAMS = tuple(SAMPLE_PARAMS_CONFIG_KEYS.keys()) +HELPER_INPUT = { + "gex_input": "Gene Expression", + "abc_input": "Antibody Capture", + "cgc_input": "CRISPR Guide Capture", + "mux_input": "Multiplexing Capture", + "vdj_input": "VDJ", + "vdj_t_input": "VDJ-T", + "vdj_t_gd_input": "VDJ-T-GD", + "vdj_b_input": "VDJ-B", + "agc_input": "Antigen Capture", +} + + +def infer_library_id_from_path(input_path: str) -> str: + match = re.match(fastq_regex, input_path) + assert match is not None, ( + f"File name of '{input_path}' should match regex {fastq_regex}." + ) + return match.group(1) + + +def transform_helper_inputs(par: dict[str, Any]) -> dict[str, Any]: + helper_input = {"input": [], "library_id": [], "library_type": []} + for input_type, library_type in HELPER_INPUT.items(): + if par[input_type]: + par[input_type] = resolve_input_directories_to_fastq_paths(par[input_type]) + + library_ids = [ + infer_library_id_from_path(path.name) for path in par[input_type] + ] + + library_id_dict = {} + for fastq, library_id in zip(par[input_type], library_ids): + library_id_dict.setdefault(library_id, []).append(fastq) + + for library_id, input in library_id_dict.items(): + helper_input["input"] += input + helper_input["library_id"].append(library_id) + helper_input["library_type"].append(library_type) + + assert len(helper_input["library_id"]) == len(set(helper_input["library_id"])), ( + "File names passed to feature type-specific inputs must be unique" + ) + + return helper_input + + +def lengths_gt1(dic: dict[str, Optional[list[Any]]]) -> dict[str, int]: + return { + key: len(li) + for key, li in dic.items() + if li is not None and isinstance(li, (list, tuple, set)) + } + + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + + +def subset_dict( + dictionary: dict[str, str], keys: Union[dict[str, str], list[str]] +) -> dict[str, str]: + if isinstance(keys, (list, tuple)): + keys = {key: key for key in keys} + return { + dest_key: dictionary[orig_key] + for orig_key, dest_key in keys.items() + if dictionary[orig_key] is not None + } + + +def check_subset_dict_equal_length( + group_name: str, dictionary: dict[str, list[str]] +) -> None: + lens = lengths_gt1(dictionary) + assert len(set(lens.values())) <= 1, ( + f"The number of values passed to {group_name} " + f"arguments must be 0, 1 or all the same. Offenders: {lens}" + ) + + +def resolve_input_directories_to_fastq_paths(input_paths: list[str]) -> list[Path]: + input_paths = [Path(fastq) for fastq in input_paths] + if len(input_paths) == 1 and input_paths[0].is_dir(): + logger.info( + "Detected a directory in input paths, " + "traversing to see if we can detect any FASTQ files." + ) + input_paths = [ + input_path + for input_path in input_paths[0].rglob("*") + if re.match(fastq_regex, input_path.name) + ] + + # check input fastq files + for input_path in input_paths: + assert re.match(fastq_regex, input_path.name) is not None, ( + f"File name of --input '{input_path}' should match regex {fastq_regex}." + ) + + return input_paths + + +def make_paths_absolute(par: dict[str, Any], config: Path | str): + with open(config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "file": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + def make_path_absolute(file: str | Path) -> Path: + logger.info("Making path %s absolute", file) + return Path(file).resolve() + + new_arg = ( + [make_path_absolute(file) for file in par_value] + if is_multiple + else make_path_absolute(par_value) + ) + par[arg_name] = new_arg + return par + + +def handle_integers_not_set(par: dict[str, Any], viash_config: Path | str) -> str: + """ + Allow to use `-1` to define a 'not set' value for arguments of `type: integer` with `multiple: true`. + """ + with open(viash_config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "integer": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + if not is_multiple: + continue + + def replace_notset_values(integer_value: int) -> int | None: + return None if integer_value == -1 else integer_value + + # Use an extension array to handle "None" values, otherwise int + NA + # values would be converted to a "float" dtype + new_arg = pd.array( + [replace_notset_values(value) for value in par_value], dtype="Int64" + ) + par[arg_name] = new_arg + return par + + +def process_params(par: dict[str, Any], viash_config: Path | str) -> str: + if par["input"]: + assert len(par["library_type"]) > 0, ( + "--library_type must be defined when passing input to --input" + ) + assert len(par["library_id"]) > 0, ( + "--library_id must be defined when passing input to --input" + ) + + # if par["input"] is a directory, look for fastq files + par["input"] = resolve_input_directories_to_fastq_paths(par["input"]) + + # add helper input + helper_input = transform_helper_inputs(par) + for key in ["input", "library_id", "library_type"]: + par[key] = (par[key] if par[key] else []) + helper_input[key] + + assert len(par[key]) > 0, ( + f"Either --{key} or feature type-specific input (e.g. --gex_input, --abc_input, ...) must be defined" + ) + + # check lengths of libraries metadata + library_dict = subset_dict(par, LIBRARY_PARAMS) + check_subset_dict_equal_length("Library", library_dict) + + samples_dict = subset_dict(par, SAMPLE_PARAMS) + check_subset_dict_equal_length("Samples", samples_dict) + + # Allow using -1 to indicate unset integers for arguments + # that accept multiple integers. + par = handle_integers_not_set(par, viash_config) + + # use absolute paths + return make_paths_absolute(par, viash_config) + + +def generate_csv_category(name: str, args: dict[str, str], orient: str) -> list[str]: + assert orient in ("index", "columns") + if not args: + return [] + title = [f"[{name}]"] + # Which index to include in csv section is based on orientation + to_csv_args = {"index": (orient == "index"), "header": (orient == "columns")} + values = [pd.DataFrame.from_dict(args, orient=orient).to_csv(**to_csv_args).strip()] + return title + values + [""] + + +def generate_config(par: dict[str, Any], fastq_dir: str) -> str: + content_list = [] + par["fastqs"] = fastq_dir + libraries = dict(LIBRARY_CONFIG_KEYS, **{"fastqs": "fastqs"}) + # TODO: use the union (|) operator when python is updated to 3.9 + all_sections = REFERENCE_SECTIONS | { + "libraries": (libraries, "columns"), + "samples": (SAMPLE_PARAMS_CONFIG_KEYS, "columns"), + } + for section_name, (section_params, orientation) in all_sections.items(): + reference_pars = subset_dict(par, section_params) + content_list += generate_csv_category( + section_name, reference_pars, orient=orientation + ) + + return "\n".join(content_list) + + +def untar(tar_file, output_directory): + logger.info("Looking at %s to check if it needs decompressing", tar_file) + if tar_file and Path(tar_file).is_file() and tarfile.is_tarfile(tar_file): + extaction_dir_name = Path( + tar_file.stem + ).stem # Remove two extensions (if they exist) + unpacked_directory = output_directory / extaction_dir_name + logger.info("Extracting %s to %s", tar_file, unpacked_directory) + + with tarfile.open(tar_file, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_directory, members=members_to_move) + return unpacked_directory + return tar_file + + +def main(par: dict[str, Any], meta: dict[str, Any]): + logger.info("Processing params") + par = process_params(par, meta["config"]) + logger.info(par) + + with tempfile.TemporaryDirectory( + prefix="cellranger_multi-", dir=meta["temp_dir"] + ) as temp_dir: + logger.info("Using temporary directory %s", temp_dir) + temp_dir_path = Path(temp_dir) + for reference_par_name in REFERENCES: + par[reference_par_name] = untar(par[reference_par_name], temp_dir_path) + + logger.info("Creating symbolic links to temporary directory") + # Creating symlinks of fastq files to tempdir + input_symlinks_dir = temp_dir_path / "input_symlinks" + input_symlinks_dir.mkdir() + for fastq in par["input"]: + destination = input_symlinks_dir / fastq.name + destination.symlink_to(fastq) + + logger.info("Creating config file") + config_content = generate_config(par, input_symlinks_dir) + logger.info("Using config: \n\t%s", config_content.replace("\n", "\n\t")) + par["output"].mkdir(parents=True, exist_ok=True) + config_file = par["output"] / "config.csv" + with open(config_file, "w") as f: + f.write(config_content) + + logger.info("Creating Cell Ranger argument list") + temp_id = "run" + cmd = [CELL_RANGER_EXEC, "multi", "--disable-ui", "--id", temp_id] + + command_line_parameters = { + "--localcores": int(meta["cpus"]) if meta["cpus"] else cpu_count() - 1, + "--csv": config_file, + } + if meta["memory_gb"]: + command_line_parameters["--localmem"] = ( + int(meta["memory_gb"]) - 2 + if meta["memory_gb"] > 2 + else meta["memory_gb"] + ) + cmd += [ + f"{param}={param_value}" + for param, param_value in command_line_parameters.items() + ] + + logger.info("> " + " ".join(cmd)) + + if par["dryrun"]: + logger.info("Not executing command. Dry run enabled.") + return + + logger.info("Starting Cell Ranger") + with ( + subprocess.Popen( + cmd, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=1, + cwd=temp_dir, + ) as process, + (par["output"] / "cellranger_multi.log").open("w") as open_log, + ): + for line in process.stdout: + print(line.strip(), flush=True) + open_log.write(line) + logger.info("Checking exit code.") + # Popen objects do not have a check_exitcode method. + if process.returncode != 0: + raise RuntimeError("Cell Ranger returned a nonzero exitcode!") + + logger.info("Checking if expected output files are present.") + # look for output dir file + tmp_output_dir = temp_dir_path / temp_id / "outs" + expected_files = { + Path("multi"): Path.is_dir, + Path("per_sample_outs"): Path.is_dir, + Path("config.csv"): Path.is_file, + } + for file_path, type_func in expected_files.items(): + output_path = tmp_output_dir / file_path + if not type_func(output_path): + raise ValueError(f"Could not find expected '{output_path}'") + logger.info("Copying output files to %s", par["output"]) + for output_path in tmp_output_dir.rglob("*"): + if output_path.name != "config.csv": # Already created + shutil.move(str(output_path), par["output"]) + + +if __name__ == "__main__": + logger.info("Component '%s' started.", meta["name"]) + main(par, meta) + logger.info("Finished!") diff --git a/src/mapping/cellranger_multi/test.py b/src/mapping/cellranger_multi/test.py new file mode 100644 index 00000000..5e4c77a0 --- /dev/null +++ b/src/mapping/cellranger_multi/test.py @@ -0,0 +1,726 @@ +from pathlib import Path +import tarfile +from textwrap import dedent +import re +import pytest +import sys +import subprocess + +## VIASH START +meta = { + "executable": "./target/docker/mapping/cellranger_multi/cellranger_multi", + "resources_dir": "resources_test/", + "cpus": 15, + "memory_gb": 20, + "config": "src/mapping/cellranger_multi/config.vsh.yaml", +} +## VIASH END + + +def make_path_relative(some_path): + absolute_input_path = Path(some_path).resolve() + absolute_cwd = Path.cwd().resolve() + try: + return absolute_input_path.relative_to(absolute_cwd) + except ValueError as e: + # TODO: python 3.12: remove lines below and add walk_up=True to `relative_to` call + if "is not in the subpath of" in str(e): + _, *parts_input = absolute_input_path.parts + _, *parts_cwd = absolute_cwd.parts + parts_input.reverse() + parts_cwd.reverse() + while parts_cwd and parts_input and parts_cwd[-1] == parts_input[-1]: + parts_input.pop() + parts_cwd.pop() + for part in parts_cwd: + if not part or part == ".": + pass + else: + parts_input.append("..") + relative_path = type(absolute_input_path)("", *reversed(parts_input)) + assert relative_path.resolve() == absolute_input_path + return relative_path + raise e + + +resources_dir = make_path_relative(meta["resources_dir"]) +input1_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz" +) +input1_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz" +) +input2_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz" +) +input2_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz" +) +input3_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz" +) +input3_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz" +) +gex_reference = resources_dir / "reference_gencodev41_chr1/reference_cellranger.tar.gz" +feature_reference = resources_dir / "10x_5k_anticmv/raw/feature_reference.csv" +vdj_reference = ( + resources_dir + / "10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +) + +# Beam Input +input1_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R1_001.fastq.gz" +) +input1_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R2_001.fastq.gz" +) +input2_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R1_001.fastq.gz" +) +input2_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R2_001.fastq.gz" +) +input3_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R1_001.fastq.gz" +) +input3_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R2_001.fastq.gz" +) +vdj_reference_beam = ( + resources_dir + / "10x_5k_beam/raw/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" +) +feature_reference_beam = ( + resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_feature_reference.csv" +) + + +def test_cellranger_multi(run_component, random_path): + outputpath = random_path() + + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + ] + run_component(args) + + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + + # check for filtered gex+ab data + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + + # check for vdj data + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + + +def test_cellranger_multi_decompressed_reference(run_component, random_path): + extracted_tar = random_path() + extracted_tar.mkdir() + with tarfile.open(gex_reference) as open_tarfile: + open_tarfile.extractall(extracted_tar) + run_component( + [ + "--output", + random_path(), + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + extracted_tar, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--dryrun", + ] + ) + + +def test_cellranger_multi_directory_input(run_component, random_path): + args = [ + "--output", + random_path(), + "--input", + meta["resources_dir"] + "/10x_5k_anticmv/raw/", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--dryrun", + ] + run_component(args) + + +def test_vdj_inner_enrichment_primers(run_component, random_path): + outputpath = random_path() + enrichment_primers_file = random_path("txt") + with enrichment_primers_file.open("w") as primers_file_open: + primers_file_open.write("AGTCTCTCAGCTGGTACACG\nTCTGATGGCTCAAACACAGC") + args = [ + "--output", + outputpath, + "--input", + meta["resources_dir"] + "/10x_5k_anticmv/raw/", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--vdj_inner_enrichment_primers", + str(make_path_relative(enrichment_primers_file)), + "--dryrun", + ] + run_component(args) + config_path = outputpath / "config.csv" + assert config_path.is_file() + with config_path.open("r") as config_file: + config_contents = config_file.read() + expected_csv_content = rf"\[vdj\]\nreference,.*?\ninner-enrichment-primers,{enrichment_primers_file.resolve()}\n" + assert re.search(expected_csv_content, config_contents) + + +def test_cellranger_multi_applies_gex_options(run_component, random_path): + outputpath = random_path() + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--dryrun", + ] + run_component(args) + config_path = outputpath / "config.csv" + assert config_path.is_file() + with config_path.open("r") as config_file: + config_contents = config_file.read() + expected_csv_content = dedent( + """\ + chemistry,auto + no-secondary,False + create-bam,False + include-introns,False + """ + ) + print(expected_csv_content, flush=True) + assert expected_csv_content in config_contents + + +def test_cellranger_multi_no_vdj_reference(run_component, random_path): + # GH291 + outputpath = random_path() + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset", + "--library_type", + "Gene Expression;Antibody Capture", + "--dryrun", + ] + run_component(args) + assert (outputpath / "config.csv").is_file() + + +def test_cellranger_multi_crispr_data(run_component, random_path): + outputpath = random_path() + args = [ + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R1_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R2_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R1_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R2_001.fastq.gz", + "--library_id", + "SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset;SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset", + "--library_type", + "Gene Expression;CRISPR Guide Capture", + "--min_crispr_umi", + "3", + "--gex_reference", + gex_reference, + "--feature_reference", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv", + "--output", + outputpath, + ] + run_component(args) + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + # check for filtered gex+ab data + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + # check for crispr data + assert (outputpath / "per_sample_outs/run/count/crispr_analysis/").is_dir() + + +def test_cellranger_multi_helper_input(run_component, random_path): + outputpath = random_path() + args = [ + "--output", + outputpath, + "--gex_input", + input1_R1, + "--gex_input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + ] + run_component(args) + + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + + # check for filtered gex+ab data + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + + # check for vdj data + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + + +def test_cellranger_multi_combined_helper_and_global_input(run_component, random_path): + outputpath = random_path() + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] + run_component(args) + + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + + # check for filtered gex+ab data + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + + # check for vdj data + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + + +def test_cellranger_multi_create_output_on_fail(run_component, random_path): + outputpath = random_path() + # missing vdj_reference + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] + + with pytest.raises(subprocess.CalledProcessError): + run_component(args) + + assert (outputpath / "cellranger_multi.log").is_file(), ( + "Should have created log file." + ) + + +def test_cellranger_multi_beam_data(run_component, random_path): + outputpath = random_path() + args = [ + "--input", + input1_R1_beam, + "--input", + input1_R2_beam, + "--input", + input2_R1_beam, + "--input", + input2_R2_beam, + "--input", + input3_R1_beam, + "--input", + input3_R2_beam, + "--library_id", + "beamt_human_A0201_B0702_pbmc_gex_subset;beamt_human_A0201_B0702_pbmc_ag_subset;beamt_human_A0201_B0702_pbmc_vdj_subset", + "--library_type", + "Gene Expression;VDJ-T;Antigen Capture", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference_beam, + "--feature_reference", + feature_reference_beam, + "--output", + outputpath, + "--control_id", + "negative_control_A0201;negative_control_B0702", + "--mhc_allele", + "HLA-A*02:01;HLA-B*07:02", + ] + run_component(args) + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + # check for antigen data + assert (outputpath / "per_sample_outs/run/antigen_analysis/").is_dir() + # check for vdj data + assert (outputpath / "per_sample_outs/run/vdj_t/").is_dir() + + +def test_cellranger_multi_fixed_rna(run_component, random_path): + outputpath = random_path() + args = [ + "--input", + f"{meta['resources_dir']}/10x_5k_fixed/raw/", + "--library_id", + "4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_subset", + "--library_type", + "Gene Expression", + "--feature_reference", + f"{meta['resources_dir']}/10x_5k_fixed/raw/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv", + "--gex_reference", + gex_reference, + "--output", + outputpath, + "--probe_barcode_ids", + "BC001;BC002;BC003;BC004", + "--sample_ids", + "Liver_BC1;Ovarian_BC2;Colorectal_BC3;Pancreas_BC4", + "--gex_generate_bam", + "false", + "--library_lanes", + "any", + "--probe_set", + f"{meta['resources_dir']}/10x_5k_fixed/raw/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv", + "--sample_force_cells", + "5000;-1;-1;-1", + ] + run_component(args) + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + # check for metrics summary + for sample in ["Liver_BC1", "Ovarian_BC2", "Colorectal_BC3", "Pancreas_BC4"]: + assert (outputpath / f"per_sample_outs/{sample}/metrics_summary.csv").is_file() + assert ( + outputpath + / f"per_sample_outs/{sample}/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + + assert (outputpath / "multi/multiplexing_analysis").is_dir() + + +def test_cellranger_multi_with_alternative_names(run_component, random_path): + import shutil + import gzip + + input_dir = random_path() + input_dir.mkdir() + + # Note: if one input file does not use any lanes, none of the input files should use lanes + # remove lanes + input1_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R1_001.fastq.gz" + ) + input1_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R2_001.fastq.gz" + ) + input2_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R1_001.fastq.gz" + ) + input2_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R2_001.fastq.gz" + ) + input3_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R1_001.fastq" + ) + input3_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R2_001.fastq" + ) + + # copy files + shutil.copy(input1_R1, input1_R1_link) + shutil.copy(input1_R2, input1_R2_link) + shutil.copy(input2_R1, input2_R1_link) + shutil.copy(input2_R2, input2_R2_link) + + with gzip.open(input3_R1, "rb") as f_in: + with open(input3_R1_link, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open(input3_R2, "rb") as f_in: + with open(input3_R2_link, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + outputpath = random_path() + args = [ + "--output", + outputpath, + "--input", + input1_R1_link, + "--input", + input1_R2_link, + "--abc_input", + input2_R1_link, + "--abc_input", + input2_R2_link, + "--vdj_input", + input3_R1_link, + "--vdj_input", + input3_R2_link, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] + run_component(args) + + # check for raw data + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + + # check for metrics summary + assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() + + # check for filtered gex+ab data + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() + + # check for vdj data + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + + +@pytest.mark.parametrize("cpus", [None]) +@pytest.mark.parametrize("memory_bytes", [None]) +def test_cellranger_no_cpus_or_mem_specifier(run_component, random_path): + outputpath = random_path() + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--library_subsample", + "0.1;0.1;0.1", + ] + + run_component(args) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/mapping/htseq_count/config.vsh.yaml b/src/mapping/htseq_count/config.vsh.yaml new file mode 100644 index 00000000..9895d863 --- /dev/null +++ b/src/mapping/htseq_count/config.vsh.yaml @@ -0,0 +1,196 @@ +name: htseq_count +namespace: mapping +scope: "public" +description: | + Quantify gene expression for subsequent testing for differential expression. + + This script takes one or more alignment files in SAM/BAM format and a feature file in GFF format and calculates for each feature the number of reads mapping to it. + + See http://htseq.readthedocs.io/en/master/count.html for details. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] +argument_groups: + - name: Input + arguments: + - type: file + name: --input + required: true + description: Path to the SAM/BAM files containing the mapped reads. + example: [ mysample1.BAM, mysample2.BAM ] + multiple: true + info: + orig_arg: samfilenames + - type: file + name: --reference + description: Path to the GTF file containing the features. + example: reference.gtf + required: true + info: + orig_arg: featurefilename + - name: Output + arguments: + - type: file + name: --output + description: Filename to output the counts to. + example: htseq-count.tsv + direction: output + required: true + info: + orig_arg: --counts_output + - type: string + name: --output_delimiter + description: Column delimiter in output. + required: false + example: "\t" + info: + orig_arg: --delimiter + - type: file + name: --output_sam + description: | + Write out all SAM alignment records into SAM/BAM files (one per input file needed), + annotating each line with its feature assignment (as an optional field with tag 'XF'). + See the -p option to use BAM instead of SAM. + example: [ mysample1_out.BAM, mysample2_out.BAM ] + direction: output + required: false + multiple: true + info: + orig_arg: --samout + - type: string + name: --output_sam_format + choices: [ sam, bam ] + description: Format to use with the --output_sam argument. + required: false + info: + orig_arg: --samout-format + - name: Arguments + arguments: + - name: --order + alternatives: [-r] + type: string + choices: [pos, name] + default: name + description: | + Sorting order of . Paired-end sequencing data must be sorted either by position or + by read name, and the sorting order must be specified. Ignored for single-end data. + info: + orig_arg: --order + - name: --stranded + alternatives: [-s] + type: string + choices: ["yes", "no", "reverse"] + default: "yes" + description: Whether the data is from a strand-specific assay. 'reverse' means 'yes' with reversed strand interpretation. + info: + orig_arg: --stranded + - name: --minimum_alignment_quality + type: integer + alternatives: [-a, --minaqual] + default: 10 + description: | + Skip all reads with MAPQ alignment quality lower than the given minimum value. + MAPQ is the 5th column of a SAM/BAM file and its usage depends on the software + used to map the reads. + info: + orig_arg: --minaqual + - name: --type + type: string + alternatives: [-t] + example: exon + description: "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + orig_arg: --type + - name: --id_attribute + type: string + alternatives: [-i] + example: gene_id + description: | + GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id). + All feature of the right type (see -t option) within the same GTF attribute will be added + together. The typical way of using this option is to count all exonic reads from each gene + and add the exons but other uses are possible as well. You can call this option multiple + times: in that case, the combination of all attributes separated by colons (:) will be used + as a unique identifier, e.g. for exons you might use -i gene_id -i exon_number. + multiple: true + info: + orig_arg: --idattr + - name: --additional_attributes + type: string + example: gene_name + description: | + Additional feature attributes (suitable for Ensembl GTF files: gene_name). Use multiple times + for more than one additional attribute. These attributes are only used as annotations in the + output, while the determination of how the counts are added together is done based on option -i. + multiple: true + info: + orig_arg: --additional-attr + - name: --add_chromosome_info + type: boolean_true + description: | + Store information about the chromosome of each feature as an additional attribute + (e.g. colunm in the TSV output file). + info: + orig_arg: --add-chromosome-info + - name: --mode + type: string + alternatives: [-m] + choices: [union, intersection-strict, intersection-nonempty] + default: union + description: Mode to handle reads overlapping more than one feature. + info: + orig_arg: --mode + - name: --non_unique + type: string + choices: [none, all, fraction, random] + default: none + description: Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features. + info: + orig_arg: --nonunique + - name: --secondary_alignments + type: string + choices: [score, ignore] + description: Whether to score secondary alignments (0x100 flag). + info: + orig_arg: --secondary-alignments + - name: --supplementary_alignments + type: string + choices: [score, ignore] + description: Whether to score supplementary alignments (0x800 flag). + info: + orig_arg: --supplementary-alignments + - name: --counts_output_sparse + type: boolean_true + description: Store the counts as a sparse matrix (mtx, h5ad, loom). + info: + orig_arg: --counts-output-sparse +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - HTSeq + - pyyaml + - scipy + - pandas + - numpy<2 + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/htseq_count/script.py b/src/mapping/htseq_count/script.py new file mode 100644 index 00000000..b1598d65 --- /dev/null +++ b/src/mapping/htseq_count/script.py @@ -0,0 +1,126 @@ +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil +import yaml + +## VIASH START +par = { + "input": ["resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"], + "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "test_output", +} +meta = {"cpus": 2, "temp_dir": "/tmp", "config": "src/mapping/htseq/config.vsh.yaml"} +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print( + f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print( + f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + orig_arg = arg.get("info", {}).get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +######################## +### Main code ### +######################## + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + + +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + print(">> Constructing command", flush=True) + cmd_args = ["htseq-count"] + generate_args(par, config) + + # manually process cpus parameter + if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--nprocesses", str(meta["cpus"])]) + + print(">> Running htseq-count with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + + subprocess.run(cmd_args, check=True) diff --git a/src/mapping/htseq_count/test.py b/src/mapping/htseq_count/test.py new file mode 100644 index 00000000..73b2c801 --- /dev/null +++ b/src/mapping/htseq_count/test.py @@ -0,0 +1,40 @@ +import subprocess +from pathlib import Path +import pandas as pd + + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +print("> Running command with folder", flush=True) +input = meta["resources_dir"] + "/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" +reference = ( + meta["resources_dir"] + + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +) +output = "test_output.tsv" + +cmd_pars = [ + meta["executable"], + "--input", + input, + "--reference", + reference, + "--output", + output, + "---cpus", + "2", +] +subprocess.run(cmd_pars, check=True) + +print("> Check if file exists", flush=True) +output_path = Path(output) +assert output_path.is_file() + +print("> Check contents", flush=True) +counts = pd.read_table(output_path, sep="\t") +assert counts.shape[0] > 100 +assert counts.shape[1] == 2 + +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/htseq_count_to_h5mu/config.vsh.yaml b/src/mapping/htseq_count_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..c40e27ba --- /dev/null +++ b/src/mapping/htseq_count_to_h5mu/config.vsh.yaml @@ -0,0 +1,69 @@ +name: htseq_count_to_h5mu +namespace: mapping +scope: "public" +description: | + Convert the htseq table to a h5mu. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] +argument_groups: + - name: Input + arguments: + - type: string + name: --input_id + required: true + description: The obs index for the counts + example: foo + multiple: true + - type: file + name: --input_counts + required: true + description: The counts as a TSV file as output by HTSeq. + example: counts.tsv + multiple: true + - type: file + name: --reference + required: true + description: The GTF file. + example: gencode_v41_star + - name: Outputs + arguments: + - name: "--output" + alternatives: [-o] + direction: output + type: file + description: "Output h5mu file." + required: true + example: output.h5mu + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - gtfparse + - type: python + packages: + - pyarrow~=18.0.0 + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, midcpu ] diff --git a/src/mapping/htseq_count_to_h5mu/script.py b/src/mapping/htseq_count_to_h5mu/script.py new file mode 100644 index 00000000..5f8d7cef --- /dev/null +++ b/src/mapping/htseq_count_to_h5mu/script.py @@ -0,0 +1,127 @@ +import tempfile +from pathlib import Path +import tarfile +import gzip +import shutil +import pandas as pd +import mudata as md +import anndata as ad +import polars as pl +import numpy as np +import gtfparse + +## VIASH START +par = { + "input_counts": ["resources_test/cellranger_tiny_fastq/htseq_counts.tsv"], + "input_id": ["", "bar"], + "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "test_output.h5mu", +} +meta = {"temp_dir": "/tmp"} +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +print("> combine counts data", flush=True) +counts_data = [] + +for input_id, input_counts in zip(par["input_id"], par["input_counts"]): + data = pd.read_table( + input_counts, + index_col=0, + names=["gene_ids", input_id], + dtype={"gene_ids": "U", input_id: "i"}, + ).transpose() + counts_data.append(data) + +# combine all counts +counts_and_qc = pd.concat(counts_data, axis=0) + +print("> split qc", flush=True) +idx = counts_and_qc.columns.str.startswith("_") +qc = counts_and_qc.loc[:, idx] +qc.columns = qc.columns.str.replace("^__", "", regex=True) +counts = counts_and_qc.loc[:, ~idx] + +print("> construct var", flush=True) +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + # read_gtf only works on str object, not pathlib.Path + reference = gtfparse.read_gtf(str(par["reference"])) + + +# This is a polars dataframe, not pandas +reference_genes = reference.filter( + (pl.col("feature") == "gene") & (pl.col("gene_id").is_in(list(counts.columns))) +).sort("gene_id") + +var = pd.DataFrame( + data={ + "gene_ids": pd.Index(reference_genes.get_column("gene_id")), + "feature_types": "Gene Expression", + "gene_symbol": reference_genes.get_column("gene_name").to_pandas(), + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData(X=counts, obsm={"qc_htseq": qc}, var=var, dtype=np.int32) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/mapping/htseq_count_to_h5mu/test.py b/src/mapping/htseq_count_to_h5mu/test.py new file mode 100644 index 00000000..65b9a62e --- /dev/null +++ b/src/mapping/htseq_count_to_h5mu/test.py @@ -0,0 +1,47 @@ +import subprocess +from pathlib import Path +import mudata as md + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +print("> Running command with folder", flush=True) +input = meta["resources_dir"] + "/cellranger_tiny_fastq/htseq_counts.tsv" +reference = ( + meta["resources_dir"] + + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +) +output = "test_output.h5mu" + +cmd_pars = [ + meta["executable"], + "--input_id", + "foo;bar", + "--input_counts", + f"{input};{input}", + "--reference", + reference, + "--output", + output, + "---cpus", + "2", + "--output_compression", + "gzip", +] +subprocess.run(cmd_pars, check=True) + +print("> Check if file exists", flush=True) +output_path = Path(output) +assert output_path.is_file() + +print("> Check contents", flush=True) +mdata = md.read_h5mu(output) + +print(mdata) + +assert "rna" in mdata.mod +assert mdata.n_obs == 2 +assert mdata.mod["rna"].n_vars > 100 + +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/multi_star/arguments_htseq.yaml b/src/mapping/multi_star/arguments_htseq.yaml new file mode 100644 index 00000000..ae4276ad --- /dev/null +++ b/src/mapping/multi_star/arguments_htseq.yaml @@ -0,0 +1,102 @@ +argument_groups: + - name: HTSeq arguments + arguments: + - name: --stranded + alternatives: [-s] + type: string + choices: ["yes", "no", "reverse"] + default: "yes" + description: Whether the data is from a strand-specific assay. 'reverse' means 'yes' with reversed strand interpretation. + info: + step: htseq + orig_arg: --stranded + - name: --minimum_alignment_quality + type: integer + alternatives: [-a, --minaqual] + default: 10 + description: | + Skip all reads with MAPQ alignment quality lower than the given minimum value. + MAPQ is the 5th column of a SAM/BAM file and its usage depends on the software + used to map the reads. + info: + step: htseq + orig_arg: --minaqual + - name: --type + type: string + alternatives: [-t] + example: exon + description: "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + step: htseq + orig_arg: --type + - name: --id_attribute + type: string + alternatives: [-i] + example: gene_id + description: | + GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id). + All feature of the right type (see -t option) within the same GTF attribute will be added + together. The typical way of using this option is to count all exonic reads from each gene + and add the exons but other uses are possible as well. You can call this option multiple + times: in that case, the combination of all attributes separated by colons (:) will be used + as a unique identifier, e.g. for exons you might use -i gene_id -i exon_number. + multiple: true + info: + step: htseq + orig_arg: --idattr + - name: --additional_attributes + type: string + example: gene_name + description: | + Additional feature attributes (suitable for Ensembl GTF files: gene_name). Use multiple times + for more than one additional attribute. These attributes are only used as annotations in the + output, while the determination of how the counts are added together is done based on option -i. + multiple: true + info: + step: htseq + orig_arg: --additional-attr + - name: --add_chromosome_info + type: boolean_true + description: | + Store information about the chromosome of each feature as an additional attribute + (e.g. colunm in the TSV output file). + info: + step: htseq + orig_arg: --add-chromosome-info + - name: --mode + type: string + alternatives: [-m] + choices: [union, intersection-strict, intersection-nonempty] + default: union + description: Mode to handle reads overlapping more than one feature. + info: + step: htseq + orig_arg: --mode + - name: --non_unique + type: string + choices: [none, all, fraction, random] + default: none + description: Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features. + info: + step: htseq + orig_arg: --nonunique + - name: --secondary_alignments + type: string + choices: [score, ignore] + description: Whether to score secondary alignments (0x100 flag). + info: + step: htseq + orig_arg: --secondary-alignments + - name: --supplementary_alignments + type: string + choices: [score, ignore] + description: Whether to score supplementary alignments (0x800 flag). + info: + step: htseq + orig_arg: --supplementary-alignments + - name: --counts_output_sparse + type: boolean_true + description: Store the counts as a sparse matrix (mtx, h5ad, loom). + info: + step: htseq + orig_arg: --counts-output-sparse \ No newline at end of file diff --git a/src/mapping/multi_star/arguments_star.yaml b/src/mapping/multi_star/arguments_star.yaml new file mode 100644 index 00000000..dc06c410 --- /dev/null +++ b/src/mapping/multi_star/arguments_star.yaml @@ -0,0 +1,1728 @@ +argument_groups: +- name: Run Parameters + arguments: + - name: --runRNGseed + type: integer + description: random number generator seed. + info: + step: star + orig_arg: --runRNGseed + example: 777 +- name: Genome Parameters + arguments: + - name: --genomeFastaFiles + type: file + description: |- + path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + info: + step: star + orig_arg: --genomeFastaFiles + multiple: yes +- name: Splice Junctions Database + arguments: + - name: --sjdbFileChrStartEnd + type: string + description: path to the files with genomic coordinates (chr start + end strand) for the splice junction introns. Multiple files can be supplied + and will be concatenated. + info: + step: star + orig_arg: --sjdbFileChrStartEnd + multiple: yes + - name: --sjdbGTFfile + type: file + description: path to the GTF file with annotations + info: + step: star + orig_arg: --sjdbGTFfile + - name: --sjdbGTFchrPrefix + type: string + description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL + annotations with UCSC genomes) + info: + step: star + orig_arg: --sjdbGTFchrPrefix + - name: --sjdbGTFfeatureExon + type: string + description: feature type in GTF file to be used as exons for building transcripts + info: + step: star + orig_arg: --sjdbGTFfeatureExon + example: exon + - name: --sjdbGTFtagExonParentTranscript + type: string + description: GTF attribute name for parent transcript ID (default "transcript_id" + works for GTF files) + info: + step: star + orig_arg: --sjdbGTFtagExonParentTranscript + example: transcript_id + - name: --sjdbGTFtagExonParentGene + type: string + description: GTF attribute name for parent gene ID (default "gene_id" works for + GTF files) + info: + step: star + orig_arg: --sjdbGTFtagExonParentGene + example: gene_id + - name: --sjdbGTFtagExonParentGeneName + type: string + description: GTF attribute name for parent gene name + info: + step: star + orig_arg: --sjdbGTFtagExonParentGeneName + example: gene_name + multiple: yes + - name: --sjdbGTFtagExonParentGeneType + type: string + description: GTF attribute name for parent gene type + info: + step: star + orig_arg: --sjdbGTFtagExonParentGeneType + example: + - gene_type + - gene_biotype + multiple: yes + - name: --sjdbOverhang + type: integer + description: length of the donor/acceptor sequence on each side of the junctions, + ideally = (mate_length - 1) + info: + step: star + orig_arg: --sjdbOverhang + example: 100 + - name: --sjdbScore + type: integer + description: extra alignment score for alignments that cross database junctions + info: + step: star + orig_arg: --sjdbScore + example: 2 + - name: --sjdbInsertSave + type: string + description: |- + which files to save when sjdb junctions are inserted on the fly at the mapping step + + - Basic ... only small junction / transcript files + - All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + info: + step: star + orig_arg: --sjdbInsertSave + example: Basic +- name: Variation parameters + arguments: + - name: --varVCFfile + type: string + description: path to the VCF file that contains variation data. The 10th column + should contain the genotype information, e.g. 0/1 + info: + step: star + orig_arg: --varVCFfile +- name: Read Parameters + arguments: + - name: --readFilesType + type: string + description: |- + format of input read files + + - Fastx ... FASTA or FASTQ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + - SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + info: + step: star + orig_arg: --readFilesType + example: Fastx + - name: --readFilesSAMattrKeep + type: string + description: |- + for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + + - All ... keep all tags + - None ... do not keep any tags + info: + step: star + orig_arg: --readFilesSAMattrKeep + example: All + multiple: yes + - name: --readFilesManifest + type: file + description: |- + path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + info: + step: star + orig_arg: --readFilesManifest + - name: --readFilesPrefix + type: string + description: prefix for the read files names, i.e. it will be added in front of + the strings in --readFilesIn + info: + step: star + orig_arg: --readFilesPrefix + - name: --readFilesCommand + type: string + description: |- + command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + info: + step: star + orig_arg: --readFilesCommand + multiple: yes + - name: --readMapNumber + type: integer + description: |- + number of reads to map from the beginning of the file + + -1: map all reads + info: + step: star + orig_arg: --readMapNumber + example: -1 + - name: --readMatesLengthsIn + type: string + description: Equal/NotEqual - lengths of names,sequences,qualities for both mates + are the same / not the same. NotEqual is safe in all situations. + info: + step: star + orig_arg: --readMatesLengthsIn + example: NotEqual + - name: --readNameSeparator + type: string + description: character(s) separating the part of the read names that will be trimmed + in output (read name after space is always trimmed) + info: + step: star + orig_arg: --readNameSeparator + example: / + multiple: yes + - name: --readQualityScoreBase + type: integer + description: number to be subtracted from the ASCII code to get Phred quality + score + info: + step: star + orig_arg: --readQualityScoreBase + example: 33 +- name: Read Clipping + arguments: + - name: --clipAdapterType + type: string + description: |- + adapter clipping type + + - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal + - None ... no adapter clipping, all other clip* parameters are disregarded + info: + step: star + orig_arg: --clipAdapterType + example: Hamming + - name: --clip3pNbases + type: integer + description: number(s) of bases to clip from 3p of each mate. If one value is + given, it will be assumed the same for both mates. + info: + step: star + orig_arg: --clip3pNbases + example: 0 + multiple: yes + - name: --clip3pAdapterSeq + type: string + description: |- + adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + + - polyA ... polyA sequence with the length equal to read length + info: + step: star + orig_arg: --clip3pAdapterSeq + multiple: yes + - name: --clip3pAdapterMMp + type: double + description: max proportion of mismatches for 3p adapter clipping for each mate. If + one value is given, it will be assumed the same for both mates. + info: + step: star + orig_arg: --clip3pAdapterMMp + example: 0.1 + multiple: yes + - name: --clip3pAfterAdapterNbases + type: integer + description: number of bases to clip from 3p of each mate after the adapter clipping. + If one value is given, it will be assumed the same for both mates. + info: + step: star + orig_arg: --clip3pAfterAdapterNbases + example: 0 + multiple: yes + - name: --clip5pNbases + type: integer + description: number(s) of bases to clip from 5p of each mate. If one value is + given, it will be assumed the same for both mates. + info: + step: star + orig_arg: --clip5pNbases + example: 0 + multiple: yes +- name: Limits + arguments: + - name: --limitGenomeGenerateRAM + type: long + description: maximum available RAM (bytes) for genome generation + info: + step: star + orig_arg: --limitGenomeGenerateRAM + example: '31000000000' + - name: --limitIObufferSize + type: long + description: max available buffers size (bytes) for input/output, per thread + info: + step: star + orig_arg: --limitIObufferSize + example: + - 30000000 + - 50000000 + multiple: yes + - name: --limitOutSAMoneReadBytes + type: long + description: 'max size of the SAM record (bytes) for one read. Recommended value: + >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax' + info: + step: star + orig_arg: --limitOutSAMoneReadBytes + example: 100000 + - name: --limitOutSJoneRead + type: integer + description: max number of junctions for one read (including all multi-mappers) + info: + step: star + orig_arg: --limitOutSJoneRead + example: 1000 + - name: --limitOutSJcollapsed + type: integer + description: max number of collapsed junctions + info: + step: star + orig_arg: --limitOutSJcollapsed + example: 1000000 + - name: --limitBAMsortRAM + type: long + description: maximum available RAM (bytes) for sorting BAM. If =0, it will be + set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory + option. + info: + step: star + orig_arg: --limitBAMsortRAM + example: 0 + - name: --limitSjdbInsertNsj + type: integer + description: maximum number of junctions to be inserted to the genome on the fly + at the mapping stage, including those from annotations and those detected in + the 1st step of the 2-pass run + info: + step: star + orig_arg: --limitSjdbInsertNsj + example: 1000000 + - name: --limitNreadsSoft + type: integer + description: soft limit on the number of reads + info: + step: star + orig_arg: --limitNreadsSoft + example: -1 +- name: 'Output: general' + arguments: + - name: --outTmpKeep + type: string + description: |- + whether to keep the temporary files after STAR runs is finished + + - None ... remove all temporary files + - All ... keep all files + info: + step: star + orig_arg: --outTmpKeep + - name: --outStd + type: string + description: |- + which output will be directed to stdout (standard out) + + - Log ... log messages + - SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + info: + step: star + orig_arg: --outStd + example: Log + - name: --outReadsUnmapped + type: string + description: |- + output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + + - None ... no output + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + info: + step: star + orig_arg: --outReadsUnmapped + - name: --outQSconversionAdd + type: integer + description: add this number to the quality score (e.g. to convert from Illumina + to Sanger, use -31) + info: + step: star + orig_arg: --outQSconversionAdd + example: 0 + - name: --outMultimapperOrder + type: string + description: |- + order of multimapping alignments in the output files + + - Old_2.4 ... quasi-random order used before 2.5.0 + - Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + info: + step: star + orig_arg: --outMultimapperOrder + example: Old_2.4 +- name: 'Output: SAM and BAM' + arguments: + - name: --outSAMmode + type: string + description: |- + mode of SAM output + + - None ... no SAM output + - Full ... full SAM output + - NoQS ... full SAM but without quality scores + info: + step: star + orig_arg: --outSAMmode + example: Full + - name: --outSAMstrandField + type: string + description: |- + Cufflinks-like strand field flag + + - None ... not used + - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + info: + step: star + orig_arg: --outSAMstrandField + - name: --outSAMattributes + type: string + description: |- + a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + + ***Presets: + - None ... no attributes + - Standard ... NH HI AS nM + - All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + - NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + - HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + - AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + - nM ... number of mismatches. For PE reads, sum over two mates. + - NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + - MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + - jI ... start and end of introns for all junctions (1-based). + - XS ... alignment strand according to --outSAMstrandField. + - MC ... mate's CIGAR string. Standard SAM tag. + - ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + - cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + - vA ... variant allele + - vG ... genomic coordinate of the variant overlapped by the read. + - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ***STARsolo: + - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + - GX GN ... gene ID and gene name for unique-gene reads. + - gx gn ... gene IDs and gene names for unique- and multi-gene reads. + - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + - sM ... assessment of CB and UMI. + - sS ... sequence of the entire barcode (CB,UMI,adapter). + - sQ ... quality of the entire barcode. + ***Unsupported/undocumented: + - ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + - rB ... alignment block read/genomic coordinates. + - vR ... read coordinate of the variant. + info: + step: star + orig_arg: --outSAMattributes + example: Standard + multiple: yes + - name: --outSAMattrIHstart + type: integer + description: start value for the IH attribute. 0 may be required by some downstream + software, such as Cufflinks or StringTie. + info: + step: star + orig_arg: --outSAMattrIHstart + example: 1 + - name: --outSAMunmapped + type: string + description: |- + output of unmapped reads in the SAM format + + 1st word: + - None ... no output + - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + info: + step: star + orig_arg: --outSAMunmapped + multiple: yes + - name: --outSAMorder + type: string + description: |- + type of sorting for the SAM output + + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + info: + step: star + orig_arg: --outSAMorder + example: Paired + - name: --outSAMprimaryFlag + type: string + description: |- + which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + + - OneBestScore ... only one alignment with the best score is primary + - AllBestScore ... all alignments with the best score are primary + info: + step: star + orig_arg: --outSAMprimaryFlag + example: OneBestScore + - name: --outSAMreadID + type: string + description: |- + read ID record type + + - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + - Number ... read number (index) in the FASTx file + info: + step: star + orig_arg: --outSAMreadID + example: Standard + - name: --outSAMmapqUnique + type: integer + description: '0 to 255: the MAPQ value for unique mappers' + info: + step: star + orig_arg: --outSAMmapqUnique + example: 255 + - name: --outSAMflagOR + type: integer + description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e. + FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, + and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.' + info: + step: star + orig_arg: --outSAMflagOR + example: 0 + - name: --outSAMflagAND + type: integer + description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e. + FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, + but before outSAMflagOR. Can be used to unset specific bits that are not set + otherwise.' + info: + step: star + orig_arg: --outSAMflagAND + example: 65535 + - name: --outSAMattrRGline + type: string + description: |- + SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + info: + step: star + orig_arg: --outSAMattrRGline + multiple: yes + - name: --outSAMheaderHD + type: string + description: '@HD (header) line of the SAM header' + info: + step: star + orig_arg: --outSAMheaderHD + multiple: yes + - name: --outSAMheaderPG + type: string + description: extra @PG (software) line of the SAM header (in addition to STAR) + info: + step: star + orig_arg: --outSAMheaderPG + multiple: yes + - name: --outSAMheaderCommentFile + type: string + description: path to the file with @CO (comment) lines of the SAM header + info: + step: star + orig_arg: --outSAMheaderCommentFile + - name: --outSAMfilter + type: string + description: |- + filter the output into main SAM/BAM files + + - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + info: + step: star + orig_arg: --outSAMfilter + multiple: yes + - name: --outSAMmultNmax + type: integer + description: |- + max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + + - -1 ... all alignments (up to --outFilterMultimapNmax) will be output + info: + step: star + orig_arg: --outSAMmultNmax + example: -1 + - name: --outSAMtlen + type: integer + description: |- + calculation method for the TLEN field in the SAM/BAM files + + - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + info: + step: star + orig_arg: --outSAMtlen + example: 1 + - name: --outBAMcompression + type: integer + description: -1 to 10 BAM compression level, -1=default compression (6?), 0=no + compression, 10=maximum compression + info: + step: star + orig_arg: --outBAMcompression + example: 1 + - name: --outBAMsortingThreadN + type: integer + description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).' + info: + step: star + orig_arg: --outBAMsortingThreadN + example: 0 + - name: --outBAMsortingBinsN + type: integer + description: '>0: number of genome bins for coordinate-sorting' + info: + step: star + orig_arg: --outBAMsortingBinsN + example: 50 +- name: BAM processing + arguments: + - name: --bamRemoveDuplicatesType + type: string + description: |- + mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + + - - ... no duplicate removal/marking + - UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + info: + step: star + orig_arg: --bamRemoveDuplicatesType + - name: --bamRemoveDuplicatesMate2basesN + type: integer + description: number of bases from the 5' of mate 2 to use in collapsing (e.g. + for RAMPAGE) + info: + step: star + orig_arg: --bamRemoveDuplicatesMate2basesN + example: 0 +- name: Output Wiggle + arguments: + - name: --outWigType + type: string + description: |- + type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + + 1st word: + - None ... no signal output + - bedGraph ... bedGraph format + - wiggle ... wiggle format + 2nd word: + - read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + - read2 ... signal from only 2nd read + info: + step: star + orig_arg: --outWigType + multiple: yes + - name: --outWigStrand + type: string + description: |- + strandedness of wiggle/bedGraph output + + - Stranded ... separate strands, str1 and str2 + - Unstranded ... collapsed strands + info: + step: star + orig_arg: --outWigStrand + example: Stranded + - name: --outWigReferencesPrefix + type: string + description: prefix matching reference names to include in the output wiggle file, + e.g. "chr", default "-" - include all references + info: + step: star + orig_arg: --outWigReferencesPrefix + - name: --outWigNorm + type: string + description: |- + type of normalization for the signal + + - RPM ... reads per million of mapped reads + - None ... no normalization, "raw" counts + info: + step: star + orig_arg: --outWigNorm + example: RPM +- name: Output Filtering + arguments: + - name: --outFilterType + type: string + description: |- + type of filtering + + - Normal ... standard filtering using only current alignment + - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + info: + step: star + orig_arg: --outFilterType + example: Normal + - name: --outFilterMultimapScoreRange + type: integer + description: the score range below the maximum score for multimapping alignments + info: + step: star + orig_arg: --outFilterMultimapScoreRange + example: 1 + - name: --outFilterMultimapNmax + type: integer + description: |- + maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + info: + step: star + orig_arg: --outFilterMultimapNmax + example: 10 + - name: --outFilterMismatchNmax + type: integer + description: alignment will be output only if it has no more mismatches than this + value. + info: + step: star + orig_arg: --outFilterMismatchNmax + example: 10 + - name: --outFilterMismatchNoverLmax + type: double + description: alignment will be output only if its ratio of mismatches to *mapped* + length is less than or equal to this value. + info: + step: star + orig_arg: --outFilterMismatchNoverLmax + example: 0.3 + - name: --outFilterMismatchNoverReadLmax + type: double + description: alignment will be output only if its ratio of mismatches to *read* + length is less than or equal to this value. + info: + step: star + orig_arg: --outFilterMismatchNoverReadLmax + example: 1.0 + - name: --outFilterScoreMin + type: integer + description: alignment will be output only if its score is higher than or equal + to this value. + info: + step: star + orig_arg: --outFilterScoreMin + example: 0 + - name: --outFilterScoreMinOverLread + type: double + description: same as outFilterScoreMin, but normalized to read length (sum of + mates' lengths for paired-end reads) + info: + step: star + orig_arg: --outFilterScoreMinOverLread + example: 0.66 + - name: --outFilterMatchNmin + type: integer + description: alignment will be output only if the number of matched bases is higher + than or equal to this value. + info: + step: star + orig_arg: --outFilterMatchNmin + example: 0 + - name: --outFilterMatchNminOverLread + type: double + description: sam as outFilterMatchNmin, but normalized to the read length (sum + of mates' lengths for paired-end reads). + info: + step: star + orig_arg: --outFilterMatchNminOverLread + example: 0.66 + - name: --outFilterIntronMotifs + type: string + description: |- + filter alignment using their motifs + + - None ... no filtering + - RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + - RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + info: + step: star + orig_arg: --outFilterIntronMotifs + - name: --outFilterIntronStrands + type: string + description: |- + filter alignments + + - RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + - None ... no filtering + info: + step: star + orig_arg: --outFilterIntronStrands + example: RemoveInconsistentStrands +- name: Output splice junctions (SJ.out.tab) + arguments: + - name: --outSJtype + type: string + description: |- + type of splice junction output + + - Standard ... standard SJ.out.tab output + - None ... no splice junction output + info: + step: star + orig_arg: --outSJtype + example: Standard +- name: 'Output Filtering: Splice Junctions' + arguments: + - name: --outSJfilterReads + type: string + description: |- + which reads to consider for collapsed splice junctions output + + - All ... all reads, unique- and multi-mappers + - Unique ... uniquely mapping reads only + info: + step: star + orig_arg: --outSJfilterReads + example: All + - name: --outSJfilterOverhangMin + type: integer + description: |- + minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + does not apply to annotated junctions + info: + step: star + orig_arg: --outSJfilterOverhangMin + example: + - 30 + - 12 + - 12 + - 12 + multiple: yes + - name: --outSJfilterCountUniqueMin + type: integer + description: |- + minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + info: + step: star + orig_arg: --outSJfilterCountUniqueMin + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + - name: --outSJfilterCountTotalMin + type: integer + description: |- + minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + info: + step: star + orig_arg: --outSJfilterCountTotalMin + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + - name: --outSJfilterDistToOtherSJmin + type: integer + description: |- + minimum allowed distance to other junctions' donor/acceptor + + does not apply to annotated junctions + info: + step: star + orig_arg: --outSJfilterDistToOtherSJmin + example: + - 10 + - 0 + - 5 + - 10 + multiple: yes + - name: --outSJfilterIntronMaxVsReadN + type: integer + description: |- + maximum gap allowed for junctions supported by 1,2,3,,,N reads + + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + info: + step: star + orig_arg: --outSJfilterIntronMaxVsReadN + example: + - 50000 + - 100000 + - 200000 + multiple: yes +- name: Scoring + arguments: + - name: --scoreGap + type: integer + description: splice junction penalty (independent on intron motif) + info: + step: star + orig_arg: --scoreGap + example: 0 + - name: --scoreGapNoncan + type: integer + description: non-canonical junction penalty (in addition to scoreGap) + info: + step: star + orig_arg: --scoreGapNoncan + example: -8 + - name: --scoreGapGCAG + type: integer + description: GC/AG and CT/GC junction penalty (in addition to scoreGap) + info: + step: star + orig_arg: --scoreGapGCAG + example: -4 + - name: --scoreGapATAC + type: integer + description: AT/AC and GT/AT junction penalty (in addition to scoreGap) + info: + step: star + orig_arg: --scoreGapATAC + example: -8 + - name: --scoreGenomicLengthLog2scale + type: integer + description: 'extra score logarithmically scaled with genomic length of the alignment: + scoreGenomicLengthLog2scale*log2(genomicLength)' + info: + step: star + orig_arg: --scoreGenomicLengthLog2scale + example: 0 + - name: --scoreDelOpen + type: integer + description: deletion open penalty + info: + step: star + orig_arg: --scoreDelOpen + example: -2 + - name: --scoreDelBase + type: integer + description: deletion extension penalty per base (in addition to scoreDelOpen) + info: + step: star + orig_arg: --scoreDelBase + example: -2 + - name: --scoreInsOpen + type: integer + description: insertion open penalty + info: + step: star + orig_arg: --scoreInsOpen + example: -2 + - name: --scoreInsBase + type: integer + description: insertion extension penalty per base (in addition to scoreInsOpen) + info: + step: star + orig_arg: --scoreInsBase + example: -2 + - name: --scoreStitchSJshift + type: integer + description: maximum score reduction while searching for SJ boundaries in the + stitching step + info: + step: star + orig_arg: --scoreStitchSJshift + example: 1 +- name: Alignments and Seeding + arguments: + - name: --seedSearchStartLmax + type: integer + description: defines the search start point through the read - the read is split + into pieces no longer than this value + info: + step: star + orig_arg: --seedSearchStartLmax + example: 50 + - name: --seedSearchStartLmaxOverLread + type: double + description: seedSearchStartLmax normalized to read length (sum of mates' lengths + for paired-end reads) + info: + step: star + orig_arg: --seedSearchStartLmaxOverLread + example: 1.0 + - name: --seedSearchLmax + type: integer + description: defines the maximum length of the seeds, if =0 seed length is not + limited + info: + step: star + orig_arg: --seedSearchLmax + example: 0 + - name: --seedMultimapNmax + type: integer + description: only pieces that map fewer than this value are utilized in the stitching + procedure + info: + step: star + orig_arg: --seedMultimapNmax + example: 10000 + - name: --seedPerReadNmax + type: integer + description: max number of seeds per read + info: + step: star + orig_arg: --seedPerReadNmax + example: 1000 + - name: --seedPerWindowNmax + type: integer + description: max number of seeds per window + info: + step: star + orig_arg: --seedPerWindowNmax + example: 50 + - name: --seedNoneLociPerWindow + type: integer + description: max number of one seed loci per window + info: + step: star + orig_arg: --seedNoneLociPerWindow + example: 10 + - name: --seedSplitMin + type: integer + description: min length of the seed sequences split by Ns or mate gap + info: + step: star + orig_arg: --seedSplitMin + example: 12 + - name: --seedMapMin + type: integer + description: min length of seeds to be mapped + info: + step: star + orig_arg: --seedMapMin + example: 5 + - name: --alignIntronMin + type: integer + description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, + otherwise it is considered Deletion + info: + step: star + orig_arg: --alignIntronMin + example: 21 + - name: --alignIntronMax + type: integer + description: maximum intron size, if 0, max intron size will be determined by + (2^winBinNbits)*winAnchorDistNbins + info: + step: star + orig_arg: --alignIntronMax + example: 0 + - name: --alignMatesGapMax + type: integer + description: maximum gap between two mates, if 0, max intron gap will be determined + by (2^winBinNbits)*winAnchorDistNbins + info: + step: star + orig_arg: --alignMatesGapMax + example: 0 + - name: --alignSJoverhangMin + type: integer + description: minimum overhang (i.e. block size) for spliced alignments + info: + step: star + orig_arg: --alignSJoverhangMin + example: 5 + - name: --alignSJstitchMismatchNmax + type: integer + description: |- + maximum number of mismatches for stitching of the splice junctions (-1: no limit). + + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + info: + step: star + orig_arg: --alignSJstitchMismatchNmax + example: + - 0 + - -1 + - 0 + - 0 + multiple: yes + - name: --alignSJDBoverhangMin + type: integer + description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + info: + step: star + orig_arg: --alignSJDBoverhangMin + example: 3 + - name: --alignSplicedMateMapLmin + type: integer + description: minimum mapped length for a read mate that is spliced + info: + step: star + orig_arg: --alignSplicedMateMapLmin + example: 0 + - name: --alignSplicedMateMapLminOverLmate + type: double + description: alignSplicedMateMapLmin normalized to mate length + info: + step: star + orig_arg: --alignSplicedMateMapLminOverLmate + example: 0.66 + - name: --alignWindowsPerReadNmax + type: integer + description: max number of windows per read + info: + step: star + orig_arg: --alignWindowsPerReadNmax + example: 10000 + - name: --alignTranscriptsPerWindowNmax + type: integer + description: max number of transcripts per window + info: + step: star + orig_arg: --alignTranscriptsPerWindowNmax + example: 100 + - name: --alignTranscriptsPerReadNmax + type: integer + description: max number of different alignments per read to consider + info: + step: star + orig_arg: --alignTranscriptsPerReadNmax + example: 10000 + - name: --alignEndsType + type: string + description: |- + type of read ends alignment + + - Local ... standard local alignment with soft-clipping allowed + - EndToEnd ... force end-to-end read alignment, do not soft-clip + - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + info: + step: star + orig_arg: --alignEndsType + example: Local + - name: --alignEndsProtrude + type: string + description: |- + allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + - ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + - DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + info: + step: star + orig_arg: --alignEndsProtrude + example: 0 ConcordantPair + - name: --alignSoftClipAtReferenceEnds + type: string + description: |- + allow the soft-clipping of the alignments past the end of the chromosomes + + - Yes ... allow + - No ... prohibit, useful for compatibility with Cufflinks + info: + step: star + orig_arg: --alignSoftClipAtReferenceEnds + example: 'Yes' + - name: --alignInsertionFlush + type: string + description: |- + how to flush ambiguous insertion positions + + - None ... insertions are not flushed + - Right ... insertions are flushed to the right + info: + step: star + orig_arg: --alignInsertionFlush +- name: Paired-End reads + arguments: + - name: --peOverlapNbasesMin + type: integer + description: minimum number of overlapping bases to trigger mates merging and + realignment. Specify >0 value to switch on the "merginf of overlapping mates" + algorithm. + info: + step: star + orig_arg: --peOverlapNbasesMin + example: 0 + - name: --peOverlapMMp + type: double + description: maximum proportion of mismatched bases in the overlap area + info: + step: star + orig_arg: --peOverlapMMp + example: 0.01 +- name: Windows, Anchors, Binning + arguments: + - name: --winAnchorMultimapNmax + type: integer + description: max number of loci anchors are allowed to map to + info: + step: star + orig_arg: --winAnchorMultimapNmax + example: 50 + - name: --winBinNbits + type: integer + description: =log2(winBin), where winBin is the size of the bin for the windows/clustering, + each window will occupy an integer number of bins. + info: + step: star + orig_arg: --winBinNbits + example: 16 + - name: --winAnchorDistNbins + type: integer + description: max number of bins between two anchors that allows aggregation of + anchors into one window + info: + step: star + orig_arg: --winAnchorDistNbins + example: 9 + - name: --winFlankNbins + type: integer + description: log2(winFlank), where win Flank is the size of the left and right + flanking regions for each window + info: + step: star + orig_arg: --winFlankNbins + example: 4 + - name: --winReadCoverageRelativeMin + type: double + description: minimum relative coverage of the read sequence by the seeds in a + window, for STARlong algorithm only. + info: + step: star + orig_arg: --winReadCoverageRelativeMin + example: 0.5 + - name: --winReadCoverageBasesMin + type: integer + description: minimum number of bases covered by the seeds in a window , for STARlong + algorithm only. + info: + step: star + orig_arg: --winReadCoverageBasesMin + example: 0 +- name: Chimeric Alignments + arguments: + - name: --chimOutType + type: string + description: |- + type of chimeric output + + - Junctions ... Chimeric.out.junction + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + - WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + info: + step: star + orig_arg: --chimOutType + example: Junctions + multiple: yes + - name: --chimSegmentMin + type: integer + description: minimum length of chimeric segment length, if ==0, no chimeric output + info: + step: star + orig_arg: --chimSegmentMin + example: 0 + - name: --chimScoreMin + type: integer + description: minimum total (summed) score of the chimeric segments + info: + step: star + orig_arg: --chimScoreMin + example: 0 + - name: --chimScoreDropMax + type: integer + description: max drop (difference) of chimeric score (the sum of scores of all + chimeric segments) from the read length + info: + step: star + orig_arg: --chimScoreDropMax + example: 20 + - name: --chimScoreSeparation + type: integer + description: minimum difference (separation) between the best chimeric score and + the next one + info: + step: star + orig_arg: --chimScoreSeparation + example: 10 + - name: --chimScoreJunctionNonGTAG + type: integer + description: penalty for a non-GT/AG chimeric junction + info: + step: star + orig_arg: --chimScoreJunctionNonGTAG + example: -1 + - name: --chimJunctionOverhangMin + type: integer + description: minimum overhang for a chimeric junction + info: + step: star + orig_arg: --chimJunctionOverhangMin + example: 20 + - name: --chimSegmentReadGapMax + type: integer + description: maximum gap in the read sequence between chimeric segments + info: + step: star + orig_arg: --chimSegmentReadGapMax + example: 0 + - name: --chimFilter + type: string + description: |- + different filters for chimeric alignments + + - None ... no filtering + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + info: + step: star + orig_arg: --chimFilter + example: banGenomicN + multiple: yes + - name: --chimMainSegmentMultNmax + type: integer + description: maximum number of multi-alignments for the main chimeric segment. + =1 will prohibit multimapping main segments. + info: + step: star + orig_arg: --chimMainSegmentMultNmax + example: 10 + - name: --chimMultimapNmax + type: integer + description: |- + maximum number of chimeric multi-alignments + + - 0 ... use the old scheme for chimeric detection which only considered unique alignments + info: + step: star + orig_arg: --chimMultimapNmax + example: 0 + - name: --chimMultimapScoreRange + type: integer + description: the score range for multi-mapping chimeras below the best chimeric + score. Only works with --chimMultimapNmax > 1 + info: + step: star + orig_arg: --chimMultimapScoreRange + example: 1 + - name: --chimNonchimScoreDropMin + type: integer + description: to trigger chimeric detection, the drop in the best non-chimeric + alignment score with respect to the read length has to be greater than this + value + info: + step: star + orig_arg: --chimNonchimScoreDropMin + example: 20 + - name: --chimOutJunctionFormat + type: integer + description: |- + formatting type for the Chimeric.out.junction file + + - 0 ... no comment lines/headers + - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + info: + step: star + orig_arg: --chimOutJunctionFormat + example: 0 +- name: Quantification of Annotations + arguments: + - name: --quantMode + type: string + description: |- + types of quantification requested + + - - ... none + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + - GeneCounts ... count reads per gene + info: + step: star + orig_arg: --quantMode + multiple: yes + - name: --quantTranscriptomeBAMcompression + type: integer + description: |- + -2 to 10 transcriptome BAM compression level + + - -2 ... no BAM output + - -1 ... default compression (6?) + - 0 ... no compression + - 10 ... maximum compression + info: + step: star + orig_arg: --quantTranscriptomeBAMcompression + example: 1 + - name: --quantTranscriptomeBan + type: string + description: |- + prohibit various alignment type + + - IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM + - Singleend ... prohibit single-end alignments + info: + step: star + orig_arg: --quantTranscriptomeBan + example: IndelSoftclipSingleend +- name: 2-pass Mapping + arguments: + - name: --twopassMode + type: string + description: |- + 2-pass mapping mode. + + - None ... 1-pass mapping + - Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + info: + step: star + orig_arg: --twopassMode + - name: --twopass1readsN + type: integer + description: number of reads to process for the 1st step. Use very large number + (or default -1) to map all reads in the first step. + info: + step: star + orig_arg: --twopass1readsN + example: -1 +- name: WASP parameters + arguments: + - name: --waspOutputMode + type: string + description: |- + WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . + + - SAMtag ... add WASP tags to the alignments that pass WASP filtering + info: + step: star + orig_arg: --waspOutputMode +- name: STARsolo (single cell RNA-seq) parameters + arguments: + - name: --soloType + type: string + description: |- + type of single-cell RNA-seq + + - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + info: + step: star + orig_arg: --soloType + multiple: yes + - name: --soloCBwhitelist + type: string + description: |- + file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + + - None ... no whitelist: all cell barcodes are allowed + info: + step: star + orig_arg: --soloCBwhitelist + multiple: yes + - name: --soloCBstart + type: integer + description: cell barcode start base + info: + step: star + orig_arg: --soloCBstart + example: 1 + - name: --soloCBlen + type: integer + description: cell barcode length + info: + step: star + orig_arg: --soloCBlen + example: 16 + - name: --soloUMIstart + type: integer + description: UMI start base + info: + step: star + orig_arg: --soloUMIstart + example: 17 + - name: --soloUMIlen + type: integer + description: UMI length + info: + step: star + orig_arg: --soloUMIlen + example: 10 + - name: --soloBarcodeReadLength + type: integer + description: |- + length of the barcode read + + - 1 ... equal to sum of soloCBlen+soloUMIlen + - 0 ... not defined, do not check + info: + step: star + orig_arg: --soloBarcodeReadLength + example: 1 + - name: --soloBarcodeMate + type: integer + description: |- + identifies which read mate contains the barcode (CB+UMI) sequence + + - 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + - 1 ... barcode sequence is a part of mate 1 + - 2 ... barcode sequence is a part of mate 2 + info: + step: star + orig_arg: --soloBarcodeMate + example: 0 + - name: --soloCBposition + type: string + description: |- + position of Cell Barcode(s) on the barcode read. + + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + info: + step: star + orig_arg: --soloCBposition + multiple: yes + - name: --soloUMIposition + type: string + description: |- + position of the UMI on the barcode read, same as soloCBposition + + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 3_9_3_14 + info: + step: star + orig_arg: --soloUMIposition + - name: --soloAdapterSequence + type: string + description: adapter sequence to anchor barcodes. Only one adapter sequence is + allowed. + info: + step: star + orig_arg: --soloAdapterSequence + - name: --soloAdapterMismatchesNmax + type: integer + description: maximum number of mismatches allowed in adapter sequence. + info: + step: star + orig_arg: --soloAdapterMismatchesNmax + example: 1 + - name: --soloCBmatchWLtype + type: string + description: |- + matching the Cell Barcodes to the WhiteList + + - Exact ... only exact matches allowed + - 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + - 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + - 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + - EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + info: + step: star + orig_arg: --soloCBmatchWLtype + example: 1MM_multi + - name: --soloInputSAMattrBarcodeSeq + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. + info: + step: star + orig_arg: --soloInputSAMattrBarcodeSeq + multiple: yes + - name: --soloInputSAMattrBarcodeQual + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + info: + step: star + orig_arg: --soloInputSAMattrBarcodeQual + multiple: yes + - name: --soloStrand + type: string + description: |- + strandedness of the solo libraries: + + - Unstranded ... no strand information + - Forward ... read strand same as the original RNA molecule + - Reverse ... read strand opposite to the original RNA molecule + info: + step: star + orig_arg: --soloStrand + example: Forward + - name: --soloFeatures + type: string + description: |- + genomic features for which the UMI counts per Cell Barcode are collected + + - Gene ... genes: reads match the gene transcript + - SJ ... splice junctions: reported in SJ.out.tab + - GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + info: + step: star + orig_arg: --soloFeatures + example: Gene + multiple: yes + - name: --soloMultiMappers + type: string + description: |- + counting method for reads mapping to multiple genes + + - Unique ... count only reads that map to unique genes + - Uniform ... uniformly distribute multi-genic UMIs to all genes + - Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + - PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + - EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + info: + step: star + orig_arg: --soloMultiMappers + example: Unique + multiple: yes + - name: --soloUMIdedup + type: string + description: |- + type of UMI deduplication (collapsing) algorithm + + - 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + - 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + - 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + - Exact ... only exactly matching UMIs are collapsed. + - NoDedup ... no deduplication of UMIs, count all reads. + - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + info: + step: star + orig_arg: --soloUMIdedup + example: 1MM_All + multiple: yes + - name: --soloUMIfiltering + type: string + description: |- + type of UMI filtering (for reads uniquely mapping to genes) + + - - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + - MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + - MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --soloUMIdedup 1MM_CR + info: + step: star + orig_arg: --soloUMIfiltering + multiple: yes + - name: --soloOutFileNames + type: string + description: |- + file names for STARsolo output: + + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + info: + step: star + orig_arg: --soloOutFileNames + example: + - Solo.out/ + - features.tsv + - barcodes.tsv + - matrix.mtx + multiple: yes + - name: --soloCellFilter + type: string + description: |- + cell filtering type and parameters + + - None ... do not output filtered cells + - TopCells ... only report top cells by UMI count, followed by the exact number of cells + - CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + info: + step: star + orig_arg: --soloCellFilter + example: + - CellRanger2.2 + - '3000' + - '0.99' + - '10' + multiple: yes + - name: --soloOutFormatFeaturesGeneField3 + type: string + description: field 3 in the Gene features.tsv file. If "-", then no 3rd field + is output. + info: + step: star + orig_arg: --soloOutFormatFeaturesGeneField3 + example: Gene Expression + multiple: yes + - name: --soloCellReadStats + type: string + description: |- + Output reads statistics for each CB + + - Standard ... standard output + info: + step: star + orig_arg: --soloCellReadStats diff --git a/src/mapping/multi_star/config.vsh.yaml b/src/mapping/multi_star/config.vsh.yaml new file mode 100644 index 00000000..099c4c65 --- /dev/null +++ b/src/mapping/multi_star/config.vsh.yaml @@ -0,0 +1,120 @@ +name: multi_star +namespace: mapping +scope: "public" +description: Align fastq files using STAR. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +# generated the argument groups using `utils/process_params.R` +__merge__: [., arguments_star.yaml, arguments_htseq.yaml] +# manually taking care of the main input files +argument_groups: + - name: Input/Output + arguments: + - type: string + name: --input_id + required: true + description: The ID of the sample being processed. This vector should have the same length as the `--input_r1` argument. + example: [ mysample, mysample ] + multiple: true + - type: file + name: --input_r1 + required: true + description: Paths to the sequences to be mapped. If using Illumina paired-end reads, only the R1 files should be passed. + example: [ mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L002_R1_001.fastq.gz ] + multiple: true + - type: file + name: --input_r2 + required: false + description: Paths to the sequences to be mapped. If using Illumina paired-end reads, only the R2 files should be passed. + example: [ mysample_S1_L001_R2_001.fastq.gz, mysample_S1_L002_R2_001.fastq.gz ] + multiple: true + - type: file + name: --reference_index + alternatives: --genomeDir + description: Path to the reference built by star_build_reference. Corresponds to the --genomeDir argument in the STAR command. + example: /path/to/reference + required: true + - type: file + name: --reference_gtf + description: Path to the gtf reference file. + example: genes.gtf + required: true + - type: file + name: --output + alternatives: --outFileNamePrefix + description: Path to output directory. Corresponds to the --outFileNamePrefix argument in the STAR command. + example: /path/to/foo + direction: output + required: true + - name: Processing arguments + arguments: + - type: boolean + name: --run_htseq_count + description: Whether or not to also run htseq-count after STAR. + default: true + - type: boolean + name: --run_multiqc + description: Whether or not to also run MultiQC at the end. + default: true + - type: double + name: --min_success_rate + default: 0.5 + description: Fail when the success rate is below this threshold. +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.10-slim + setup: + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.10b + - PACKAGES gcc g++ make wget zlib1g-dev unzip + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + - type: apt + packages: + - samtools + - procps + - type: python + packages: + - pyyaml + - HTSeq + - multiprocess + - gtfparse + - pandas + - numpy<2 + # Pin multiqc to avoid + # 'TypeError: expected str, bytes or os.PathLike object, not NoneType' + - multiqc~=1.15.0 + test_setup: + - type: python + packages: + - pytest +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/multi_star/script.py b/src/mapping/multi_star/script.py new file mode 100644 index 00000000..e80944f5 --- /dev/null +++ b/src/mapping/multi_star/script.py @@ -0,0 +1,464 @@ +from typing import Any, Dict, List, Tuple +import math +import tempfile +import subprocess +import tarfile +import gzip +import shutil +from pathlib import Path +import yaml +import pandas as pd +from multiprocess import Pool +import gtfparse +import polars as pl + +## VIASH START +par = { + "input_id": ["mysample1", "mysample2"], + "input_r1": [ + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz", + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz", + ], + "input_r2": [ + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz", + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz", + ], + "reference_index": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref_v2_7_10_a/", + "reference_gtf": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "test_output", +} +meta = { + "name": "star_and_htseq", + "cpus": 30, + "temp_dir": "/tmp", + "config": "src/mapping/multi_star/.config.vsh.yaml", +} +## VIASH END + +######################## +### Helper functions ### +######################## + + +def fetch_arguments_info(config: Dict[str, Any]) -> Dict[str, Any]: + """Fetch arguments from config""" + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + return arguments + + +def process_par( + par: Dict[str, Any], + arguments_info: Dict[str, Any], + gz_args: List[str], + temp_dir: Path, +) -> Dict[str, Any]: + """ + Process the Viash par dictionary + + This turns file strings into Path objects and extracting gzipped files if need be. + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by `fetch_arguments_info` + gz_args: A list of argument keys which could be gzip files which need to be decompressed. + temp_dir: A temporary directory in which to ungzip files + """ + new_par = {} + for key, value in par.items(): + arg_info = arguments_info[key] + # turn file arguments into paths + if value and arg_info["type"] == "file": + is_multiple = isinstance(value, list) + + if is_multiple: + value = [Path(val) for val in value] + else: + value = Path(value) + + if key in gz_args: + print(f">> Checking compression of --{key}", flush=True) + # turn value into list if need be + if not is_multiple: + value = [value] + + # extract + value = [extract_if_need_be(path, temp_dir) for path in value] + + # unlist if need be + if not is_multiple: + value = value[0] + + new_par[key] = value + return new_par + + +def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False): + """ + Generate command-line arguments by fetching the relevant args + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by `fetch_arguments_info` + step_filter: If provided,`par` will be filtered to only contain arguments for which + argument.info.step == step_filter. + flatten: If `False`, the command for an argument with multiple values will be + `["--key", "value1", "--key", "value2"]`, otherwise `["--key", "value1", "value2"]`. + """ + cmd_args = [] + + for key, arg in arguments_info.items(): + arg_val = par.get(key) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + step = info.get("step") + if arg_val and orig_arg and (not step_filter or step == step_filter): + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + if flatten: + arg_val = [str(x) for x in [orig_arg] + arg_val] + else: + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +def is_gz_file(path: Path) -> bool: + """Check whether something is a gzip""" + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + + +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + """if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path""" + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def load_star_reference(reference_index: str) -> None: + """Load star reference index into memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "LoadAndExit", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def unload_star_reference(reference_index: str) -> None: + """Remove star reference index from memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "Remove", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def star_and_htseq( + group_id: str, + r1_files: List[Path], + r2_files: List[Path], + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> Tuple[int, str]: + star_output = par["output"] / "per" / group_id + temp_dir_group = temp_dir / f"star_tmp_{group_id}" + unsorted_bam = star_output / "Aligned.out.bam" + sorted_bam = star_output / "Aligned.sorted.out.bam" + counts_file = star_output / "htseq-count.txt" + multiqc_path = star_output / "multiqc_data" + + print(f">> Running STAR for group '{group_id}' with command:", flush=True) + star_output.mkdir(parents=True, exist_ok=True) + temp_dir_group.parent.mkdir(parents=True, exist_ok=True) + run_star( + r1_files=r1_files, + r2_files=r2_files, + output_dir=star_output, + temp_dir=temp_dir / f"star_tmp_{group_id}", + par=par, + arguments_info=arguments_info, + num_threads=num_threads, + ) + if not unsorted_bam.exists(): + return (1, f"Could not find unsorted bam at '{unsorted_bam}'") + + if par["run_htseq_count"]: + print( + f">> Running samtools sort for group '{group_id}' with command:", flush=True + ) + run_samtools_sort(unsorted_bam, sorted_bam) + if not sorted_bam.exists(): + return (1, f"Could not find sorted bam at '{unsorted_bam}'") + + print( + f">> Running htseq-count for group '{group_id}' with command:", flush=True + ) + run_htseq_count(sorted_bam, counts_file, par, arguments_info) + if not counts_file.exists(): + return (1, f"Could not find counts at '{counts_file}'") + + if par["run_multiqc"]: + run_multiqc(star_output) + if not multiqc_path.exists(): + return (1, f"Could not find MultiQC output at '{multiqc_path}'") + + return (0, "") + + +def run_star( + r1_files: List[Path], + r2_files: List[Path], + output_dir: Path, + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> None: + """Run star""" + # process manual arguments + r1_pasted = [",".join([str(r1) for r1 in r1_files])] + r2_pasted = [",".join([str(r2) for r2 in r2_files])] if r2_files else [] + manual_par = { + "--genomeDir": [par["reference_index"]], + "--genomeLoad": ["LoadAndRemove"], + "--runThreadN": [str(num_threads)], + "--runMode": ["alignReads"], + "--readFilesIn": r1_pasted + r2_pasted, + # create a tempdir per group + "--outTmpDir": [temp_dir], + # make sure there is a trailing / + "--outFileNamePrefix": [f"{output_dir}/"], + # fix the outSAMtype to return unsorted BAM files + "--outSAMtype": ["BAM", "Unsorted"], + } + manual_cmd = [str(x) for key, values in manual_par.items() for x in [key] + values] + + # process all passthrough star arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "star", flatten=True) + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["STAR"] + manual_cmd + par_cmd] + + # run star + subprocess.run(cmd_args, check=True) + + +def run_samtools_sort(unsorted_bam: Path, sorted_bam: Path) -> None: + "Run samtools sort" + cmd_args = [ + "samtools", + "sort", + "-o", + sorted_bam, + unsorted_bam, + ] + subprocess.run(cmd_args, check=True) + + +def run_htseq_count( + sorted_bam: Path, + counts_file: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], +) -> None: + """Run HTSeq count""" + # process manual arguments + manual_cmd = [sorted_bam, par["reference_gtf"]] + + # process all passthrough htseq arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "htseq") + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["htseq-count"] + manual_cmd + par_cmd] + + # run htseq + with open(counts_file, "w", encoding="utf-8") as file: + subprocess.run(cmd_args, check=True, stdout=file) + + +def get_feature_info(reference_gtf) -> pd.DataFrame: + ref = gtfparse.read_gtf(reference_gtf) + ref_genes = ref.filter((pl.col("feature") == "gene") | (pl.col("source") == "ERCC")) + return pd.DataFrame( + { + "feature_id": pd.Index(ref_genes.get_column("gene_id")), + "feature_type": "Gene Expression", + "feature_name": ref_genes.get_column("gene_name").to_pandas(), + } + ) + + +def run_multiqc(input_dir: Path) -> None: + cmd_args = [ + "multiqc", + str(input_dir), + "--outdir", + str(input_dir), + "--no-report", + "--force", + ] + + # run multiqc + subprocess.run(cmd_args, check=True) + + +######################## +### Main code ### +######################## + + +def main(par, meta): + """Main function""" + + # check input arguments + assert len(par["input_id"]) == len(par["input_r1"]), ( + "--input_r1 should have same length as --input_id" + ) + if par["input_r2"]: + assert len(par["input_id"]) == len(par["input_r2"]), ( + "--input_r2 should have same length as --input_id" + ) + + # read config arguments + with open(meta["config"], "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + # fetch all arguments from the config and turn it into a Dict[str, Argument] + arguments_info = fetch_arguments_info(config) + + # temp_dir = "tmp/" + with tempfile.TemporaryDirectory( + prefix=f"{meta['name']}-", dir=meta["temp_dir"], ignore_cleanup_errors=True + ) as temp_dir: + temp_dir = Path(temp_dir) + temp_dir.mkdir(parents=True, exist_ok=True) + + # turn file strings into Paths and decompress gzip if need be + gz_args = ["input_r1", "input_r2", "reference_index", "reference_gtf"] + par = process_par(par, arguments_info, gz_args, temp_dir) + + # make sure input_r2 has same length as input_r1 + if not par["input_r2"]: + par["input_r2"] = [None for _ in par["input_r1"]] + + # group input_files by input_id + print(">> Group by --input_id", flush=True) + grouped_inputs = {} + for group_id, file_r1, file_r2 in zip( + par["input_id"], par["input_r1"], par["input_r2"] + ): + if group_id not in grouped_inputs: + grouped_inputs[group_id] = ([], []) + grouped_inputs[group_id][0].append(file_r1) + if file_r2: + grouped_inputs[group_id][1].append(file_r2) + + # create output dir if need be + par["output"].mkdir(parents=True, exist_ok=True) + + # store features metadata + feature_info = get_feature_info(str(par["reference_gtf"])) + with open(par["output"] / "feature_info.tsv", "w", encoding="utf-8") as file: + feature_info.to_csv(file, sep="\t", index=False) + + # try: + # print(">> Loading genome in memory", flush=True) + # load_star_reference(par["reference_index"]) + + cpus = meta.get("cpus", 1) + num_items = len(grouped_inputs) + pool_size = min(cpus, num_items) + num_threads_per_task = math.ceil(cpus / pool_size) + + with Pool(pool_size) as pool: + outs = pool.starmap( + lambda group_id, files: star_and_htseq( + group_id=group_id, + r1_files=files[0], + r2_files=files[1], + temp_dir=temp_dir, + par=par, + arguments_info=arguments_info, + num_threads=num_threads_per_task, + ), + grouped_inputs.items(), + ) + + num_errored = 0 + for exit, msg in outs: + if exit != 0: + print(f"Error: {msg}") + num_errored += 1 + + pct_succeeded = 1.0 - num_errored / len(outs) + print("------------------") + print(f"Success rate: {math.ceil(pct_succeeded * 100)}%") + + assert pct_succeeded >= par["min_success_rate"], ( + f"Success rate should be at least {math.ceil(par['min_success_rate'] * 100)}%" + ) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/mapping/multi_star/test.py b/src/mapping/multi_star/test.py new file mode 100644 index 00000000..80c99b22 --- /dev/null +++ b/src/mapping/multi_star/test.py @@ -0,0 +1,114 @@ +import subprocess +from pathlib import Path + +## VIASH START +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} +## VIASH END + +# find common input files +resources_dir = Path(meta["resources_dir"]) +input_dir = resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_fastq" +reference_index = ( + resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_ref_v2_7_10_a/" +) +reference_gtf = ( + resources_dir + / "cellranger_tiny_fastq" + / "cellranger_tiny_ref" + / "genes" + / "genes.gtf.gz" +) + + +def test_two_samples(): + input_id = ["mysample1", "mysample2"] + input_r1 = [ + input_dir / "tinygex_S1_L001_R1_001.fastq.gz", + input_dir / "tinygex_S1_L002_R1_001.fastq.gz", + ] + input_r2 = [ + input_dir / "tinygex_S1_L001_R2_001.fastq.gz", + input_dir / "tinygex_S1_L002_R2_001.fastq.gz", + ] + output = Path("test_output") + + cmd_pars = [ + meta["executable"], + "--input_id", + ";".join(input_id), + "--input_r1", + ";".join([str(r1) for r1 in input_r1]), + "--input_r2", + ";".join([str(r2) for r2 in input_r2]), + "--reference_index", + reference_index, + "--reference_gtf", + reference_gtf, + "--output", + output, + "---cpus", + "8", + "--outSAMattributes", + "NH;HI;NM;MD", + ] + subprocess.run([str(x) for x in cmd_pars], check=True) + + expected_files = [ + "Log.final.out", + "Aligned.out.bam", + "Aligned.sorted.out.bam", + "htseq-count.txt", + ] + for iid in input_id: + for expected_file in expected_files: + path = output / "per" / iid / expected_file + assert path.exists(), f"Required file '{path}' is missing" + + +def test_one_sample(): + input_id = ["mysample", "mysample"] + input_r1 = [ + input_dir / "tinygex_S1_L001_R1_001.fastq.gz", + input_dir / "tinygex_S1_L002_R1_001.fastq.gz", + ] + input_r2 = [ + input_dir / "tinygex_S1_L001_R2_001.fastq.gz", + input_dir / "tinygex_S1_L002_R2_001.fastq.gz", + ] + output = Path("test_output") + + cmd_pars = [ + meta["executable"], + "--input_id", + ";".join(input_id), + "--input_r1", + ";".join([str(r1) for r1 in input_r1]), + "--input_r2", + ";".join([str(r2) for r2 in input_r2]), + "--reference_index", + reference_index, + "--reference_gtf", + reference_gtf, + "--output", + output, + "---cpus", + "8", + ] + subprocess.run([str(x) for x in cmd_pars], check=True) + + assert (output / "feature_info.tsv").exists() + expected_files = [ + "Log.final.out", + "Aligned.out.bam", + "Aligned.sorted.out.bam", + "htseq-count.txt", + ] + for iid in input_id: + for expected_file in expected_files: + path = output / "per" / iid / expected_file + assert path.exists(), f"Required file '{path}' is missing" + + +if __name__ == "__main__": + test_two_samples() + test_one_sample() diff --git a/src/mapping/multi_star/utils/process_params.R b/src/mapping/multi_star/utils/process_params.R new file mode 100644 index 00000000..b2f05bb0 --- /dev/null +++ b/src/mapping/multi_star/utils/process_params.R @@ -0,0 +1,250 @@ +library(tidyverse) +library(magrittr) + +# until https://github.com/alexdobin/STAR/pull/1710 is merged +param_txt <- readr::read_lines("src/mapping/star_align/utils/parametersDefault") + +dev_begin <- grep("#####UnderDevelopment_begin", param_txt) +dev_end <- grep("#####UnderDevelopment_end", param_txt) + +# strip development sections +nondev_ix <- unlist(map2( + c(1, dev_end + 1), + c(dev_begin - 1, length(param_txt)), + function(i, j) { + if (i >= 1 && i < j) { + seq(i, j, 1) + } else { + NULL + } + } +)) + +param_txt2 <- param_txt[nondev_ix] + +# strip comments +param_txt3 <- param_txt2[-grep("^#[^#]", param_txt2)] + +# detect groups +group_ix <- grep("^### ", param_txt3) + +out <- map2_dfr( + group_ix, + c(group_ix[-1] - 1, length(param_txt3)), + function(group_start, group_end) { + group_name <- gsub("^### ", "", param_txt3[[group_start]]) + + group_txt <- param_txt3[seq(group_start + 1, group_end)] + + arg_ix <- grep("^[^ ]", group_txt) + + arguments <- map2_dfr( + arg_ix, + c(arg_ix[-1] - 1, length(group_txt)), + function(arg_start, arg_end) { + # process name and default + first_txt <- group_txt[[arg_start]] + first_regex <- "^([^ ]*) +(.*) *$" + if (!grepl(first_regex, first_txt)) { + stop("Line '", first_txt, "' did not match regex '", first_regex, "'") + } + name <- gsub(first_regex, "\\1", first_txt) + default <- gsub(first_regex, "\\2", first_txt) + + # process type and first description + second_txt <- group_txt[[arg_start + 1]] + second_regex <- "^ +([^:]*):[ ]+(.*)$" + if (!grepl(second_regex, second_txt)) { + stop( + "Line '", second_txt, "' did not match regex '", + second_regex, "'" + ) + } + type <- gsub(second_regex, "\\1", second_txt) + desc_start <- str_trim(gsub( + second_regex, "\\2", + second_txt + )) + + # process more description + desc_cont1 <- group_txt[seq(arg_start + 2, arg_end)] + + desc <- + if (sum(str_length(desc_cont1)) == 0) { + desc_start + } else { + # detect margin + margins <- str_extract(desc_cont1, "^( +)") %>% na.omit() + margin <- margins[[which.min(str_length(margins))]] + desc_cont2 <- gsub(paste0("^", margin), "", desc_cont1) + desc_cont3 <- ifelse(grepl("\\.\\.\\.", desc_cont2), + paste0("- ", desc_cont2), desc_cont2 + ) + desc_cont4 <- str_trim(desc_cont3) + + # construct desc + str_trim(paste0(c(desc_start, "", desc_cont4), "\n", collapse = "")) + } + + tibble( + group_name, + name, + default, + type, + description = desc + ) + } + ) + + arguments + } +) + +# todo: manually fix alignEndsProtrude? +# assigning types +type_map <- c( + "string" = "string", "int" = "integer", "real" = "double", + "double" = "double", "int, string" = "string" +) +file_args <- c( + "genomeDir", "readFilesIn", "sjdbGTFfile", + "genomeFastaFiles", "genomeChainFiles", "readFilesManifest" +) +long_args <- c( + "limitGenomeGenerateRAM", "limitIObufferSize", + "limitOutSAMoneReadBytes", "limitBAMsortRAM" +) +required_args <- c("genomeDir", "readFilesIn") + +# converting examples +as_safe_int <- function(x) { + tryCatch( + { + as.integer(x) + }, + warning = function(e) { + bit64::as.integer64(x) + } + ) +} +safe_split <- function(x) { + strsplit(x, "'[^']*'(*SKIP)(*F)|\"[^\"]*\"(*SKIP)(*F)|\\s+", + perl = TRUE + )[[1]] %>% + gsub("^[\"']|[\"']$", "", rlang::.data) +} +trafos <- list( + string = function(x) x, + integer = as_safe_int, + double = as.numeric, + strings = function(x) safe_split(x), + integers = function(x) sapply(safe_split(x), as_safe_int), + doubles = function(x) as.numeric(safe_split(x)) +) +# remove arguments that are not relevant for viash +removed_args <- c("versionGenome", "parametersFiles", "sysShell", "runDirPerm") +# these settings are defined by the viash component +manual_args <- c( + "runThreadN", "outTmpDir", "runMode", "outFileNamePrefix", "genomeDir", + "readFilesIn", "genomeTransformOutput", "genomeFileSizes", + "genomeChrSetMitochondrial", + + # only for star_and_htseq + "outSAMtype", "genomeLoad" +) + +# make viash-like values +out2 <- out %>% + # remove arguments that are not relevant for viash + filter(!name %in% c(removed_args, manual_args)) %>% + # remove arguments that are related to a different runmode + filter( + !grepl("--runMode", description) | + grepl("--runMode alignReads", description) + ) %>% + filter( + !grepl("--runMode", group_name) | + grepl("--runMode alignReads", group_name) + ) %>% + mutate( + viash_arg = paste0("--", name), + type_step1 = type %>% + str_replace_all( + ".*(int, string|string|int|real|double)\\(?(s?).*", + "\\1\\2" + ), + viash_type = type_map[gsub( + "(int, string|string|int|real|double).*", "\\1", + type_step1 + )], + multiple = type_step1 == "int, string" | + grepl("s$", type_step1) | grepl("^[4N][\\* ]", type), + default_step1 = default %>% + { + ifelse(. %in% c("-", "None"), NA_character_, .) + }, + viash_default = + mapply( + default_step1, + paste0(viash_type, ifelse(multiple, "s", "")), + FUN = function(str, typ) trafos[[typ]](str) + ), + # update type + viash_type = case_when( + name %in% long_args ~ "long", + name %in% file_args ~ "file", + TRUE ~ viash_type + ), + # turn longs into character because + # yaml::write_yaml doesn't handle longs well + viash_default = ifelse(sapply(viash_default, bit64::is.integer64), + map(viash_default, as.character), + viash_default + ), + group_name = gsub(" - .*", "", group_name), + required = ifelse(name %in% required_args, TRUE, NA) + ) +print(out2, n = 200) +out2 %>% + mutate(i = row_number()) %>% + select(-group_name, -description) + +out2 %>% + filter( + !grepl("--runMode", description) | + grepl("--runMode alignReads", description) + ) + +argument_groups <- map(unique(out2$group_name), function(group_name) { + args <- out2 %>% + filter(group_name == !!group_name) %>% + pmap(function(viash_arg, viash_type, multiple, + viash_default, description, required, ...) { + li <- lst( + name = viash_arg, + type = viash_type, + description = description, + info = list( + step = "star", + orig_arg = viash_arg + ) + ) + if (all(!is.na(viash_default))) { + li$example <- viash_default + } + if (!is.na(multiple) && multiple) { + li$multiple <- multiple + li$multiple_sep <- ";" + } + if (!is.na(required) && required) { + li$required <- required + } + li + }) + list(name = group_name, arguments = args) +}) + +yaml::write_yaml( + list(argument_groups = argument_groups), + "src/mapping/star_and_htseq/arguments_star.yaml" +) diff --git a/src/mapping/multi_star_to_h5mu/config.vsh.yaml b/src/mapping/multi_star_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..27b5835a --- /dev/null +++ b/src/mapping/multi_star_to_h5mu/config.vsh.yaml @@ -0,0 +1,48 @@ +name: multi_star_to_h5mu +namespace: mapping +scope: "public" +description: | + Convert the output of `multi_star` to a h5mu. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] +arguments: + - type: file + name: --input + required: true + description: The directory created by `multi_star` + example: /path/to/foo + - name: "--output" + alternatives: [-o] + direction: output + type: file + description: "Output h5mu file." + required: true + example: output.h5mu +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq/multi_star + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + +runners: +- type: executable +- type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/mapping/multi_star_to_h5mu/script.py b/src/mapping/multi_star_to_h5mu/script.py new file mode 100644 index 00000000..976eb735 --- /dev/null +++ b/src/mapping/multi_star_to_h5mu/script.py @@ -0,0 +1,76 @@ +from pathlib import Path +import pandas as pd +import mudata as md +import anndata as ad +import numpy as np +import json + +## VIASH START +par = {"input": "output/A2_raw", "output": "test_output.h5mu"} +meta = {"temp_dir": "/tmp"} +## VIASH END + +# convert to path +input_dir = Path(par["input"]) + +# read counts information +print("> Read counts data", flush=True) +per_obs_data = [] + +for input_counts in (input_dir / "per").glob("**/htseq-count.txt"): + per_obs_dir = input_counts.parent + input_id = per_obs_dir.name + input_multiqc = per_obs_dir / "multiqc_data" / "multiqc_data.json" + + data = pd.read_table( + input_counts, + index_col=0, + names=["cell_id", input_id], + dtype={"cell_id": "U", input_id: "i"}, + ) + data2 = data[~data.index.str.startswith("__")] + + with open(input_multiqc, "r") as file: + qc = json.load(file) + + qc_star = qc.get("report_saved_raw_data", {}).get("multiqc_star", {}).get(input_id) + qc_htseq = ( + qc.get("report_saved_raw_data", {}).get("multiqc_htseq", {}).get("htseq-count") + ) + + per_obs_data.append( + { + "counts": data2.transpose(), + "qc_star": pd.DataFrame(qc_star, index=[input_id]), + "qc_htseq": pd.DataFrame(qc_htseq, index=[input_id]), + } + ) + + +# combine all counts +counts = pd.concat([x["counts"] for x in per_obs_data], axis=0) +qc_star = pd.concat([x["qc_star"] for x in per_obs_data], axis=0) +qc_htseq = pd.concat([x["qc_htseq"] for x in per_obs_data], axis=0) + +# read feature info +feature_info = pd.read_csv(input_dir / "feature_info.tsv", sep="\t", index_col=0) +feature_info_ord = feature_info.loc[counts.columns] + +var = pd.DataFrame( + data={ + "gene_ids": feature_info_ord.index, + "feature_types": "Gene Expression", + "gene_name": feature_info_ord["feature_name"], + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData( + X=counts, obsm={"qc_star": qc_star, "qc_htseq": qc_htseq}, var=var, dtype=np.int32 +) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/mapping/multi_star_to_h5mu/test.py b/src/mapping/multi_star_to_h5mu/test.py new file mode 100644 index 00000000..26d4711c --- /dev/null +++ b/src/mapping/multi_star_to_h5mu/test.py @@ -0,0 +1,44 @@ +import subprocess +from pathlib import Path +import mudata as md + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +print("> Running command with folder", flush=True) +input = meta["resources_dir"] + "/multi_star/" +output = "test_output.h5mu" + +cmd_pars = [ + meta["executable"], + "--input", + str(input), + "--output", + output, + "---cpus", + "2", + "--output_compression", + "gzip", +] +subprocess.run(cmd_pars, check=True) + +print("> Check if file exists", flush=True) +output_path = Path(output) +assert output_path.is_file() + +print("> Check contents", flush=True) +mdata = md.read_h5mu(output) + +print(mdata, flush=True) +print("", flush=True) + +assert "rna" in mdata.mod, "MuData should contain RNA modality" +assert mdata.n_obs == 1, "MuData should only contain one observation" + +adata_rna = mdata.mod["rna"] +assert adata_rna.n_vars > 100, "RNA modality should contain at least 100 variables" +assert "qc_star" in adata_rna.obsm, "RNA modality should contain STAR QC" +assert "qc_htseq" in adata_rna.obsm, "RNA modality should contain htseq QC" + +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/samtools_sort/config.vsh.yaml b/src/mapping/samtools_sort/config.vsh.yaml new file mode 100644 index 00000000..08f09ecc --- /dev/null +++ b/src/mapping/samtools_sort/config.vsh.yaml @@ -0,0 +1,117 @@ +name: samtools_sort +namespace: mapping +scope: "public" +description: | + Sort and (optionally) index alignments. + + Reads are sorted by leftmost coordinates, or by read name when `--sort_by_read_names` is used. + + An appropriate `@HD-SO` sort order header tag will be added or an existing one updated if necessary. + + Note that to generate an index file (by specifying `--output_bai`), the default coordinate sort must be used. + Thus the `--sort_by_read_names` and `--sort_by ` options are incompatible with `--output_bai`. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] +argument_groups: + - name: Input + arguments: + - type: file + name: --input + required: true + description: Path to the SAM/BAM/CRAM files containing the mapped reads. + example: "input.bam" + info: + orig_arg: in_sam + - name: Output + arguments: + - type: file + name: --output_bam + description: Filename to output the counts to. + example: output.bam + direction: output + required: true + info: + orig_arg: -o + - type: file + name: --output_bai + description: BAI-format index for BAM file. + example: output.bam.bai + direction: output + required: false + - type: string + name: --output_format + description: The output format. By default, samtools tries to select a format based on the -o filename extension; if output is to standard output or no format can be deduced, bam is selected. + choices: [sam, bam, cram] + example: bam + required: false + info: + orig_arg: -O + - type: integer + name: --compression + description: Compression level, from 0 (uncompressed) to 9 (best + example: 5 + required: false + info: + orig_arg: -l + - name: Arguments + arguments: + - type: boolean_true + name: --minimizer_cluster + description: | + Sort unmapped reads (those in chromosome "*") by their sequence minimiser (Schleimer et al., 2003; Roberts et al., 2004), + also reverse complementing as appropriate. This has the effect of collating some similar data together, improving the + compressibility of the unmapped sequence. The minimiser kmer size is adjusted using the -K option. Note data compressed + in this manner may need to be name collated prior to conversion back to fastq. + + Mapped sequences are sorted by chromosome and position. + info: + orig_arg: -M + - type: integer + name: --minimizer_kmer + description: Sets the kmer size to be used in the -M option. + example: 20 + info: + orig_arg: -K + - type: boolean_true + name: --sort_by_read_names + description: Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates. + info: + orig_arg: -n + - type: string + name: --sort_by + description: Sort first by this value in the alignment tag, then by position or name (if also using -n). + info: + orig_arg: -t + - type: boolean_true + name: --no_pg + description: Do not add a @PG line to the header of the output file. + info: + orig_arg: --no-PG +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - samtools + - procps + - type: python + packages: + - pyyaml + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/mapping/samtools_sort/script.py b/src/mapping/samtools_sort/script.py new file mode 100644 index 00000000..89582605 --- /dev/null +++ b/src/mapping/samtools_sort/script.py @@ -0,0 +1,76 @@ +import tempfile +import subprocess +from pathlib import Path +import yaml + +## VIASH START +par = { + "input": ["resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"], + "output_bam": "test_output.bam", + "output_bai": "test_output.bam.bai", +} +meta = {"cpus": 2, "temp_dir": "/tmp", "config": "src/mapping/htseq/config.vsh.yaml"} +## VIASH END + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +print(">> Constructing command", flush=True) +cmd_args = ["samtools", "sort"] + generate_args(par, config) + +# manually process cpus parameter +if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--threads", str(meta["cpus"])]) +# add memory +if "memory_mb" in meta and meta["memory_mb"]: + import math + + mem_per_thread = math.ceil(meta["memory_mb"] * 0.8 / meta["cpus"]) + cmd_args.extend(["-m", f"{mem_per_thread}M"]) + +with tempfile.TemporaryDirectory(prefix="samtools-", dir=meta["temp_dir"]) as temp_dir: + # add tempdir + cmd_args.extend(["-T", str(temp_dir + "/")]) + + # run command + print(">> Running samtools sort with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + subprocess.run(cmd_args, check=True) + +if par.get("output_bai"): + print(">> Running samtools index with command:", flush=True) + cmd_index_args = ["samtools", "index", "-b", par["output_bam"], par["output_bai"]] + print("+ " + " ".join([str(x) for x in cmd_index_args]), flush=True) + subprocess.run(cmd_index_args, check=True) diff --git a/src/mapping/samtools_sort/test.py b/src/mapping/samtools_sort/test.py new file mode 100644 index 00000000..0483f54f --- /dev/null +++ b/src/mapping/samtools_sort/test.py @@ -0,0 +1,54 @@ +import subprocess +from pathlib import Path + +## VIASH START +meta = {"resources_dir": "resources_test"} +## VIASH END + +print("> sort and index", flush=True) +input = meta["resources_dir"] + "/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" +output = "test_output.bam" + +cmd_pars = [ + meta["executable"], + "--input", + input, + "--output_bam", + output, + "--output_bai", + output + ".bai", + "---memory", + "3gb", + "---cpus", + "2", +] +subprocess.run(cmd_pars, check=True) + +print("> Check if file exists", flush=True) +bam_path = Path(output) +assert bam_path.is_file() +bai_path = Path(output + ".bai") +assert bai_path.is_file() + +print("> sort by name (no index)", flush=True) +output2 = "test_output2.bam" + +cmd_pars = [ + meta["executable"], + "--input", + input, + "--output_bam", + output2, + "--sort_by_read_names", + "---memory", + "3gb", + "---cpus", + "2", +] +subprocess.run(cmd_pars, check=True) + +print("> Check if file exists", flush=True) +bam_path2 = Path(output2) +assert bam_path2.is_file() + +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/star_align/argument_groups.yaml b/src/mapping/star_align/argument_groups.yaml new file mode 100644 index 00000000..93491c7b --- /dev/null +++ b/src/mapping/star_align/argument_groups.yaml @@ -0,0 +1,1255 @@ +argument_groups: +- name: Run Parameters + arguments: + - name: --runRNGseed + type: integer + description: random number generator seed. + example: 777 +- name: Genome Parameters + arguments: + - name: --genomeLoad + type: string + description: |- + mode of shared memory usage for the genome files. Only used with --runMode alignReads. + + - LoadAndKeep ... load genome into shared and keep it in memory after run + - LoadAndRemove ... load genome into shared but remove it after run + - LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + - Remove ... do not map anything, just remove loaded genome from memory + - NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + example: NoSharedMemory + - name: --genomeFastaFiles + type: file + description: |- + path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + multiple: yes + - name: --genomeFileSizes + type: integer + description: genome files exact sizes in bytes. Typically, this should not be + defined by the user. + example: 0 + multiple: yes + - name: --genomeTransformOutput + type: string + description: |- + which output to transform back to original genome + + - SAM ... SAM/BAM alignments + - SJ ... splice junctions (SJ.out.tab) + - None ... no transformation of the output + multiple: yes + - name: --genomeChrSetMitochondrial + type: string + description: names of the mitochondrial chromosomes. Presently only used for STARsolo + statistics output/ + example: + - chrM + - M + - MT + multiple: yes +- name: Splice Junctions Database + arguments: + - name: --sjdbFileChrStartEnd + type: string + description: path to the files with genomic coordinates (chr start + end strand) for the splice junction introns. Multiple files can be supplied + and will be concatenated. + multiple: yes + - name: --sjdbGTFfile + type: file + description: path to the GTF file with annotations + - name: --sjdbGTFchrPrefix + type: string + description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL + annotations with UCSC genomes) + - name: --sjdbGTFfeatureExon + type: string + description: feature type in GTF file to be used as exons for building transcripts + example: exon + - name: --sjdbGTFtagExonParentTranscript + type: string + description: GTF attribute name for parent transcript ID (default "transcript_id" + works for GTF files) + example: transcript_id + - name: --sjdbGTFtagExonParentGene + type: string + description: GTF attribute name for parent gene ID (default "gene_id" works for + GTF files) + example: gene_id + - name: --sjdbGTFtagExonParentGeneName + type: string + description: GTF attribute name for parent gene name + example: gene_name + multiple: yes + - name: --sjdbGTFtagExonParentGeneType + type: string + description: GTF attribute name for parent gene type + example: + - gene_type + - gene_biotype + multiple: yes + - name: --sjdbOverhang + type: integer + description: length of the donor/acceptor sequence on each side of the junctions, + ideally = (mate_length - 1) + example: 100 + - name: --sjdbScore + type: integer + description: extra alignment score for alignments that cross database junctions + example: 2 + - name: --sjdbInsertSave + type: string + description: |- + which files to save when sjdb junctions are inserted on the fly at the mapping step + + - Basic ... only small junction / transcript files + - All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + example: Basic +- name: Variation parameters + arguments: + - name: --varVCFfile + type: string + description: path to the VCF file that contains variation data. The 10th column + should contain the genotype information, e.g. 0/1 +- name: Read Parameters + arguments: + - name: --readFilesType + type: string + description: |- + format of input read files + + - Fastx ... FASTA or FASTQ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + - SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + example: Fastx + - name: --readFilesSAMattrKeep + type: string + description: |- + for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + + - All ... keep all tags + - None ... do not keep any tags + example: All + multiple: yes + - name: --readFilesManifest + type: file + description: |- + path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + - name: --readFilesPrefix + type: string + description: prefix for the read files names, i.e. it will be added in front of + the strings in --readFilesIn + - name: --readFilesCommand + type: string + description: |- + command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + multiple: yes + - name: --readMapNumber + type: integer + description: |- + number of reads to map from the beginning of the file + + -1: map all reads + example: -1 + - name: --readMatesLengthsIn + type: string + description: Equal/NotEqual - lengths of names,sequences,qualities for both mates + are the same / not the same. NotEqual is safe in all situations. + example: NotEqual + - name: --readNameSeparator + type: string + description: character(s) separating the part of the read names that will be trimmed + in output (read name after space is always trimmed) + example: / + multiple: yes + - name: --readQualityScoreBase + type: integer + description: number to be subtracted from the ASCII code to get Phred quality + score + example: 33 +- name: Read Clipping + arguments: + - name: --clipAdapterType + type: string + description: |- + adapter clipping type + + - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal + - None ... no adapter clipping, all other clip* parameters are disregarded + example: Hamming + - name: --clip3pNbases + type: integer + description: number(s) of bases to clip from 3p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes + - name: --clip3pAdapterSeq + type: string + description: |- + adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + + - polyA ... polyA sequence with the length equal to read length + multiple: yes + - name: --clip3pAdapterMMp + type: double + description: max proportion of mismatches for 3p adapter clipping for each mate. If + one value is given, it will be assumed the same for both mates. + example: 0.1 + multiple: yes + - name: --clip3pAfterAdapterNbases + type: integer + description: number of bases to clip from 3p of each mate after the adapter clipping. + If one value is given, it will be assumed the same for both mates. + example: 0 + multiple: yes + - name: --clip5pNbases + type: integer + description: number(s) of bases to clip from 5p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes +- name: Limits + arguments: + - name: --limitGenomeGenerateRAM + type: long + description: maximum available RAM (bytes) for genome generation + example: '31000000000' + - name: --limitIObufferSize + type: long + description: max available buffers size (bytes) for input/output, per thread + example: + - 30000000 + - 50000000 + multiple: yes + - name: --limitOutSAMoneReadBytes + type: long + description: 'max size of the SAM record (bytes) for one read. Recommended value: + >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax' + example: 100000 + - name: --limitOutSJoneRead + type: integer + description: max number of junctions for one read (including all multi-mappers) + example: 1000 + - name: --limitOutSJcollapsed + type: integer + description: max number of collapsed junctions + example: 1000000 + - name: --limitBAMsortRAM + type: long + description: maximum available RAM (bytes) for sorting BAM. If =0, it will be + set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory + option. + example: 0 + - name: --limitSjdbInsertNsj + type: integer + description: maximum number of junctions to be inserted to the genome on the fly + at the mapping stage, including those from annotations and those detected in + the 1st step of the 2-pass run + example: 1000000 + - name: --limitNreadsSoft + type: integer + description: soft limit on the number of reads + example: -1 +- name: 'Output: general' + arguments: + - name: --outTmpKeep + type: string + description: |- + whether to keep the temporary files after STAR runs is finished + + - None ... remove all temporary files + - All ... keep all files + - name: --outStd + type: string + description: |- + which output will be directed to stdout (standard out) + + - Log ... log messages + - SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + example: Log + - name: --outReadsUnmapped + type: string + description: |- + output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + + - None ... no output + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + - name: --outQSconversionAdd + type: integer + description: add this number to the quality score (e.g. to convert from Illumina + to Sanger, use -31) + example: 0 + - name: --outMultimapperOrder + type: string + description: |- + order of multimapping alignments in the output files + + - Old_2.4 ... quasi-random order used before 2.5.0 + - Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + example: Old_2.4 +- name: 'Output: SAM and BAM' + arguments: + - name: --outSAMtype + type: string + description: |- + type of SAM/BAM output + + 1st word: + - BAM ... output BAM without sorting + - SAM ... output SAM without sorting + - None ... no SAM/BAM output + 2nd, 3rd: + - Unsorted ... standard unsorted + - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + example: SAM + multiple: yes + - name: --outSAMmode + type: string + description: |- + mode of SAM output + + - None ... no SAM output + - Full ... full SAM output + - NoQS ... full SAM but without quality scores + example: Full + - name: --outSAMstrandField + type: string + description: |- + Cufflinks-like strand field flag + + - None ... not used + - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + - name: --outSAMattributes + type: string + description: |- + a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + + ***Presets: + - None ... no attributes + - Standard ... NH HI AS nM + - All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + - NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + - HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + - AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + - nM ... number of mismatches. For PE reads, sum over two mates. + - NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + - MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + - jI ... start and end of introns for all junctions (1-based). + - XS ... alignment strand according to --outSAMstrandField. + - MC ... mate's CIGAR string. Standard SAM tag. + - ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + - cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + - vA ... variant allele + - vG ... genomic coordinate of the variant overlapped by the read. + - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ***STARsolo: + - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + - GX GN ... gene ID and gene name for unique-gene reads. + - gx gn ... gene IDs and gene names for unique- and multi-gene reads. + - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + - sM ... assessment of CB and UMI. + - sS ... sequence of the entire barcode (CB,UMI,adapter). + - sQ ... quality of the entire barcode. + ***Unsupported/undocumented: + - ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + - rB ... alignment block read/genomic coordinates. + - vR ... read coordinate of the variant. + example: Standard + multiple: yes + - name: --outSAMattrIHstart + type: integer + description: start value for the IH attribute. 0 may be required by some downstream + software, such as Cufflinks or StringTie. + example: 1 + - name: --outSAMunmapped + type: string + description: |- + output of unmapped reads in the SAM format + + 1st word: + - None ... no output + - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + multiple: yes + - name: --outSAMorder + type: string + description: |- + type of sorting for the SAM output + + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + example: Paired + - name: --outSAMprimaryFlag + type: string + description: |- + which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + + - OneBestScore ... only one alignment with the best score is primary + - AllBestScore ... all alignments with the best score are primary + example: OneBestScore + - name: --outSAMreadID + type: string + description: |- + read ID record type + + - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + - Number ... read number (index) in the FASTx file + example: Standard + - name: --outSAMmapqUnique + type: integer + description: '0 to 255: the MAPQ value for unique mappers' + example: 255 + - name: --outSAMflagOR + type: integer + description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e. + FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, + and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.' + example: 0 + - name: --outSAMflagAND + type: integer + description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e. + FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, + but before outSAMflagOR. Can be used to unset specific bits that are not set + otherwise.' + example: 65535 + - name: --outSAMattrRGline + type: string + description: |- + SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + multiple: yes + - name: --outSAMheaderHD + type: string + description: '@HD (header) line of the SAM header' + multiple: yes + - name: --outSAMheaderPG + type: string + description: extra @PG (software) line of the SAM header (in addition to STAR) + multiple: yes + - name: --outSAMheaderCommentFile + type: string + description: path to the file with @CO (comment) lines of the SAM header + - name: --outSAMfilter + type: string + description: |- + filter the output into main SAM/BAM files + + - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + multiple: yes + - name: --outSAMmultNmax + type: integer + description: |- + max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + + - -1 ... all alignments (up to --outFilterMultimapNmax) will be output + example: -1 + - name: --outSAMtlen + type: integer + description: |- + calculation method for the TLEN field in the SAM/BAM files + + - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + example: 1 + - name: --outBAMcompression + type: integer + description: -1 to 10 BAM compression level, -1=default compression (6?), 0=no + compression, 10=maximum compression + example: 1 + - name: --outBAMsortingThreadN + type: integer + description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).' + example: 0 + - name: --outBAMsortingBinsN + type: integer + description: '>0: number of genome bins for coordinate-sorting' + example: 50 +- name: BAM processing + arguments: + - name: --bamRemoveDuplicatesType + type: string + description: |- + mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + + - - ... no duplicate removal/marking + - UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + - name: --bamRemoveDuplicatesMate2basesN + type: integer + description: number of bases from the 5' of mate 2 to use in collapsing (e.g. + for RAMPAGE) + example: 0 +- name: Output Wiggle + arguments: + - name: --outWigType + type: string + description: |- + type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + + 1st word: + - None ... no signal output + - bedGraph ... bedGraph format + - wiggle ... wiggle format + 2nd word: + - read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + - read2 ... signal from only 2nd read + multiple: yes + - name: --outWigStrand + type: string + description: |- + strandedness of wiggle/bedGraph output + + - Stranded ... separate strands, str1 and str2 + - Unstranded ... collapsed strands + example: Stranded + - name: --outWigReferencesPrefix + type: string + description: prefix matching reference names to include in the output wiggle file, + e.g. "chr", default "-" - include all references + - name: --outWigNorm + type: string + description: |- + type of normalization for the signal + + - RPM ... reads per million of mapped reads + - None ... no normalization, "raw" counts + example: RPM +- name: Output Filtering + arguments: + - name: --outFilterType + type: string + description: |- + type of filtering + + - Normal ... standard filtering using only current alignment + - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + example: Normal + - name: --outFilterMultimapScoreRange + type: integer + description: the score range below the maximum score for multimapping alignments + example: 1 + - name: --outFilterMultimapNmax + type: integer + description: |- + maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + example: 10 + - name: --outFilterMismatchNmax + type: integer + description: alignment will be output only if it has no more mismatches than this + value. + example: 10 + - name: --outFilterMismatchNoverLmax + type: double + description: alignment will be output only if its ratio of mismatches to *mapped* + length is less than or equal to this value. + example: 0.3 + - name: --outFilterMismatchNoverReadLmax + type: double + description: alignment will be output only if its ratio of mismatches to *read* + length is less than or equal to this value. + example: 1.0 + - name: --outFilterScoreMin + type: integer + description: alignment will be output only if its score is higher than or equal + to this value. + example: 0 + - name: --outFilterScoreMinOverLread + type: double + description: same as outFilterScoreMin, but normalized to read length (sum of + mates' lengths for paired-end reads) + example: 0.66 + - name: --outFilterMatchNmin + type: integer + description: alignment will be output only if the number of matched bases is higher + than or equal to this value. + example: 0 + - name: --outFilterMatchNminOverLread + type: double + description: sam as outFilterMatchNmin, but normalized to the read length (sum + of mates' lengths for paired-end reads). + example: 0.66 + - name: --outFilterIntronMotifs + type: string + description: |- + filter alignment using their motifs + + - None ... no filtering + - RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + - RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + - name: --outFilterIntronStrands + type: string + description: |- + filter alignments + + - RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + - None ... no filtering + example: RemoveInconsistentStrands +- name: Output splice junctions (SJ.out.tab) + arguments: + - name: --outSJtype + type: string + description: |- + type of splice junction output + + - Standard ... standard SJ.out.tab output + - None ... no splice junction output + example: Standard +- name: 'Output Filtering: Splice Junctions' + arguments: + - name: --outSJfilterReads + type: string + description: |- + which reads to consider for collapsed splice junctions output + + - All ... all reads, unique- and multi-mappers + - Unique ... uniquely mapping reads only + example: All + - name: --outSJfilterOverhangMin + type: integer + description: |- + minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + does not apply to annotated junctions + example: + - 30 + - 12 + - 12 + - 12 + multiple: yes + - name: --outSJfilterCountUniqueMin + type: integer + description: |- + minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + - name: --outSJfilterCountTotalMin + type: integer + description: |- + minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + - name: --outSJfilterDistToOtherSJmin + type: integer + description: |- + minimum allowed distance to other junctions' donor/acceptor + + does not apply to annotated junctions + example: + - 10 + - 0 + - 5 + - 10 + multiple: yes + - name: --outSJfilterIntronMaxVsReadN + type: integer + description: |- + maximum gap allowed for junctions supported by 1,2,3,,,N reads + + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + example: + - 50000 + - 100000 + - 200000 + multiple: yes +- name: Scoring + arguments: + - name: --scoreGap + type: integer + description: splice junction penalty (independent on intron motif) + example: 0 + - name: --scoreGapNoncan + type: integer + description: non-canonical junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGapGCAG + type: integer + description: GC/AG and CT/GC junction penalty (in addition to scoreGap) + example: -4 + - name: --scoreGapATAC + type: integer + description: AT/AC and GT/AT junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGenomicLengthLog2scale + type: integer + description: 'extra score logarithmically scaled with genomic length of the alignment: + scoreGenomicLengthLog2scale*log2(genomicLength)' + example: 0 + - name: --scoreDelOpen + type: integer + description: deletion open penalty + example: -2 + - name: --scoreDelBase + type: integer + description: deletion extension penalty per base (in addition to scoreDelOpen) + example: -2 + - name: --scoreInsOpen + type: integer + description: insertion open penalty + example: -2 + - name: --scoreInsBase + type: integer + description: insertion extension penalty per base (in addition to scoreInsOpen) + example: -2 + - name: --scoreStitchSJshift + type: integer + description: maximum score reduction while searching for SJ boundaries in the + stitching step + example: 1 +- name: Alignments and Seeding + arguments: + - name: --seedSearchStartLmax + type: integer + description: defines the search start point through the read - the read is split + into pieces no longer than this value + example: 50 + - name: --seedSearchStartLmaxOverLread + type: double + description: seedSearchStartLmax normalized to read length (sum of mates' lengths + for paired-end reads) + example: 1.0 + - name: --seedSearchLmax + type: integer + description: defines the maximum length of the seeds, if =0 seed length is not + limited + example: 0 + - name: --seedMultimapNmax + type: integer + description: only pieces that map fewer than this value are utilized in the stitching + procedure + example: 10000 + - name: --seedPerReadNmax + type: integer + description: max number of seeds per read + example: 1000 + - name: --seedPerWindowNmax + type: integer + description: max number of seeds per window + example: 50 + - name: --seedNoneLociPerWindow + type: integer + description: max number of one seed loci per window + example: 10 + - name: --seedSplitMin + type: integer + description: min length of the seed sequences split by Ns or mate gap + example: 12 + - name: --seedMapMin + type: integer + description: min length of seeds to be mapped + example: 5 + - name: --alignIntronMin + type: integer + description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, + otherwise it is considered Deletion + example: 21 + - name: --alignIntronMax + type: integer + description: maximum intron size, if 0, max intron size will be determined by + (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignMatesGapMax + type: integer + description: maximum gap between two mates, if 0, max intron gap will be determined + by (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignSJoverhangMin + type: integer + description: minimum overhang (i.e. block size) for spliced alignments + example: 5 + - name: --alignSJstitchMismatchNmax + type: integer + description: |- + maximum number of mismatches for stitching of the splice junctions (-1: no limit). + + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + example: + - 0 + - -1 + - 0 + - 0 + multiple: yes + - name: --alignSJDBoverhangMin + type: integer + description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + example: 3 + - name: --alignSplicedMateMapLmin + type: integer + description: minimum mapped length for a read mate that is spliced + example: 0 + - name: --alignSplicedMateMapLminOverLmate + type: double + description: alignSplicedMateMapLmin normalized to mate length + example: 0.66 + - name: --alignWindowsPerReadNmax + type: integer + description: max number of windows per read + example: 10000 + - name: --alignTranscriptsPerWindowNmax + type: integer + description: max number of transcripts per window + example: 100 + - name: --alignTranscriptsPerReadNmax + type: integer + description: max number of different alignments per read to consider + example: 10000 + - name: --alignEndsType + type: string + description: |- + type of read ends alignment + + - Local ... standard local alignment with soft-clipping allowed + - EndToEnd ... force end-to-end read alignment, do not soft-clip + - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + example: Local + - name: --alignEndsProtrude + type: string + description: |- + allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + - ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + - DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + example: 0 ConcordantPair + - name: --alignSoftClipAtReferenceEnds + type: string + description: |- + allow the soft-clipping of the alignments past the end of the chromosomes + + - Yes ... allow + - No ... prohibit, useful for compatibility with Cufflinks + example: 'Yes' + - name: --alignInsertionFlush + type: string + description: |- + how to flush ambiguous insertion positions + + - None ... insertions are not flushed + - Right ... insertions are flushed to the right +- name: Paired-End reads + arguments: + - name: --peOverlapNbasesMin + type: integer + description: minimum number of overlapping bases to trigger mates merging and + realignment. Specify >0 value to switch on the "merginf of overlapping mates" + algorithm. + example: 0 + - name: --peOverlapMMp + type: double + description: maximum proportion of mismatched bases in the overlap area + example: 0.01 +- name: Windows, Anchors, Binning + arguments: + - name: --winAnchorMultimapNmax + type: integer + description: max number of loci anchors are allowed to map to + example: 50 + - name: --winBinNbits + type: integer + description: =log2(winBin), where winBin is the size of the bin for the windows/clustering, + each window will occupy an integer number of bins. + example: 16 + - name: --winAnchorDistNbins + type: integer + description: max number of bins between two anchors that allows aggregation of + anchors into one window + example: 9 + - name: --winFlankNbins + type: integer + description: log2(winFlank), where win Flank is the size of the left and right + flanking regions for each window + example: 4 + - name: --winReadCoverageRelativeMin + type: double + description: minimum relative coverage of the read sequence by the seeds in a + window, for STARlong algorithm only. + example: 0.5 + - name: --winReadCoverageBasesMin + type: integer + description: minimum number of bases covered by the seeds in a window , for STARlong + algorithm only. + example: 0 +- name: Chimeric Alignments + arguments: + - name: --chimOutType + type: string + description: |- + type of chimeric output + + - Junctions ... Chimeric.out.junction + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + - WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + example: Junctions + multiple: yes + - name: --chimSegmentMin + type: integer + description: minimum length of chimeric segment length, if ==0, no chimeric output + example: 0 + - name: --chimScoreMin + type: integer + description: minimum total (summed) score of the chimeric segments + example: 0 + - name: --chimScoreDropMax + type: integer + description: max drop (difference) of chimeric score (the sum of scores of all + chimeric segments) from the read length + example: 20 + - name: --chimScoreSeparation + type: integer + description: minimum difference (separation) between the best chimeric score and + the next one + example: 10 + - name: --chimScoreJunctionNonGTAG + type: integer + description: penalty for a non-GT/AG chimeric junction + example: -1 + - name: --chimJunctionOverhangMin + type: integer + description: minimum overhang for a chimeric junction + example: 20 + - name: --chimSegmentReadGapMax + type: integer + description: maximum gap in the read sequence between chimeric segments + example: 0 + - name: --chimFilter + type: string + description: |- + different filters for chimeric alignments + + - None ... no filtering + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + example: banGenomicN + multiple: yes + - name: --chimMainSegmentMultNmax + type: integer + description: maximum number of multi-alignments for the main chimeric segment. + =1 will prohibit multimapping main segments. + example: 10 + - name: --chimMultimapNmax + type: integer + description: |- + maximum number of chimeric multi-alignments + + - 0 ... use the old scheme for chimeric detection which only considered unique alignments + example: 0 + - name: --chimMultimapScoreRange + type: integer + description: the score range for multi-mapping chimeras below the best chimeric + score. Only works with --chimMultimapNmax > 1 + example: 1 + - name: --chimNonchimScoreDropMin + type: integer + description: to trigger chimeric detection, the drop in the best non-chimeric + alignment score with respect to the read length has to be greater than this + value + example: 20 + - name: --chimOutJunctionFormat + type: integer + description: |- + formatting type for the Chimeric.out.junction file + + - 0 ... no comment lines/headers + - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + example: 0 +- name: Quantification of Annotations + arguments: + - name: --quantMode + type: string + description: |- + types of quantification requested + + - - ... none + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + - GeneCounts ... count reads per gene + multiple: yes + - name: --quantTranscriptomeBAMcompression + type: integer + description: |- + -2 to 10 transcriptome BAM compression level + + - -2 ... no BAM output + - -1 ... default compression (6?) + - 0 ... no compression + - 10 ... maximum compression + example: 1 + - name: --quantTranscriptomeBan + type: string + description: |- + prohibit various alignment type + + - IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM + - Singleend ... prohibit single-end alignments + example: IndelSoftclipSingleend +- name: 2-pass Mapping + arguments: + - name: --twopassMode + type: string + description: |- + 2-pass mapping mode. + + - None ... 1-pass mapping + - Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + - name: --twopass1readsN + type: integer + description: number of reads to process for the 1st step. Use very large number + (or default -1) to map all reads in the first step. + example: -1 +- name: WASP parameters + arguments: + - name: --waspOutputMode + type: string + description: |- + WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . + + - SAMtag ... add WASP tags to the alignments that pass WASP filtering +- name: STARsolo (single cell RNA-seq) parameters + arguments: + - name: --soloType + type: string + description: |- + type of single-cell RNA-seq + + - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + multiple: yes + - name: --soloCBwhitelist + type: string + description: |- + file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + + - None ... no whitelist: all cell barcodes are allowed + multiple: yes + - name: --soloCBstart + type: integer + description: cell barcode start base + example: 1 + - name: --soloCBlen + type: integer + description: cell barcode length + example: 16 + - name: --soloUMIstart + type: integer + description: UMI start base + example: 17 + - name: --soloUMIlen + type: integer + description: UMI length + example: 10 + - name: --soloBarcodeReadLength + type: integer + description: |- + length of the barcode read + + - 1 ... equal to sum of soloCBlen+soloUMIlen + - 0 ... not defined, do not check + example: 1 + - name: --soloBarcodeMate + type: integer + description: |- + identifies which read mate contains the barcode (CB+UMI) sequence + + - 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + - 1 ... barcode sequence is a part of mate 1 + - 2 ... barcode sequence is a part of mate 2 + example: 0 + - name: --soloCBposition + type: string + description: |- + position of Cell Barcode(s) on the barcode read. + + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + multiple: yes + - name: --soloUMIposition + type: string + description: |- + position of the UMI on the barcode read, same as soloCBposition + + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 3_9_3_14 + - name: --soloAdapterSequence + type: string + description: adapter sequence to anchor barcodes. Only one adapter sequence is + allowed. + - name: --soloAdapterMismatchesNmax + type: integer + description: maximum number of mismatches allowed in adapter sequence. + example: 1 + - name: --soloCBmatchWLtype + type: string + description: |- + matching the Cell Barcodes to the WhiteList + + - Exact ... only exact matches allowed + - 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + - 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + - 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + - EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + example: 1MM_multi + - name: --soloInputSAMattrBarcodeSeq + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. + multiple: yes + - name: --soloInputSAMattrBarcodeQual + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + multiple: yes + - name: --soloStrand + type: string + description: |- + strandedness of the solo libraries: + + - Unstranded ... no strand information + - Forward ... read strand same as the original RNA molecule + - Reverse ... read strand opposite to the original RNA molecule + example: Forward + - name: --soloFeatures + type: string + description: |- + genomic features for which the UMI counts per Cell Barcode are collected + + - Gene ... genes: reads match the gene transcript + - SJ ... splice junctions: reported in SJ.out.tab + - GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + example: Gene + multiple: yes + - name: --soloMultiMappers + type: string + description: |- + counting method for reads mapping to multiple genes + + - Unique ... count only reads that map to unique genes + - Uniform ... uniformly distribute multi-genic UMIs to all genes + - Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + - PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + - EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + example: Unique + multiple: yes + - name: --soloUMIdedup + type: string + description: |- + type of UMI deduplication (collapsing) algorithm + + - 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + - 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + - 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + - Exact ... only exactly matching UMIs are collapsed. + - NoDedup ... no deduplication of UMIs, count all reads. + - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + example: 1MM_All + multiple: yes + - name: --soloUMIfiltering + type: string + description: |- + type of UMI filtering (for reads uniquely mapping to genes) + + - - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + - MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + - MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --soloUMIdedup 1MM_CR + multiple: yes + - name: --soloOutFileNames + type: string + description: |- + file names for STARsolo output: + + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + example: + - Solo.out/ + - features.tsv + - barcodes.tsv + - matrix.mtx + multiple: yes + - name: --soloCellFilter + type: string + description: |- + cell filtering type and parameters + + - None ... do not output filtered cells + - TopCells ... only report top cells by UMI count, followed by the exact number of cells + - CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + example: + - CellRanger2.2 + - '3000' + - '0.99' + - '10' + multiple: yes + - name: --soloOutFormatFeaturesGeneField3 + type: string + description: field 3 in the Gene features.tsv file. If "-", then no 3rd field + is output. + example: Gene Expression + multiple: yes + - name: --soloCellReadStats + type: string + description: |- + Output reads statistics for each CB + + - Standard ... standard output diff --git a/src/mapping/star_align/config.vsh.yaml b/src/mapping/star_align/config.vsh.yaml new file mode 100644 index 00000000..1b020a26 --- /dev/null +++ b/src/mapping/star_align/config.vsh.yaml @@ -0,0 +1,76 @@ +name: star_align +namespace: mapping +scope: "public" +description: Align fastq files using STAR. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +# generated the argument groups using `utils/process_params.R` +__merge__: [., argument_groups.yaml] +# manually taking care of the main input files +argument_groups: + - name: Input/Output + arguments: + - type: file + name: --input + alternatives: --readFilesIn + required: true + description: The FASTQ files to be analyzed. Corresponds to the --readFilesIn argument in the STAR command. + example: [ mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz ] + multiple: true + - type: file + name: --reference + alternatives: --genomeDir + description: Path to the reference built by star_build_reference. Corresponds to the --genomeDir argument in the STAR command. + example: /path/to/reference + required: true + - type: file + name: --output + alternatives: --outFileNamePrefix + description: Path to output directory. Corresponds to the --outFileNamePrefix argument in the STAR command. + example: /path/to/foo + direction: output + required: true +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.10b + - PACKAGES gcc g++ make wget zlib1g-dev unzip + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/star_align/script.py b/src/mapping/star_align/script.py new file mode 100644 index 00000000..25328ecf --- /dev/null +++ b/src/mapping/star_align/script.py @@ -0,0 +1,201 @@ +import re +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +par = { + "input": [ + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz", + "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz", + # 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz', + # 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz' + ], + # 'input': [ 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/' ], + "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref_v2_7_10_a/", + "output": "test_output", +} +meta = {"cpus": 8, "temp_dir": "/tmp"} +## VIASH END + +######################## +### Helper functions ### +######################## + +# regex for matching R[12] fastq(gz) files +# examples: +# - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz +# - tinygex_S1_L001_I1_001.fastq.gz +fastqgz_regex = r"(.+)_(R\d+)(_\d+)?\.fastq(\.gz)?" + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + + +# look for fastq files in a directory +def search_fastqs(path: Path) -> list[Path]: + if path.is_dir(): + print( + f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", + flush=True, + ) + value_paths = [ + file for file in path.iterdir() if re.match(fastqgz_regex, file.name) + ] + return value_paths + else: + return [path] + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the `processPar()` generator needs to be adapted +to_rename = { + "input": "readFilesIn", + "reference": "genomeDir", + "output": "outFileNamePrefix", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the `to_rename` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory( + prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True +) as temp_dir: + print(">> Check whether input files are directories", flush=True) + new_read_files_in = [] + for path in par["readFilesIn"]: + new_read_files_in.extend(search_fastqs(path)) + par["readFilesIn"] = new_read_files_in + print("", flush=True) + + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeDir", "readFilesIn"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print("Grouping R1/R2 input files into pairs", flush=True) + input_grouped = {} + for path in par["readFilesIn"]: + key = re.search(fastqgz_regex, path.name).group(2) + if key not in input_grouped: + input_grouped[key] = [] + input_grouped[key].append(str(path)) + par["readFilesIn"] = [",".join(val) for val in input_grouped.values()] + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "alignReads" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + # make sure there is a trailing / + par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/" + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) diff --git a/src/mapping/star_align/test.py b/src/mapping/star_align/test.py new file mode 100644 index 00000000..6fd9c86f --- /dev/null +++ b/src/mapping/star_align/test.py @@ -0,0 +1,76 @@ +import subprocess +from os import path +import sys +from pathlib import Path +from tempfile import TemporaryDirectory +import shutil + +## VIASH START +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +## Test 1: use input dir +logger.info("> Running command with folder") +input = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_fastq/" +input_files = [ + input + "tinygex_S1_L002_R1_001.fastq.gz", + input + "tinygex_S1_L002_R2_001.fastq.gz", +] +reference = ( + meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref_v2_7_10_a/" +) +output = "test_output" + +with TemporaryDirectory() as tempdir: + for file in input_files: + shutil.copyfile(file, Path(tempdir) / Path(file).name) + cmd_pars = [ + meta["executable"], + "--input", + tempdir, + "--reference", + reference, + "--output", + output, + "---cpus", + "2", + ] + subprocess.run(cmd_pars, check=True) + +logger.info("> Check if file exists") + +output_path = Path(output) +assert (output_path / "Log.final.out").is_file(), "No output log was created." +assert (output_path / "SJ.out.tab").is_file(), "No output file was created." + + +## Test 2: use input files +logger.info("> Running command with fastq files") +output = "test_output2" + +cmd_pars = [ + meta["executable"], + "--input", + input_files[0], + "--input", + input_files[1], + "--reference", + reference, + "--output", + output, + "---cpus", + "8", +] +out = subprocess.check_output(cmd_pars).decode("utf-8") + +logger.info("> Check if file exists") +assert path.exists(output + "/Log.final.out"), "No output log was created." +assert path.exists(output + "/SJ.out.tab"), "No output was created." + + +logger.info("> Completed Successfully!") diff --git a/src/mapping/star_align/utils/parametersDefault b/src/mapping/star_align/utils/parametersDefault new file mode 100644 index 00000000..c5c5ce39 --- /dev/null +++ b/src/mapping/star_align/utils/parametersDefault @@ -0,0 +1,910 @@ +### versions +versionGenome 2.7.4a + string: earliest genome index version compatible with this STAR release. Please do not change this value! + +### Parameter Files +parametersFiles - + string: name of a user-defined parameters file, "-": none. Can only be defined on the command line. + +### System +sysShell - + string: path to the shell binary, preferably bash, e.g. /bin/bash. + - ... the default shell is executed, typically /bin/sh. This was reported to fail on some Ubuntu systems - then you need to specify path to bash. + +### Run Parameters +runMode alignReads + string: type of the run. + alignReads ... map reads + genomeGenerate ... generate genome files + inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options. + liftOver ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles. + soloCellFiltering ... STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix + +runThreadN 1 + int: number of threads to run STAR + +runDirPerm User_RWX + string: permissions for the directories created at the run-time. + User_RWX ... user-read/write/execute + All_RWX ... all-read/write/execute (same as chmod 777) + +runRNGseed 777 + int: random number generator seed. + + +### Genome Parameters +genomeDir ./GenomeDir/ + string: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome) + +genomeLoad NoSharedMemory + string: mode of shared memory usage for the genome files. Only used with --runMode alignReads. + LoadAndKeep ... load genome into shared and keep it in memory after run + LoadAndRemove ... load genome into shared but remove it after run + LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + Remove ... do not map anything, just remove loaded genome from memory + NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +genomeFastaFiles - + string(s): path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + +genomeChainFiles - + string: chain files for genomic liftover. Only used with --runMode liftOver . + +genomeFileSizes 0 + uint(s)>0: genome files exact sizes in bytes. Typically, this should not be defined by the user. + +genomeTransformOutput None + string(s): which output to transform back to original genome + SAM ... SAM/BAM alignments + SJ ... splice junctions (SJ.out.tab) + None ... no transformation of the output + +genomeChrSetMitochondrial chrM M MT + string(s): names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/ + +### Genome Indexing Parameters - only used with --runMode genomeGenerate +genomeChrBinNbits 18 + int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + +genomeSAindexNbases 14 + int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). + +genomeSAsparseD 1 + int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + +genomeSuffixLengthMax -1 + int: maximum length of the suffixes, has to be longer than read length. -1 = infinite. + +genomeTransformType None + string: type of genome transformation + None ... no transformation + Haploid ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) + Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome) + +genomeTransformVCF - + string: path to VCF file for genome transformation + + + +#####UnderDevelopment_begin : not supported - do not use +genomeType Full + string: type of genome to generate + Full ... full (normal) genome + Transcriptome ... genome consists of transcript sequences + SuperTransriptome ... genome consists of superTranscript sequences +#####UnderDevelopment_end + +# DEPRECATED: please use --genomeTransformVCF and --genomeTransformType options instead. +#genomeConsensusFile - +# string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele) +# DEPRECATED + + + +### Splice Junctions Database +sjdbFileChrStartEnd - + string(s): path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. + +sjdbGTFfile - + string: path to the GTF file with annotations + +sjdbGTFchrPrefix - + string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes) + +sjdbGTFfeatureExon exon + string: feature type in GTF file to be used as exons for building transcripts + +sjdbGTFtagExonParentTranscript transcript_id + string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + +sjdbGTFtagExonParentGene gene_id + string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + +sjdbGTFtagExonParentGeneName gene_name + string(s): GTF attribute name for parent gene name + +sjdbGTFtagExonParentGeneType gene_type gene_biotype + string(s): GTF attribute name for parent gene type + +sjdbOverhang 100 + int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + +sjdbScore 2 + int: extra alignment score for alignments that cross database junctions + +sjdbInsertSave Basic + string: which files to save when sjdb junctions are inserted on the fly at the mapping step + Basic ... only small junction / transcript files + All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + +### Variation parameters +varVCFfile - + string: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1 + +### Input Files +inputBAMfile - + string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + +### Read Parameters +readFilesType Fastx + string: format of input read files + Fastx ... FASTA or FASTQ + SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + +readFilesSAMattrKeep All + string(s): for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + All ... keep all tags + None ... do not keep any tags + +readFilesIn Read1 Read2 + string(s): paths to files that contain input read1 (and, if needed, read2) + +readFilesManifest - + string: path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + +readFilesPrefix - + string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn + +readFilesCommand - + string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +readMapNumber -1 + int: number of reads to map from the beginning of the file + -1: map all reads + +readMatesLengthsIn NotEqual + string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +readNameSeparator / + string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + +readQualityScoreBase 33 + int>=0: number to be subtracted from the ASCII code to get Phred quality score + +### Read Clipping + +clipAdapterType Hamming + string: adapter clipping type + Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal + None ... no adapter clipping, all other clip* parameters are disregarded + +clip3pNbases 0 + int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq - + string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + polyA ... polyA sequence with the length equal to read length + +clip3pAdapterMMp 0.1 + double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAfterAdapterNbases 0 + int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + +clip5pNbases 0 + int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +#####UnderDevelopment_begin : not supported - do not use +clip5pAdapterSeq - + string(s): adapter sequences to clip from 5p of each mate, separated by space. + +clip5pAdapterMMp 0.1 + double(s): max proportion of mismatches for 5p adapter clipping for each mate, separated by space + +clip5pAfterAdapterNbases 0 + int(s): number of bases to clip from 5p of each mate after the adapter clipping, separated by space. +#####UnderDevelopment_end + +### Limits +limitGenomeGenerateRAM 31000000000 + int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 30000000 50000000 + int(s)>0: max available buffers size (bytes) for input/output, per thread + +limitOutSAMoneReadBytes 100000 + int>0: max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax + +limitOutSJoneRead 1000 + int>0: max number of junctions for one read (including all multi-mappers) + +limitOutSJcollapsed 1000000 + int>0: max number of collapsed junctions + +limitBAMsortRAM 0 + int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option. + +limitSjdbInsertNsj 1000000 + int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + +limitNreadsSoft -1 + int: soft limit on the number of reads + +### Output: general +outFileNamePrefix ./ + string: output files name prefix (including full or relative path). Can only be defined on the command line. + +outTmpDir - + string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! + - ... the temp directory will default to outFileNamePrefix_STARtmp + +outTmpKeep None + string: whether to keep the temporary files after STAR runs is finished + None ... remove all temporary files + All ... keep all files + +outStd Log + string: which output will be directed to stdout (standard out) + Log ... log messages + SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped None + string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + None ... no output + Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + +outQSconversionAdd 0 + int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + +outMultimapperOrder Old_2.4 + string: order of multimapping alignments in the output files + Old_2.4 ... quasi-random order used before 2.5.0 + Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + +### Output: SAM and BAM +outSAMtype SAM + strings: type of SAM/BAM output + 1st word: + BAM ... output BAM without sorting + SAM ... output SAM without sorting + None ... no SAM/BAM output + 2nd, 3rd: + Unsorted ... standard unsorted + SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + +outSAMmode Full + string: mode of SAM output + None ... no SAM output + Full ... full SAM output + NoQS ... full SAM but without quality scores + +outSAMstrandField None + string: Cufflinks-like strand field flag + None ... not used + intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + +outSAMattributes Standard + string(s): a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + ***Presets: + None ... no attributes + Standard ... NH HI AS nM + All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + nM ... number of mismatches. For PE reads, sum over two mates. + NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + jI ... start and end of introns for all junctions (1-based). + XS ... alignment strand according to --outSAMstrandField. + MC ... mate's CIGAR string. Standard SAM tag. + ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + vA ... variant allele + vG ... genomic coordinate of the variant overlapped by the read. + vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ***STARsolo: + CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + GX GN ... gene ID and gene name for unique-gene reads. + gx gn ... gene IDs and gene names for unique- and multi-gene reads. + CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + sM ... assessment of CB and UMI. + sS ... sequence of the entire barcode (CB,UMI,adapter). + sQ ... quality of the entire barcode. + ***Unsupported/undocumented: + ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + rB ... alignment block read/genomic coordinates. + vR ... read coordinate of the variant. + +outSAMattrIHstart 1 + int>=0: start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie. + +outSAMunmapped None + string(s): output of unmapped reads in the SAM format + 1st word: + None ... no output + Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + +outSAMorder Paired + string: type of sorting for the SAM output + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +outSAMprimaryFlag OneBestScore + string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + OneBestScore ... only one alignment with the best score is primary + AllBestScore ... all alignments with the best score are primary + +outSAMreadID Standard + string: read ID record type + Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + Number ... read number (index) in the FASTx file + +outSAMmapqUnique 255 + int: 0 to 255: the MAPQ value for unique mappers + +outSAMflagOR 0 + int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise. + +outSAMflagAND 65535 + int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise. + +outSAMattrRGline - + string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + +outSAMheaderHD - + strings: @HD (header) line of the SAM header + +outSAMheaderPG - + strings: extra @PG (software) line of the SAM header (in addition to STAR) + +outSAMheaderCommentFile - + string: path to the file with @CO (comment) lines of the SAM header + +outSAMfilter None + string(s): filter the output into main SAM/BAM files + KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + + +outSAMmultNmax -1 + int: max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + -1 ... all alignments (up to --outFilterMultimapNmax) will be output + +outSAMtlen 1 + int: calculation method for the TLEN field in the SAM/BAM files + 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + +outBAMcompression 1 + int: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + +outBAMsortingThreadN 0 + int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN). + +outBAMsortingBinsN 50 + int: >0: number of genome bins for coordinate-sorting + +### BAM processing +bamRemoveDuplicatesType - + string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + - ... no duplicate removal/marking + UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + +bamRemoveDuplicatesMate2basesN 0 + int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + +### Output Wiggle +outWigType None + string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + 1st word: + None ... no signal output + bedGraph ... bedGraph format + wiggle ... wiggle format + 2nd word: + read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + read2 ... signal from only 2nd read + +outWigStrand Stranded + string: strandedness of wiggle/bedGraph output + Stranded ... separate strands, str1 and str2 + Unstranded ... collapsed strands + +outWigReferencesPrefix - + string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references + +outWigNorm RPM + string: type of normalization for the signal + RPM ... reads per million of mapped reads + None ... no normalization, "raw" counts + +### Output Filtering +outFilterType Normal + string: type of filtering + Normal ... standard filtering using only current alignment + BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + +outFilterMultimapScoreRange 1 + int: the score range below the maximum score for multimapping alignments + +outFilterMultimapNmax 10 + int: maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + +outFilterMismatchNmax 10 + int: alignment will be output only if it has no more mismatches than this value. + +outFilterMismatchNoverLmax 0.3 + real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + +outFilterMismatchNoverReadLmax 1.0 + real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + + +outFilterScoreMin 0 + int: alignment will be output only if its score is higher than or equal to this value. + +outFilterScoreMinOverLread 0.66 + real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + +outFilterMatchNmin 0 + int: alignment will be output only if the number of matched bases is higher than or equal to this value. + +outFilterMatchNminOverLread 0.66 + real: sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + +outFilterIntronMotifs None + string: filter alignment using their motifs + None ... no filtering + RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + +outFilterIntronStrands RemoveInconsistentStrands + string: filter alignments + RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + None ... no filtering + +### Output splice junctions (SJ.out.tab) +outSJtype Standard + string: type of splice junction output + Standard ... standard SJ.out.tab output + None ... no splice junction output + +### Output Filtering: Splice Junctions +outSJfilterReads All + string: which reads to consider for collapsed splice junctions output + All ... all reads, unique- and multi-mappers + Unique ... uniquely mapping reads only + +outSJfilterOverhangMin 30 12 12 12 + 4 integers: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + does not apply to annotated junctions + +outSJfilterCountUniqueMin 3 1 1 1 + 4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterCountTotalMin 3 1 1 1 + 4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 10 0 5 10 + 4 integers>=0: minimum allowed distance to other junctions' donor/acceptor + does not apply to annotated junctions + +outSJfilterIntronMaxVsReadN 50000 100000 200000 + N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + +### Scoring +scoreGap 0 + int: splice junction penalty (independent on intron motif) + +scoreGapNoncan -8 + int: non-canonical junction penalty (in addition to scoreGap) + +scoreGapGCAG -4 + int: GC/AG and CT/GC junction penalty (in addition to scoreGap) + +scoreGapATAC -8 + int: AT/AC and GT/AT junction penalty (in addition to scoreGap) + +scoreGenomicLengthLog2scale -0.25 + int: extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength) + +scoreDelOpen -2 + int: deletion open penalty + +scoreDelBase -2 + int: deletion extension penalty per base (in addition to scoreDelOpen) + +scoreInsOpen -2 + int: insertion open penalty + +scoreInsBase -2 + int: insertion extension penalty per base (in addition to scoreInsOpen) + +scoreStitchSJshift 1 + int: maximum score reduction while searching for SJ boundaries in the stitching step + + +### Alignments and Seeding + +seedSearchStartLmax 50 + int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +seedSearchStartLmaxOverLread 1.0 + real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +seedSearchLmax 0 + int>=0: defines the maximum length of the seeds, if =0 seed length is not limited + +seedMultimapNmax 10000 + int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +seedPerReadNmax 1000 + int>0: max number of seeds per read + +seedPerWindowNmax 50 + int>0: max number of seeds per window + +seedNoneLociPerWindow 10 + int>0: max number of one seed loci per window + +seedSplitMin 12 + int>0: min length of the seed sequences split by Ns or mate gap + +seedMapMin 5 + int>0: min length of seeds to be mapped + +alignIntronMin 21 + int: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion + +alignIntronMax 0 + int: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignMatesGapMax 0 + int: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignSJoverhangMin 5 + int>0: minimum overhang (i.e. block size) for spliced alignments + +alignSJstitchMismatchNmax 0 -1 0 0 + 4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit). + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + +alignSJDBoverhangMin 3 + int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +alignSplicedMateMapLmin 0 + int>0: minimum mapped length for a read mate that is spliced + +alignSplicedMateMapLminOverLmate 0.66 + real>0: alignSplicedMateMapLmin normalized to mate length + +alignWindowsPerReadNmax 10000 + int>0: max number of windows per read + +alignTranscriptsPerWindowNmax 100 + int>0: max number of transcripts per window + +alignTranscriptsPerReadNmax 10000 + int>0: max number of different alignments per read to consider + +alignEndsType Local + string: type of read ends alignment + Local ... standard local alignment with soft-clipping allowed + EndToEnd ... force end-to-end read alignment, do not soft-clip + Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + +alignEndsProtrude 0 ConcordantPair + int, string: allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + +alignSoftClipAtReferenceEnds Yes + string: allow the soft-clipping of the alignments past the end of the chromosomes + Yes ... allow + No ... prohibit, useful for compatibility with Cufflinks + +alignInsertionFlush None + string: how to flush ambiguous insertion positions + None ... insertions are not flushed + Right ... insertions are flushed to the right + +### Paired-End reads +peOverlapNbasesMin 0 + int>=0: minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merginf of overlapping mates" algorithm. + +peOverlapMMp 0.01 + real, >=0 & <1: maximum proportion of mismatched bases in the overlap area + +### Windows, Anchors, Binning + +winAnchorMultimapNmax 50 + int>0: max number of loci anchors are allowed to map to + +winBinNbits 16 + int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + +winAnchorDistNbins 9 + int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +winFlankNbins 4 + int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + +winReadCoverageRelativeMin 0.5 + real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + +winReadCoverageBasesMin 0 + int>0: minimum number of bases covered by the seeds in a window , for STARlong algorithm only. + +### Chimeric Alignments +chimOutType Junctions + string(s): type of chimeric output + Junctions ... Chimeric.out.junction + SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + +chimSegmentMin 0 + int>=0: minimum length of chimeric segment length, if ==0, no chimeric output + +chimScoreMin 0 + int>=0: minimum total (summed) score of the chimeric segments + +chimScoreDropMax 20 + int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + +chimScoreSeparation 10 + int>=0: minimum difference (separation) between the best chimeric score and the next one + +chimScoreJunctionNonGTAG -1 + int: penalty for a non-GT/AG chimeric junction + +chimJunctionOverhangMin 20 + int>=0: minimum overhang for a chimeric junction + +chimSegmentReadGapMax 0 + int>=0: maximum gap in the read sequence between chimeric segments + +chimFilter banGenomicN + string(s): different filters for chimeric alignments + None ... no filtering + banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + +chimMainSegmentMultNmax 10 + int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + +chimMultimapNmax 0 + int>=0: maximum number of chimeric multi-alignments + 0 ... use the old scheme for chimeric detection which only considered unique alignments + +chimMultimapScoreRange 1 + int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1 + +chimNonchimScoreDropMin 20 + int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value + +chimOutJunctionFormat 0 + int: formatting type for the Chimeric.out.junction file + 0 ... no comment lines/headers + 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + +### Quantification of Annotations +quantMode - + string(s): types of quantification requested + - ... none + TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + GeneCounts ... count reads per gene + +quantTranscriptomeBAMcompression 1 + int: -2 to 10 transcriptome BAM compression level + -2 ... no BAM output + -1 ... default compression (6?) + 0 ... no compression + 10 ... maximum compression + +quantTranscriptomeBan IndelSoftclipSingleend + string: prohibit various alignment type + IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM + Singleend ... prohibit single-end alignments + +### 2-pass Mapping +twopassMode None + string: 2-pass mapping mode. + None ... 1-pass mapping + Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + +twopass1readsN -1 + int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + + +### WASP parameters +waspOutputMode None + string: WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . + SAMtag ... add WASP tags to the alignments that pass WASP filtering + +### STARsolo (single cell RNA-seq) parameters +soloType None + string(s): type of single-cell RNA-seq + CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + +soloCBwhitelist - + string(s): file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + None ... no whitelist: all cell barcodes are allowed + +soloCBstart 1 + int>0: cell barcode start base + +soloCBlen 16 + int>0: cell barcode length + +soloUMIstart 17 + int>0: UMI start base + +soloUMIlen 10 + int>0: UMI length + +soloBarcodeReadLength 1 + int: length of the barcode read + 1 ... equal to sum of soloCBlen+soloUMIlen + 0 ... not defined, do not check + +soloBarcodeMate 0 + int: identifies which read mate contains the barcode (CB+UMI) sequence + 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + 1 ... barcode sequence is a part of mate 1 + 2 ... barcode sequence is a part of mate 2 + +soloCBposition - + strings(s): position of Cell Barcode(s) on the barcode read. + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + +soloUMIposition - + string: position of the UMI on the barcode read, same as soloCBposition + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 3_9_3_14 + +soloAdapterSequence - + string: adapter sequence to anchor barcodes. Only one adapter sequence is allowed. + +soloAdapterMismatchesNmax 1 + int>0: maximum number of mismatches allowed in adapter sequence. + +soloCBmatchWLtype 1MM_multi + string: matching the Cell Barcodes to the WhiteList + Exact ... only exact matches allowed + 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + +soloInputSAMattrBarcodeSeq - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. + +soloInputSAMattrBarcodeQual - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + +soloStrand Forward + string: strandedness of the solo libraries: + Unstranded ... no strand information + Forward ... read strand same as the original RNA molecule + Reverse ... read strand opposite to the original RNA molecule + +soloFeatures Gene + string(s): genomic features for which the UMI counts per Cell Barcode are collected + Gene ... genes: reads match the gene transcript + SJ ... splice junctions: reported in SJ.out.tab + GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + +#####UnderDevelopment_begin : not supported - do not use + Transcript3p ... quantification of transcript for 3' protocols +#####UnderDevelopment_end + +soloMultiMappers Unique + string(s): counting method for reads mapping to multiple genes + Unique ... count only reads that map to unique genes + Uniform ... uniformly distribute multi-genic UMIs to all genes + Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + +soloUMIdedup 1MM_All + string(s): type of UMI deduplication (collapsing) algorithm + 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + Exact ... only exactly matching UMIs are collapsed. + NoDedup ... no deduplication of UMIs, count all reads. + 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + +soloUMIfiltering - + string(s): type of UMI filtering (for reads uniquely mapping to genes) + - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --soloUMIdedup 1MM_CR + +soloOutFileNames Solo.out/ features.tsv barcodes.tsv matrix.mtx + string(s): file names for STARsolo output: + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + +soloCellFilter CellRanger2.2 3000 0.99 10 + string(s): cell filtering type and parameters + None ... do not output filtered cells + TopCells ... only report top cells by UMI count, followed by the exact number of cells + CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + +soloOutFormatFeaturesGeneField3 "Gene Expression" + string(s): field 3 in the Gene features.tsv file. If "-", then no 3rd field is output. + +soloCellReadStats None + string: Output reads statistics for each CB + Standard ... standard output + +#####UnderDevelopment_begin : not supported - do not use +soloClusterCBfile - + string: file containing the cluster information for cell barcodes, two columns: CB cluster_index. Only used with --soloFeatures Transcript3p +#####UnderDevelopment_end diff --git a/src/mapping/star_align/utils/process_params.R b/src/mapping/star_align/utils/process_params.R new file mode 100644 index 00000000..c49691f0 --- /dev/null +++ b/src/mapping/star_align/utils/process_params.R @@ -0,0 +1,239 @@ +library(tidyverse) +library(magrittr) + +# until https://github.com/alexdobin/STAR/pull/1710 is merged +param_txt <- readr::read_lines("src/mapping/star_align/utils/parametersDefault") + +dev_begin <- grep("#####UnderDevelopment_begin", param_txt) +dev_end <- grep("#####UnderDevelopment_end", param_txt) + +# strip development sections +nondev_ix <- unlist(map2( + c(1, dev_end + 1), + c(dev_begin - 1, length(param_txt)), function(i, j) { + if (i >= 1 && i < j) { + seq(i, j, 1) + } else { + NULL + } + } +)) + +param_txt2 <- param_txt[nondev_ix] + +# strip comments +param_txt3 <- param_txt2[-grep("^#[^#]", param_txt2)] + +# detect groups +group_ix <- grep("^### ", param_txt3) + +out <- map2_dfr( + group_ix, + c(group_ix[-1] - 1, length(param_txt3)), + function(group_start, group_end) { + group_name <- gsub("^### ", "", param_txt3[[group_start]]) + + group_txt <- param_txt3[seq(group_start + 1, group_end)] + + arg_ix <- grep("^[^ ]", group_txt) + + arguments <- map2_dfr( + arg_ix, + c(arg_ix[-1] - 1, length(group_txt)), + function(arg_start, arg_end) { + # process name and default + first_txt <- group_txt[[arg_start]] + first_regex <- "^([^ ]*) +(.*) *$" + if (!grepl(first_regex, first_txt)) { + stop("Line '", first_txt, "' did not match regex '", first_regex, "'") + } + name <- gsub(first_regex, "\\1", first_txt) + default <- gsub(first_regex, "\\2", first_txt) + + # process type and first description + second_txt <- group_txt[[arg_start + 1]] + second_regex <- "^ +([^:]*):[ ]+(.*)$" + if (!grepl(second_regex, second_txt)) { + stop( + "Line '", second_txt, + "' did not match regex '", second_regex, "'" + ) + } + type <- gsub(second_regex, "\\1", second_txt) + desc_start <- str_trim(gsub(second_regex, "\\2", second_txt)) + + # process more description + desc_cont1 <- group_txt[seq(arg_start + 2, arg_end)] + + desc <- + if (sum(str_length(desc_cont1)) == 0) { + desc_start + } else { + # detect margin + margins <- str_extract(desc_cont1, "^( +)") %>% na.omit() + margin <- margins[[which.min(str_length(margins))]] + desc_cont2 <- gsub(paste0("^", margin), "", desc_cont1) + desc_cont3 <- ifelse(grepl("\\.\\.\\.", desc_cont2), + paste0("- ", desc_cont2), desc_cont2 + ) + desc_cont4 <- str_trim(desc_cont3) + + # construct desc + str_trim(paste0(c(desc_start, "", desc_cont4), "\n", collapse = "")) + } + + tibble( + group_name, + name, + default, + type, + description = desc + ) + } + ) + + arguments + } +) + +# todo: manually fix alignEndsProtrude? +# assigning types +type_map <- c( + "string" = "string", "int" = "integer", "real" = "double", + "double" = "double", "int, string" = "string" +) +file_args <- c( + "genomeDir", "readFilesIn", "sjdbGTFfile", "genomeFastaFiles", + "genomeChainFiles", "readFilesManifest" +) +long_args <- c( + "limitGenomeGenerateRAM", "limitIObufferSize", + "limitOutSAMoneReadBytes", "limitBAMsortRAM" +) +required_args <- c("genomeDir", "readFilesIn") + +# converting examples +as_safe_int <- function(x) { + tryCatch( + { + as.integer(x) + }, + warning = function(e) { + bit64::as.integer64(x) + } + ) +} +safe_split <- function(x) { + strsplit(x, "'[^']*'(*SKIP)(*F)|\"[^\"]*\"(*SKIP)(*F)|\\s+", + perl = TRUE + )[[1]] %>% + gsub("^[\"']|[\"']$", "", rlang::.data) +} +trafos <- list( + string = function(x) x, + integer = as_safe_int, + double = as.numeric, + strings = function(x) safe_split(x), + integers = function(x) sapply(safe_split(x), as_safe_int), + doubles = function(x) as.numeric(safe_split(x)) +) +# remove arguments that are not relevant for viash +removed_args <- c("versionGenome", "parametersFiles", "sysShell", "runDirPerm") +# these settings are defined by the viash component +manual_args <- c( + "runThreadN", "outTmpDir", "runMode", + "outFileNamePrefix", "genomeDir", "readFilesIn" +) + +# make viash-like values +out2 <- out %>% + # remove arguments that are not relevant for viash + filter(!name %in% c(removed_args, manual_args)) %>% + # remove arguments that are related to a different runmode + filter( + !grepl("--runMode", description) | + grepl("--runMode alignReads", description) + ) %>% + filter( + !grepl("--runMode", group_name) | + grepl("--runMode alignReads", group_name) + ) %>% + mutate( + viash_arg = paste0("--", name), + type_step1 = type %>% + str_replace_all( + ".*(int, string|string|int|real|double)\\(?(s?).*", + "\\1\\2" + ), + viash_type = type_map[gsub( + "(int, string|string|int|real|double).*", "\\1", + type_step1 + )], + multiple = type_step1 == "int, string" | + grepl("s$", type_step1) | + grepl("^[4N][\\* ]", type), + default_step1 = default %>% + { + ifelse(. %in% c("-", "None"), NA_character_, .) + }, + viash_default = + mapply( + default_step1, + paste0(viash_type, ifelse(multiple, "s", "")), + FUN = function(str, typ) trafos[[typ]](str) + ), + # update type + viash_type = case_when( + name %in% long_args ~ "long", + name %in% file_args ~ "file", + TRUE ~ viash_type + ), + # turn longs into character because + # yaml::write_yaml doesn't handle longs well + viash_default = ifelse(sapply(viash_default, bit64::is.integer64), + map(viash_default, as.character), + viash_default + ), + group_name = gsub(" - .*", "", group_name), + required = ifelse(name %in% required_args, TRUE, NA) + ) +print(out2, n = 200) +out2 %>% + mutate(i = row_number()) %>% + select(-group_name, -description) + +out2 %>% + filter( + !grepl("--runMode", description) | + grepl("--runMode alignReads", description) + ) + +argument_groups <- map(unique(out2$group_name), function(group_name) { + args <- out2 %>% + filter(group_name == !!group_name) %>% + pmap(function(viash_arg, viash_type, multiple, + viash_default, description, required, ...) { + li <- lst( + name = viash_arg, + type = viash_type, + description = description + ) + if (all(!is.na(viash_default))) { + li$example <- viash_default + } + if (!is.na(multiple) && multiple) { + li$multiple <- multiple + li$multiple_sep <- ";" + } + if (!is.na(required) && required) { + li$required <- required + } + li + }) + list(name = group_name, arguments = args) +}) + +yaml::write_yaml( + list(argument_groups = argument_groups), + "src/mapping/star_align/argument_groups.yaml" +) diff --git a/src/mapping/star_align_v273a/config.vsh.yaml b/src/mapping/star_align_v273a/config.vsh.yaml new file mode 100644 index 00000000..3a03d242 --- /dev/null +++ b/src/mapping/star_align_v273a/config.vsh.yaml @@ -0,0 +1,79 @@ +name: star_align_v273a +namespace: mapping +scope: "public" +description: Align fastq files using STAR. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +# as long as argument_groups.yaml is the same between both versions, just use that +__merge__: [., ../star_align/argument_groups.yaml] +# __merge__: [., argument_groups.yaml] +# manually taking care of the main input files +argument_groups: + - name: Input/Output + arguments: + - type: file + name: --input + alternatives: --readFilesIn + required: true + description: The FASTQ files to be analyzed. Corresponds to the --readFilesIn in the STAR command. + example: [ mysample_S1_L001_R1_001.fastq.gz, mysample_S1_L001_R2_001.fastq.gz ] + multiple: true + - type: file + name: --reference + alternatives: --genomeDir + description: Path to the reference built by star_build_reference. Corresponds to the --genomeDir in the STAR command. + example: /path/to/reference + required: true + - type: file + name: --output + alternatives: --outFileNamePrefix + description: Path to output directory. Corresponds to the --outFileNamePrefix in the STAR command. + example: /path/to/foo + direction: output + required: true +resources: + - type: python_script + # as long as argument_groups.yaml is the same between both versions, just use that + path: ../star_align/script.py + # path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.3a + - PACKAGES gcc g++ make wget zlib1g-dev unzip + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu, middisk ] diff --git a/src/mapping/star_align_v273a/test.py b/src/mapping/star_align_v273a/test.py new file mode 100644 index 00000000..cdd12839 --- /dev/null +++ b/src/mapping/star_align_v273a/test.py @@ -0,0 +1,74 @@ +import subprocess +from os import path +import sys +from pathlib import Path +from tempfile import TemporaryDirectory +import shutil + +## VIASH START +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +## Test 1: use input dir +logger.info("> Running command with folder") +input = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_fastq/" +input_files = [ + input + "tinygex_S1_L002_R1_001.fastq.gz", + input + "tinygex_S1_L002_R2_001.fastq.gz", +] +reference = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/star" +output = "test_output" + +with TemporaryDirectory() as tempdir: + for file in input_files: + shutil.copyfile(file, Path(tempdir) / Path(file).name) + cmd_pars = [ + meta["executable"], + "--input", + tempdir, + "--reference", + reference, + "--output", + output, + "---cpus", + "2", + ] + subprocess.run(cmd_pars, check=True) + +logger.info("> Check if file exists") + +output_path = Path(output) +assert (output_path / "Log.final.out").is_file(), "No output log was created." +assert (output_path / "SJ.out.tab").is_file(), "No output file was created." + + +## Test 2: use input files +logger.info("> Running command with fastq files") +output = "test_output2" + +cmd_pars = [ + meta["executable"], + "--input", + input_files[0], + "--input", + input_files[1], + "--reference", + reference, + "--output", + output, + "---cpus", + "8", +] +out = subprocess.check_output(cmd_pars).decode("utf-8") + +logger.info("> Check if file exists") +assert path.exists(output + "/Log.final.out"), "No output log was created." +assert path.exists(output + "/SJ.out.tab"), "No output was created." + + +logger.info("> Completed Successfully!") diff --git a/src/metadata/add_id/config.vsh.yaml b/src/metadata/add_id/config.vsh.yaml new file mode 100644 index 00000000..919f0757 --- /dev/null +++ b/src/metadata/add_id/config.vsh.yaml @@ -0,0 +1,62 @@ +name: add_id +namespace: "metadata" +scope: "public" +description: | + Add id of .obs. Also allows to make .obs_names (the .obs index) unique + by prefixing the values with an unique id per .h5mu file. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to the input .h5mu. + required: true + example: sample_path + - name: "--input_id" + type: string + description: "The input id." + required: true + - name: "--obs_output" + type: string + required: false + description: "Name of the .obs column where to store the id." + default: "sample_id" + - name: "--output" + description: | + Name of output MuData file. + alternatives: ["-o"] + type: file + direction: output + example: "output.h5mu" + - name: "--make_observation_keys_unique" + type: boolean_true + description: Join the id to the .obs index (.obs_names). +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + +engines: +- type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] \ No newline at end of file diff --git a/src/metadata/add_id/script.py b/src/metadata/add_id/script.py new file mode 100644 index 00000000..0a130c46 --- /dev/null +++ b/src/metadata/add_id/script.py @@ -0,0 +1,79 @@ +from __future__ import annotations +import sys +from mudata import read_h5mu, MuData + +### VIASH START +par = { + "input": "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu", + "output": "foo.h5mu", + "input_id": "mouse", + "make_observation_keys_unique": True, +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def make_observation_keys_unique(sample_id: str, sample: MuData) -> None: + """ + Make the observation keys unique across all samples. At input, + the observation keys are unique within a sample. By adding the sample name + (unique for a sample) to each observation key, the observation key is made + unique across all samples as well. + """ + logger.info( + "Making observation keys unique across all " + "samples by appending prefix '%s' to the observation names.", + sample_id, + ) + sample.obs.index = f"{sample_id}_" + sample.obs.index + make_observation_keys_unique_per_mod(sample_id, sample) + logger.info("Done making observation keys unique.") + + +def make_observation_keys_unique_per_mod(sample_id: str, sample: MuData) -> None: + """ + Updating MuData.obs_names is not allowed (it is read-only). + So the observation keys for each modality has to be updated manually. + """ + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + mod.obs_names = f"{sample_id}_" + mod.obs_names + + +def main(): + logger.info("Reading input file '%s'.", par["input"]) + input_data = read_h5mu(par["input"]) + logger.info( + "Adding column '%s' to global .obs dataframe, populated with ID '%s'", + par["obs_output"], + par["input_id"], + ) + input_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding column to global .obs") + for mod_name, mod_data in input_data.mod.items(): + logger.info( + "Adding column '%s' to .obs dataframe for modality '%s', " + "populated with ID '%s'", + par["obs_output"], + mod_name, + par["input_id"], + ) + mod_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding per-modality columns.") + if par["make_observation_keys_unique"]: + make_observation_keys_unique(par["input_id"], input_data) + logger.info( + "Writing out data to '%s' with compression '%s'.", + par["output"], + par["output_compression"], + ) + input_data.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() diff --git a/src/metadata/add_id/test.py b/src/metadata/add_id/test.py new file mode 100644 index 00000000..972eedd5 --- /dev/null +++ b/src/metadata/add_id/test.py @@ -0,0 +1,166 @@ +import sys +import pytest +import pandas as pd +from anndata import AnnData +from mudata import MuData, read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/metadata/add_id/add_id", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/metadata/add_id/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def generate_h5mu(): + # generate data + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) + ad1 = AnnData(df, obs=obs, var=var) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = AnnData(df, obs=obs2, var=var2) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) + return tmp_mudata + + +@pytest.mark.parametrize("output_compression", ["gzip", "lzf", None]) +def test_add_id( + run_component, small_mudata, small_mudata_path, random_h5mu_path, output_compression +): + output_path = random_h5mu_path() + + args = [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + ] + + small_mudata.obs["sample_id"] = ["test_id", "test_id"] + small_mudata.mod["mod1"].obs["sample_id"] = ["test_id", "test_id"] + small_mudata.mod["mod2"].obs["sample_id"] = ["test_id", "test_id"] + small_mudata.strings_to_categoricals() + small_mudata.update() + + if output_compression: + args.extend(["--output_compression", output_compression]) + # run component + run_component(args) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + + assert_annotation_objects_equal(output_data, small_mudata) + + +def test_add_id_obs_output( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--obs_output", + "test_key", + ] + ) + + small_mudata.obs["test_key"] = ["test_id", "test_id"] + small_mudata.mod["mod1"].obs["test_key"] = ["test_id", "test_id"] + small_mudata.mod["mod2"].obs["test_key"] = ["test_id", "test_id"] + small_mudata.strings_to_categoricals() + small_mudata.update() + + assert output_path.is_file() + output_data = read_h5mu(output_path) + + assert_annotation_objects_equal(output_data, small_mudata) + + +def test_add_id_observations_unique( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--make_observation_keys_unique", + ] + ) + + small_mudata.obs["sample_id"] = ["test_id", "test_id"] + small_mudata.mod["mod1"].obs["sample_id"] = ["test_id", "test_id"] + small_mudata.mod["mod2"].obs["sample_id"] = ["test_id", "test_id"] + small_mudata.strings_to_categoricals() + small_mudata.obs.index = pd.Index(["test_id_obs1", "test_id_obs2"]) + small_mudata.mod["mod1"].obs.index = pd.Index(["test_id_obs1", "test_id_obs2"]) + small_mudata.mod["mod2"].obs.index = pd.Index(["test_id_obs1", "test_id_obs2"]) + small_mudata.update() + + assert output_path.is_file() + output_data = read_h5mu(output_path) + + assert_annotation_objects_equal(output_data, small_mudata) + + +def test_add_id_overwrites_output_column( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): + small_mudata.obs["already_exists"] = "alread_exists" + for _, modality in small_mudata.mod.items(): + modality.obs["already_exists"] = "alread_exists" + output_path = random_h5mu_path() + + # run component + run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--obs_output", + "already_exists", + ] + ) + + small_mudata.obs["already_exists"] = ["test_id", "test_id"] + small_mudata.mod["mod1"].obs["already_exists"] = ["test_id", "test_id"] + small_mudata.mod["mod2"].obs["already_exists"] = ["test_id", "test_id"] + small_mudata.update() + small_mudata.strings_to_categoricals() + + assert output_path.is_file() + output_data = read_h5mu(output_path) + + assert_annotation_objects_equal(output_data, small_mudata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/metadata/grep_annotation_column/config.vsh.yaml b/src/metadata/grep_annotation_column/config.vsh.yaml new file mode 100644 index 00000000..c1387d43 --- /dev/null +++ b/src/metadata/grep_annotation_column/config.vsh.yaml @@ -0,0 +1,97 @@ +name: grep_annotation_column +namespace: "metadata" +scope: "public" +description: | + Perform a regex lookup on a column from the annotation matrices .obs or .var. + The annotation matrix can originate from either a modality, or all modalities (global .var or .obs). +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + description: Arguments related to the input dataset. + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to the input .h5mu. + required: true + example: sample_path + - name: "--input_column" + type: string + required: false + description: "Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix" + - name: "--input_layer" + type: string + required: false + description: | + Input data to use when calculating fraction of observations that match with the query. + Only used when --output_fraction_column is provided. If not specified, .X is used. + - name: "--modality" + description: | + Which modality to get the annotation matrix from. + type: string + required: true + example: "rna" + - name: "--matrix" + type: string + description: "Matrix to fetch the column from that will be searched." + choices: ["var", "obs"] + example: "var" + - name: Outputs + description: Arguments related to how the output will be written. + arguments: + - name: "--output" + description: | + Location of the output MuData file. + alternatives: ["-o"] + type: file + direction: output + example: "output.h5mu" + - name: "--output_match_column" + type: string + required: true + description: "Name of the column to write the result to." + - name: "--output_fraction_column" + type: string + required: false + description: | + For the opposite axis, name of the column to write the fraction of + observations that matches to the pattern. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Query options + description: Options related to the query + arguments: + - name: "--regex_pattern" + description: "Regex to use to match with the input column." + type: string + example: "^[mM][tT]-" + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu + +engines: +- type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] \ No newline at end of file diff --git a/src/metadata/grep_annotation_column/script.py b/src/metadata/grep_annotation_column/script.py new file mode 100644 index 00000000..48523381 --- /dev/null +++ b/src/metadata/grep_annotation_column/script.py @@ -0,0 +1,157 @@ +import sys +import mudata as mu +from pathlib import Path +from operator import attrgetter +from pandas import Series +import scipy as sc +import re +import numpy as np + + +### VIASH START +par = { + "input": "./resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", + "input_layer": None, + "modality": "rna", + "matrix": "var", + "input_column": "gene_symbol", + "regex_pattern": "^[mM][tT]-", + "output": "foo.h5mu", + "input_id": "mouse", + "output_match_column": "test", + "output_fraction_column": "fraction_test", + "output_compression": "gzip", +} + +meta = {"resources_dir": "src/utils"} +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def describe_array(arr, msg): + # Note: sc.stats returns a DescribeResult NamedTuple. For NamedTuples, + # the _asdict method is public facing even though it starts with an underscore. + description = sc.stats.describe(arr)._asdict() + logger.info( + "%s:\nshape: %s\nmean: %s\nnobs: %s\n" + "variance: %s\nmin: %s\nmax: %s\ncontains na: %s\ndtype: %s\ncontains 0: %s", + msg, + arr.shape, + description["mean"], + description["nobs"], + description["variance"], + description["minmax"][0], + description["minmax"][1], + np.isnan(arr).any(), + arr.dtype, + (arr == 0).any(), + ) + + +def main(par): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + logger.info(f"Compiling regular expression '{par['regex_pattern']}'.") + try: + compiled_regex = re.compile(par["regex_pattern"]) + except (TypeError, re.error) as e: + raise ValueError( + f"{par['regex_pattern']} is not a valid regular expression pattern." + ) from e + else: + if compiled_regex.groups: + raise NotImplementedError( + "Using match groups is not supported by this component." + ) + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + + modality_data = mu.read_h5ad(input_file, mod=mod_name) + logger.info("Reading input file done.") + logger.info("Using annotation dataframe '%s'.", par["matrix"]) + annotation_matrix = getattr(modality_data, par["matrix"]) + default_column = {"var": attrgetter("var_names"), "obs": attrgetter("obs_names")} + if par["input_column"]: + logger.info("Input column '%s' was specified.", par["input_column"]) + try: + annotation_column = annotation_matrix[par["input_column"]] + except KeyError as e: + raise ValueError( + f"Column {par['input_column']} could not be found for modality " + f"{par['modality']}. Available columns:" + f" {','.join(annotation_matrix.columns.to_list())}" + ) from e + else: + logger.info(f"No input column specified, using '.{par['matrix']}_names'") + annotation_column = default_column[par["matrix"]](modality_data).to_series() + logger.info("Applying regex search.") + grep_result = annotation_column.str.contains(par["regex_pattern"], regex=True) + logger.info("Search results: %s", grep_result.value_counts()) + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + grep_result = grep_result.to_numpy(dtype="bool", na_value=False) + + other_axis_attribute = {"var": "obs", "obs": "var"} + if par["output_fraction_column"]: + logger.info( + "Enabled writing the fraction of values that matches to the pattern." + ) + input_layer = ( + modality_data.X + if not par["input_layer"] + else modality_data.layers[par["input_layer"]] + ) + totals = np.ravel(input_layer.sum(axis=1)) + describe_array(totals, "Summary of total counts for layer") + counts_for_matches = np.ravel(input_layer[:, grep_result].sum(axis=1)) + describe_array(counts_for_matches, "Summary of counts matching grep") + with np.errstate(all="raise"): + pct_matching = np.divide( + counts_for_matches, + totals, + out=np.zeros_like(totals, dtype=np.float64), + where=(~np.isclose(totals, np.zeros_like(totals))), + ) + logger.info("Testing wether or not fractions data contains NA.") + assert ~np.isnan(pct_matching).any(), "Fractions should not contain NA." + logger.info("Fraction statistics: \n%s", Series(pct_matching).describe()) + pct_matching = np.where(np.isclose(pct_matching, 0, atol=1e-6), 0, pct_matching) + pct_matching = np.where(np.isclose(pct_matching, 1, atol=1e-6), 1, pct_matching) + assert (np.logical_and(pct_matching >= 0, pct_matching <= 1)).all(), ( + "Fractions are not within bounds, please report this as a bug" + ) + output_matrix = other_axis_attribute[par["matrix"]] + logger.info( + "Writing fractions to matrix '%s', column '%s'", + output_matrix, + par["output_fraction_column"], + ) + getattr(modality_data, output_matrix)[par["output_fraction_column"]] = ( + pct_matching + ) + logger.info( + "Adding values that matched the pattern to '%s', column '%s'", + par["matrix"], + par["output_match_column"], + ) + getattr(modality_data, par["matrix"])[par["output_match_column"]] = grep_result + logger.info( + "Writing out data to '%s' with compression '%s'.", + output_file, + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + output_file, par["input"], mod_name, modality_data, par["output_compression"] + ) + + +if __name__ == "__main__": + main(par) diff --git a/src/metadata/grep_annotation_column/test.py b/src/metadata/grep_annotation_column/test.py new file mode 100644 index 00000000..778ab39f --- /dev/null +++ b/src/metadata/grep_annotation_column/test.py @@ -0,0 +1,493 @@ +import sys +import pytest +import pandas as pd +import numpy as np +import scipy +import math +from anndata import AnnData +from mudata import MuData, read_h5mu +from subprocess import CalledProcessError +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column +from itertools import permutations, islice + + +## VIASH START +meta = { + "executable": "./target/executable/metadata/grep_annotation_column/grep_annotation_column", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/metadata/grep_annotation_column/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def generate_h5mu(): + # generate data + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) + ad1 = AnnData(df, obs=obs, var=var) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = AnnData(df, obs=obs2, var=var2) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) + return tmp_mudata + + +@pytest.fixture( + params=[ + np.float32, + np.float64, + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int64, + ] +) +def very_sparse_mudata(request): + # NOTE: np.float16 is not a supported type in scipy! + # See https://github.com/scipy/scipy/issues/20200#issuecomment-1982170609 + # and https://github.com/scipy/scipy/issues/20200 + rng = np.random.default_rng() + shape = (10000, 200) + random_counts = scipy.sparse.random( + *shape, + density=0.00001, + format="csr", + dtype=request.param, + random_state=rng, + data_rvs=lambda length: np.array([1] * length), + ) + permutation_length_obs = int(math.log(shape[0]) / math.log(26)) + 1 + obs_index_perms = permutations( + "abcdefghijklmnopqrstuvwxyz", r=permutation_length_obs + ) + obs_index = pd.Index(["".join(x) for x in islice(obs_index_perms, shape[0])]) + obs = pd.DataFrame(index=obs_index) + permutation_length_var = int(math.log(shape[1]) / math.log(26)) + 1 + var_index_perms = permutations( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ", r=permutation_length_var + ) + var_index = pd.Index(["".join(x) for x in islice(var_index_perms, shape[1])]) + var = pd.DataFrame(index=var_index) + mod = AnnData(X=random_counts, var=var, obs=obs) + return MuData({"mod1": mod}) + + +@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) +def test_grep_column_default( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--regex_pattern", + "^var1", + "--output_match_column", + "test", + "--output_compression", + compression_format, + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + + assert_annotation_objects_equal( + input_path, + remove_annotation_column(output_data, "test", "var", "mod1"), + check_data=True, + ) + + +@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) +def test_grep_column( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_compression", + compression_format, + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + + assert_annotation_objects_equal( + input_path, + remove_annotation_column(output_data, "test", "var", "mod1"), + check_data=True, + ) + + +@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) +def test_grep_column_fraction_column( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + "--output_compression", + compression_format, + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [ + 1 / 6, + 4 / 15, + ] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) +def test_fraction_column_nothing_matches( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^doesnotmatch$", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + "--output_compression", + compression_format, + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [False, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [0.0, 0.0] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +def test_fraction_column_with_no_counts( + run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file +): + output_path = random_h5mu_path() + generate_h5mu.mod["mod1"].X[0] = 0 + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [ + 0.0, + 4 / 15, + ] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +def test_fraction_column_very_sparse( + run_component, very_sparse_mudata, random_h5mu_path, write_mudata_to_file +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(very_sparse_mudata) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) +def test_fraction_column_input_layer( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): + output_path = random_h5mu_path() + generate_h5mu.mod["mod1"].layers["test_data"] = generate_h5mu.mod["mod1"].X.copy() + generate_h5mu.mod["mod1"].X = None + input_path = write_mudata_to_file(generate_h5mu) + + # run component + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + "--input_layer", + "test_data", + "--output_compression", + compression_format, + ] + ) + assert output_path.is_file() + + # check output + output_data = read_h5mu(output_path) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [1 / 6, 4 / 15] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +def test_missing_column( + run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "filliberke", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: Column filliberke could not be found for modality mod1" + in err.value.stdout.decode("utf-8") + ) + + +def test_invalid_regex_pattern( + run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file +): + output_path = random_h5mu_path() + input_path = write_mudata_to_file(generate_h5mu) + + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "(a", + "--output_match_column", + "test", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: (a is not a valid regular expression pattern." + in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-s"])) diff --git a/src/metadata/join_csv/config.vsh.yml b/src/metadata/join_csv/config.vsh.yml new file mode 100644 index 00000000..53ae52aa --- /dev/null +++ b/src/metadata/join_csv/config.vsh.yml @@ -0,0 +1,87 @@ +name: join_csv +namespace: "metadata" +scope: "public" +description: "Join a csv containing metadata to the .obs or .var field of a mudata file." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "MuData Input" + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obs_key" + type: string + description: | + Obs column name where the sample id can be found for each observation to join on. + Useful when adding metadata to concatenated samples. + Mutually exclusive with `--var_key`." + - name: "--var_key" + type: string + description: | + Var column name where the sample id can be found for each variable to join on. + Mutually exclusive with `--obs_key`." + - name: "MuData Output" + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + - name: "Metadata Input" + arguments: + - name: "--input_csv" + type: file + required: true + direction: input + description: ".csv file containing metadata" + example: metadata.csv + - name: "--csv_key" + type: string + required: false + default: "id" + description: "column of the the csv that corresponds to the sample id." +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [/src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] diff --git a/src/metadata/join_csv/script.py b/src/metadata/join_csv/script.py new file mode 100644 index 00000000..7feba8be --- /dev/null +++ b/src/metadata/join_csv/script.py @@ -0,0 +1,57 @@ +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +par = { + "input": "work/f5/5f6365898ca5a42a360301a0c9e200/TSP15_Eye_ScleraEtc_10X_2_1.add_id.output.h5mu", + "input_csv": "work/f5/5f6365898ca5a42a360301a0c9e200/sample_info.csv", + "output": "foo.h5mu", + "modality": "rna", + "csv_key": "id", + "obs_key": "sample_id", + "var_key": None, +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +if par["obs_key"] and par["var_key"]: + raise ValueError("--obs_key can not be used in conjuction with --var_key.") +if not (par["obs_key"] or par["var_key"]): + raise ValueError("Must define either --obs_key or --var_key") + +logger.info("Read metadata csv from file") +metadata = pd.read_csv(par["input_csv"], sep=",", header=0, index_col=par["csv_key"]) +metadata.fillna("", inplace=True) + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining csv to mudata") +matrix = "var" if par["var_key"] else "obs" +matrix_sample_column_name = par["var_key"] if par["var_key"] else par["obs_key"] +original_matrix = getattr(mod_data, matrix) +sample_ids = original_matrix[matrix_sample_column_name] + +try: + new_columns = metadata.loc[sample_ids.tolist()] +except KeyError as e: + raise KeyError( + f"Not all sample IDs selected from {matrix} " + "(using the column selected with --var_key or --obs_key) were found in " + "the csv file." + ) from e +new_matrix = pd.concat( + [original_matrix.reset_index(drop=True), new_columns.reset_index(drop=True)], axis=1 +).set_axis(original_matrix.index) +setattr(mod_data, matrix, new_matrix) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) diff --git a/src/metadata/join_csv/test.py b/src/metadata/join_csv/test.py new file mode 100644 index 00000000..9029f701 --- /dev/null +++ b/src/metadata/join_csv/test.py @@ -0,0 +1,186 @@ +import sys +import pytest +import pandas as pd +from anndata import AnnData +from mudata import MuData, read_h5mu +import subprocess +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/integrate/add_metadata/add_metadata", +} +## VIASH END + + +@pytest.fixture +def modality_1(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + return AnnData(df, obs=obs, var=var) + + +@pytest.fixture +def modality_2(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + return AnnData(df, obs=obs2, var=var2) + + +@pytest.fixture +def sample_h5mu(modality_1, modality_2): + mudata = MuData({"mod1": modality_1, "mod2": modality_2}) + return mudata + + +@pytest.fixture +def sample_h5mu_path(sample_h5mu, random_path): + output = random_path() + sample_h5mu.write(output) + return output + + +def test_add_metadata_var(run_component, random_path, sample_h5mu, sample_h5mu_path): + input_csv = random_path("csv") + output_h5mu = random_path("h5mu") + + # create csv + csv = pd.DataFrame( + {"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) + csv.to_csv(str(input_csv), index=False) + + run_component( + [ + "--input", + str(sample_h5mu_path), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--var_key", + "sample_id_var", + "--csv_key", + "id", + "--compression_output", + "gzip", + ] + ) + output_mudata = read_h5mu(output_h5mu) + expected_var = pd.DataFrame( + { + "Feat": ["a", "b", "c"], + "sample_id_var": ["sample1", "sample2", "sample1"], + "foo": ["v", "w", "v"], + "bar": ["x", "y", "x"], + }, + index=pd.Index(["var1", "var2", "var3"]), + ).astype( + { + "Feat": "object", + "sample_id_var": "category", + "foo": "category", + "bar": "category", + } + ) + + sample_h5mu.mod["mod1"].var = expected_var + assert_annotation_objects_equal(sample_h5mu, output_mudata) + + +def test_add_metadata_matrix_sample_column(run_component, tmp_path, sample_h5mu): + input_h5mu, input_mudata = sample_h5mu + input_csv = tmp_path / "input.csv" + output_h5mu = tmp_path / "output.h5mu" + + # create csv + csv = pd.DataFrame( + {"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) + csv.to_csv(str(input_csv), index=False) + + run_component( + [ + "--input", + str(input_h5mu), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--obs_key", + "sample_id", + "--csv_key", + "id", + ] + ) + + result = read_h5mu(output_h5mu) + expected_obs = pd.DataFrame( + { + "Obs": ["A", "B"], + "sample_id": ["sample1", "sample2"], + "foo": ["v", "w"], + "bar": ["x", "y"], + }, + index=pd.Index(["obs1", "obs2"]), + ).astype({"Obs": "object", "foo": "object", "bar": "object"}) + pd.testing.assert_frame_equal(result.mod["mod1"].obs, expected_obs) + pd.testing.assert_frame_equal(result.mod["mod1"].var, input_mudata.mod["mod1"].var) + pd.testing.assert_frame_equal(result.mod["mod2"].obs, input_mudata.mod["mod2"].obs) + pd.testing.assert_frame_equal(result.mod["mod2"].var, input_mudata.mod["mod2"].var) + + +def test_add_not_all_samples_in_csv_raises(run_component, tmp_path, sample_h5mu): + input_h5mu, input_mudata = sample_h5mu + input_csv = tmp_path / "input.csv" + output_h5mu = tmp_path / "output.h5mu" + + csv = pd.DataFrame( + {"id": ["sample1", "lorem"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) + csv.to_csv(str(input_csv), index=False) + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + str(input_h5mu), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--obs_key", + "sample_id", + "--csv_key", + "id", + ] + ) + assert ( + "Not all sample IDs selected from obs (using the column selected " + "with --var_key or --obs_key) were found in the csv file." + in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-k", "test_add_metadata_var"])) diff --git a/src/metadata/join_uns_to_obs/config.vsh.yml b/src/metadata/join_uns_to_obs/config.vsh.yml new file mode 100644 index 00000000..9583d698 --- /dev/null +++ b/src/metadata/join_uns_to_obs/config.vsh.yml @@ -0,0 +1,61 @@ +name: join_uns_to_obs +namespace: "metadata" +scope: "public" +description: "Join a data frame of length 1 (1 row index value) in .uns containing metadata to the .obs of a mudata file." +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--uns_key" + description: | + Lookup key from .uns pointing to the input data. + type: string + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [/src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] \ No newline at end of file diff --git a/src/metadata/join_uns_to_obs/script.py b/src/metadata/join_uns_to_obs/script.py new file mode 100644 index 00000000..c0eb6fb4 --- /dev/null +++ b/src/metadata/join_uns_to_obs/script.py @@ -0,0 +1,41 @@ +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "uns_key": "metrics_cellranger", + "output": "foo.h5mu", + "modality": "rna", +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining uns to obs") +# get data frame +uns_df = mod_data.uns[par["uns_key"]] + +# check for overlapping colnames +intersect_keys = uns_df.keys().intersection(mod_data.obs.keys()) +obs_drop = mod_data.obs.drop(intersect_keys, axis=1) + +# create data frame to join +uns_df_rep = uns_df.loc[uns_df.index.repeat(mod_data.n_obs)] +uns_df_rep.index = mod_data.obs_names + +# create new obs +mod_data.obs = pd.concat([obs_drop, uns_df_rep], axis=1) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) diff --git a/src/metadata/join_uns_to_obs/test.py b/src/metadata/join_uns_to_obs/test.py new file mode 100644 index 00000000..d311e0dd --- /dev/null +++ b/src/metadata/join_uns_to_obs/test.py @@ -0,0 +1,97 @@ +import sys +import pytest +import pandas as pd +from anndata import AnnData +from mudata import MuData, read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/metadata/join_uns_to_obs/join_uns_to_obs", + "config": "./src/metadata/join_uns_to_obs/config.vsh.yml", +} +## VIASH END + + +@pytest.fixture +def ad_w_uns(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + obsm = pd.DataFrame( + [["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"] + ) + return AnnData(df, obs=obs, var=var, uns={"obsm1": obsm}) + + +@pytest.fixture +def ad_wo_uns(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + var = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + return AnnData(df, obs=obs, var=var) + + +@pytest.fixture +def sample_h5mu(ad_w_uns, ad_wo_uns): + mudata = MuData({"mod1": ad_w_uns, "mod2": ad_wo_uns}) + return mudata + + +def test_join_uns_to_obs( + run_component, random_h5mu_path, write_mudata_to_file, sample_h5mu +): + input_file = write_mudata_to_file(sample_h5mu) + output_file = random_h5mu_path() + run_component( + [ + "--input", + str(input_file), + "--modality", + "mod1", + "--uns_key", + "obsm1", + "--output", + str(output_file), + ] + ) + + expected_obs = pd.DataFrame( + { + "Obs": ["A", "B"], + "sample_id": ["sample1", "sample2"], + "uns_col1": ["X", "X"], + "uns_col2": ["W", "W"], + }, + index=pd.Index(["obs1", "obs2"]), + ).astype( + { + "Obs": "object", + "sample_id": "object", + "uns_col1": "category", + "uns_col2": "category", + } + ) + + assert output_file.is_file() + output_mudata = read_h5mu(output_file) + assert "obsm1" in output_mudata.mod["mod1"].uns + + sample_h5mu.mod["mod1"].obs = expected_obs + assert_annotation_objects_equal(sample_h5mu, output_mudata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/metadata/move_obsm_to_obs/config.vsh.yaml b/src/metadata/move_obsm_to_obs/config.vsh.yaml new file mode 100644 index 00000000..d4a43961 --- /dev/null +++ b/src/metadata/move_obsm_to_obs/config.vsh.yaml @@ -0,0 +1,65 @@ +name: move_obsm_to_obs +namespace: "metadata" +scope: "public" +description: | + Move a matrix from .obsm to .obs. Newly created columns in .obs will + be created from the .obsm key suffixed with an underscore and the name of the columns + of the specified .obsm matrix. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "MuData Input" + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obsm_key" + type: string + description: Key of a data structure to move from `.obsm` to `.obs`. + required: true + - name: "MuData Output" + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + __merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [/src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] diff --git a/src/metadata/move_obsm_to_obs/script.py b/src/metadata/move_obsm_to_obs/script.py new file mode 100644 index 00000000..6f36c18b --- /dev/null +++ b/src/metadata/move_obsm_to_obs/script.py @@ -0,0 +1,71 @@ +import sys +from functools import partial +from pandas.errors import MergeError +from mudata import read_h5ad + +## VIASH START +par = { + "input": "input.h5mu", + "modality": "mod1", + "obsm_key": "obsm_key", + "output": "output.h5mu", + "output_compression": "gzip", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from file %s", par["modality"], par["input"]) +try: + mod_data = read_h5ad(par["input"], mod=par["modality"]) +except KeyError: + raise ValueError(f"Modality {par['modality']} does not exist.") + +logger.info("Moving .obm key %s", par["obsm_key"]) +try: + obsm_matrix = mod_data.obsm[par["obsm_key"]].copy() +except KeyError: + raise ValueError( + f".obsm key {par['obsm_key']} was not found in " + f".obsm slot for modality {par['modality']}." + ) + + +obsm_matrix.rename( + partial("{key}_{}".format, key=par["obsm_key"]), + axis="columns", + copy=False, + inplace=True, +) + +original_n_obs = len(mod_data.obs) +try: + logger.info(f".obs names: {mod_data.obs_names}") + logger.info(f".obsm index: {obsm_matrix.index}") + new_obs = mod_data.obs.drop(obsm_matrix.columns, axis=1, errors="ignore") + new_obs = new_obs.merge( + obsm_matrix, + how="left", + validate="one_to_one", + left_index=True, + right_index=True, + ) + mod_data.obs = new_obs +except MergeError: + raise ValueError( + f"Could not join .obsm matrix at {par['obsm_key']} to .obs because there " + "are some observation that are not overlapping between the two matrices " + "(indexes should overlap). This is either a bug or your mudata file is corrupt." + ) +del mod_data.obsm[par["obsm_key"]] + +logger.info( + "Write output to %s with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) diff --git a/src/metadata/move_obsm_to_obs/test.py b/src/metadata/move_obsm_to_obs/test.py new file mode 100644 index 00000000..923145b9 --- /dev/null +++ b/src/metadata/move_obsm_to_obs/test.py @@ -0,0 +1,185 @@ +import pytest +import re +import pandas as pd +import uuid +from anndata import AnnData +from mudata import MuData, read_h5mu +from subprocess import CalledProcessError + +## VIASH START +meta = { + "name": "move_obsm_to_obs", + "resources_dir": "resources_test/", + "executable": "target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs", + "config": "src/metadata/move_obsm_to_obs/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def h5mu(): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + obsm = { + "obsm_key": pd.DataFrame( + [["foo", "bar"], ["lorem", "ipsum"]], + index=obs.index, + columns=["obsm_col1", "obsm_col2"], + ) + } + ad1 = AnnData(df, obs=obs, var=var, obsm=obsm) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = AnnData(df, obs=obs2, var=var2) + return MuData({"mod1": ad1, "mod2": ad2}) + + +@pytest.fixture +def write_temp_h5mu(tmp_path): + def wrapper(test_h5mu): + test_h5mu_path = tmp_path / f"{str(uuid.uuid4())}.h5mu" + test_h5mu.write_h5mu(test_h5mu_path) + return test_h5mu_path + + return wrapper + + +@pytest.fixture +def h5mu_with_non_overlapping_observations(h5mu): + h5mu.mod["mod1"].obsm["obsm_key"].index = pd.Index(["obs1", "doesnotexist"]) + return h5mu + + +def test_move_obsm_to_obs(run_component, h5mu, write_temp_h5mu, tmp_path): + output = tmp_path / "output.h5mu" + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) + assert output.is_file(), "Some output file must have been created." + output_data = read_h5mu(output) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.index, pd.Index(["obs1", "obs2"]) + ) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.columns, + pd.Index(["Obs", "sample_id", "obsm_key_obsm_col1", "obsm_key_obsm_col2"]), + ) + assert "obsm_key" not in output_data.mod["mod1"].obsm + + +def test_move_obsm_to_obs_non_overlapping_obs_fails( + run_component, write_temp_h5mu, h5mu_with_non_overlapping_observations, tmp_path +): + output = tmp_path / "output.h5mu" + # Mudata seems to handle this error, but keep this test in just in case mudata drops the ball. + with pytest.raises((CalledProcessError, ValueError)) as err: + run_component( + [ + "--input", + write_temp_h5mu(h5mu_with_non_overlapping_observations), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) + expected_message = r"value.index does not match parent’s obs names" + if isinstance(err, CalledProcessError): + assert re.search(expected_message, err.value.stdout.decode("utf-8")) + else: + assert re.search(expected_message, str(err)) + + +def test_error_non_existing_modality(run_component, h5mu, write_temp_h5mu, tmp_path): + output = tmp_path / "output.h5mu" + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "foo", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) + assert re.search( + r"ValueError: Modality foo does not exist\.", err.value.stdout.decode("utf-8") + ) + + +def test_execute_twice_overwrites(run_component, h5mu, write_temp_h5mu, tmp_path): + output_run_1 = tmp_path / "output1.h5mu" + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output_run_1, + ] + ) + output_data_run_1 = read_h5mu(output_run_1) + output_data_run_1.mod["mod1"].obsm = { + "obsm_key": pd.DataFrame( + [["dolor", "amet"], ["jommeke", "filiberke"]], + index=output_data_run_1.mod["mod1"].obs_names, + columns=["obsm_col1", "obsm_col2"], + ) + } + + output_run_2 = tmp_path / "output2.h5mu" + input_run_2 = write_temp_h5mu(output_data_run_1) + run_component( + [ + "--input", + input_run_2, + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output_run_2, + ] + ) + assert output_run_2.is_file(), "Some output file must have been created." + output_data = read_h5mu(output_run_2) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.index, pd.Index(["obs1", "obs2"]) + ) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.columns, + pd.Index(["Obs", "sample_id", "obsm_key_obsm_col1", "obsm_key_obsm_col2"]), + ) + assert "obsm_key" not in output_data.mod["mod1"].obsm + + +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/neighbors/bbknn/config.vsh.yaml b/src/neighbors/bbknn/config.vsh.yaml new file mode 100644 index 00000000..e2d5bbb6 --- /dev/null +++ b/src/neighbors/bbknn/config.vsh.yaml @@ -0,0 +1,101 @@ +name: bbknn +namespace: "neighbors" +scope: "public" +description: | + BBKNN network generation +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + # inputs + - name: "--input" + alternatives: [-i] + type: file + description: Input h5mu file + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obsm_input" + description: The dimensionality reduction in `.obsm` to use for neighbour detection. Defaults to X_pca. + type: string + default: "X_pca" + - name: "--obs_batch" + type: string + description: .obs column name discriminating between your batches. + default: "batch" + + # outputs + - name: "--output" + alternatives: ["-o"] + type: file + description: Output .h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--uns_output" + type: string + default: neighbors + description: Mandatory .uns slot to store various neighbor output objects. + + - name: "--obsp_distances" + type: string + default: "distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + + - name: "--obsp_connectivities" + type: string + default: "connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + + # arguments + - name: "--n_neighbors_within_batch" + type: integer + description: How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches. + default: 3 + - name: "--n_pcs" + type: integer + description: How many dimensions (in case of PCA, principal components) to use in the analysis. + default: 50 + - name: "--n_trim" + type: integer + description: Trim the neighbours of each cell to these many top connectivities. May help with population independence and improve the tidiness of clustering. The lower the value the more independent the individual populations, at the cost of more conserved batch effect. If `None` (default), sets the parameter value automatically to 10 times `neighbors_within_batch` times the number of batches. Set to 0 to skip. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - build-essential + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - bbknn + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowcpu, highmem, middisk] diff --git a/src/neighbors/bbknn/script.py b/src/neighbors/bbknn/script.py new file mode 100644 index 00000000..6ff2285c --- /dev/null +++ b/src/neighbors/bbknn/script.py @@ -0,0 +1,48 @@ +import sys +import bbknn +from mudata import read_h5ad + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "obs_batch": "sample_id", + "obsm_input": "X_pca", + "n_neighbors_within_batch": 3, + "n_trim": None, + "n_pcs": 50, + "output": "output.h5mu", + "output_compression": "gzip", + "obsp_connectivities": "my_connectivities", + "obsp_distances": "my_distances", + "uns_output": "my_neighbors", +} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +adata = read_h5ad(par["input"], mod=par["modality"]) + +# copy data +tmp_adata = adata.copy() +bbknn.bbknn( + tmp_adata, + use_rep=par["obsm_input"], + batch_key=par["obs_batch"], + neighbors_within_batch=par["n_neighbors_within_batch"], + n_pcs=par["n_pcs"], + trim=par["n_trim"], +) + +# store output +adata.obsp[par["obsp_connectivities"]] = tmp_adata.obsp["connectivities"] +adata.obsp[par["obsp_distances"]] = tmp_adata.obsp["distances"] +adata.uns[par["uns_output"]] = tmp_adata.uns["neighbors"] +adata.uns[par["uns_output"]]["distances_key"] = par["obsp_distances"] +adata.uns[par["uns_output"]]["connectivities_key"] = par["obsp_connectivities"] + +# write to file +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) diff --git a/src/neighbors/bbknn/test.py b/src/neighbors/bbknn/test.py new file mode 100644 index 00000000..1788368f --- /dev/null +++ b/src/neighbors/bbknn/test.py @@ -0,0 +1,96 @@ +import sys +import pytest +from mudata import read_h5mu + +## VIASH START +meta = { + "executable": "./target/executable/neighbors/bbknn/bbknn", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture +def sample_mudata(tmp_path): + tmp_input_path = tmp_path / "input.h5mu" + + # create input data + mudata = read_h5mu(input_file) + rna_adata = mudata.mod["rna"] + + # remove previous output (if any) + if "connectivities" in rna_adata.obsp: + del rna_adata.obsp["connectivities"] + if "distances" in rna_adata.obsp: + del rna_adata.obsp["distances"] + if "neighbors" in rna_adata.uns: + del rna_adata.uns["neighbors"] + + # write to file + mudata.write(tmp_input_path) + + return tmp_input_path, mudata + + +def test_simple_integration(run_component, tmp_path, sample_mudata): + tmp_input_path, mudata = sample_mudata + output_path = tmp_path / "output.h5mu" + print(mudata, flush=True) + # run component + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obs_batch", + "harmony_integration_leiden_1.0", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + ] + ) + assert output_path.exists() + data = read_h5mu(output_path).mod["rna"] + assert "connectivities" in data.obsp + assert "distances" in data.obsp + assert "neighbors" in data.uns + + +def test_alternative_names(run_component, tmp_path, sample_mudata): + tmp_input_path, mudata = sample_mudata + output_path = tmp_path / "output.h5mu" + + # run component + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obs_batch", + "harmony_integration_leiden_1.0", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + "--uns_output", + "my_neighbors", + "--obsp_connectivities", + "my_connectivities", + "--obsp_distances", + "my_distances", + ] + ) + assert output_path.exists() + data = read_h5mu(output_path).mod["rna"] + assert "my_connectivities" in data.obsp + assert "my_distances" in data.obsp + assert "my_neighbors" in data.uns + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/neighbors/find_neighbors/config.vsh.yaml b/src/neighbors/find_neighbors/config.vsh.yaml new file mode 100644 index 00000000..fd6bd43a --- /dev/null +++ b/src/neighbors/find_neighbors/config.vsh.yaml @@ -0,0 +1,101 @@ +name: find_neighbors +namespace: "neighbors" +scope: "public" +description: | + Compute a neighborhood graph of observations [McInnes18]. + + The neighbor search efficiency of this heavily relies on UMAP [McInnes18], which also provides a method for estimating connectivities of data points - the connectivity of the manifold (method=='umap'). If method=='gauss', connectivities are computed according to [Coifman05], in the adaption of [Haghverdi16]. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--obsm_input" + type: string + default: "X_pca" + required: false + description: "Which .obsm slot to use as a starting PCA embedding." + + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file containing the found neighbors. + direction: output + example: output.h5mu + + - name: "--uns_output" + type: string + default: neighbors + description: Mandatory .uns slot to store various neighbor output objects. + + - name: "--obsp_distances" + type: string + default: "distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + + - name: "--obsp_connectivities" + type: string + default: "connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + + # arguments + - name: "--metric" + type: string + default: "euclidean" + description: The distance metric to be used in the generation of the nearest neighborhood network. + choices: [ 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule' ] + + - name: "--num_neighbors" + type: integer + default: 15 + description: The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. If knn is True, number of nearest neighbors to be searched. If knn is False, a Gaussian kernel width is set to the distance of the n_neighbors neighbor. + + - name: "--seed" + type: integer + default: 0 + description: A random seed. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [/src/base/requirements/python_test_setup.yaml, .] + +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, midmem, middisk] diff --git a/src/neighbors/find_neighbors/script.py b/src/neighbors/find_neighbors/script.py new file mode 100644 index 00000000..65f4921c --- /dev/null +++ b/src/neighbors/find_neighbors/script.py @@ -0,0 +1,66 @@ +import sys +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.float_ = np.float64 +sys.modules["numpy"] = numpy_module + +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "output.h5mu", + "metric": "cosine", + "num_neighbors": 15, + "modality": "rna", + "obsm_input": "X_pca", + "uns_output": "neighbors", + "obsp_distances": "distances", + "obsp_connectivities": "connectivities", + "seed": None, +} +meta = {"resources_dir": "."} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing a neighborhood graph.") +neighbors = sc.Neighbors(adata) +neighbors.compute_neighbors( + n_neighbors=par["num_neighbors"], + use_rep=par["obsm_input"], + metric=par["metric"], + random_state=par["seed"], + method="umap", +) + +adata.uns[par["uns_output"]] = { + "connectivities_key": par["obsp_connectivities"], + "distances_key": par["obsp_distances"], + "params": { + "n_neighbors": neighbors.n_neighbors, + "method": "umap", + "random_state": par["seed"], + "metric": par["metric"], + "use_rep": par["obsm_input"], + }, +} + +adata.obsp[par["obsp_distances"]] = neighbors.distances +adata.obsp[par["obsp_connectivities"]] = neighbors.connectivities + + +logger.info("Writing to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) diff --git a/src/neighbors/find_neighbors/test.py b/src/neighbors/find_neighbors/test.py new file mode 100644 index 00000000..2b43fb7b --- /dev/null +++ b/src/neighbors/find_neighbors/test.py @@ -0,0 +1,64 @@ +import sys +import pytest +import mudata as mu + +## VIASH START +meta = { + "executable": "./target/docker/graph/neighbors/find_neighbors", + "name": "find_neighbors", + "resources_dir": "resources_test/", +} +## VIASH END + +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" +output = "output.h5mu" + + +def test_run(run_component, tmp_path): + output = tmp_path / "output.h5mu" + + cmd_pars = [ + meta["executable"], + "--input", + input, + "--output", + str(output), + "--obsm_input", + "X_pca", + "--uns_output", + "foo_neigh", + "--obsp_distances", + "bar_dist", + "--obsp_connectivities", + "baz_conn", + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert output.is_file(), "No output was created." + + mu_input = mu.read_h5mu(input) + mu_output = mu.read_h5mu(output) + + assert "rna" in mu_output.mod, 'Output should contain data.mod["prot"].' + assert "prot" in mu_output.mod, 'Output should contain data.mod["prot"].' + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + prot_in = mu_input.mod["prot"] + prot_out = mu_output.mod["prot"] + + assert rna_in.shape == rna_out.shape, "Should have same shape as before" + assert prot_in.shape == prot_out.shape, "Should have same shape as before" + + assert "foo_neigh" in rna_out.uns, "Output should have .uns['foo_neigh']" + assert "baz_conn" in rna_out.obsp, "Output should have .obsp['baz_conn']" + assert "bar_dist" in rna_out.obsp, "Output should have .obsp['bar_dist']" + assert "foo_neigh" not in rna_in.uns, "Output should not have .uns['foo_neigh']" + assert "baz_conn" not in rna_in.obsp, "Input should not have .obsp['baz_conn']" + assert "bar_dist" not in rna_in.obsp, "Input should not have .obsp['bar_dist']" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/process_10xh5/filter_10xh5/config.vsh.yaml b/src/process_10xh5/filter_10xh5/config.vsh.yaml new file mode 100644 index 00000000..9e0e6a98 --- /dev/null +++ b/src/process_10xh5/filter_10xh5/config.vsh.yaml @@ -0,0 +1,75 @@ +name: filter_10xh5 +namespace: "process_10xh5" +scope: "public" +description: | + Filter a 10x h5 dataset. +usage: | + filter_10xh5 \ + --input pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \ + --output pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 \ + --min_library_size 1000 --min_cells_per_gene 300 +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + type: file + description: "An h5 file from the 10x genomics website." + required: true + example: "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + - name: "--output" + type: file + example: pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 + required: true + direction: output + description: Output h5 file. + - name: "--min_library_size" + type: integer + default: 0 + description: Minimum library size. + - name: "--min_cells_per_gene" + type: integer + default: 0 + description: Minimum number of cells per gene. + - name: "--keep_feature_types" + type: string + multiple: true + description: Specify which feature types will never be filtered out + example: "Antibody Capture" + - name: "--verbose" + type: boolean_true + description: Increase verbosity +resources: + - type: r_script + path: script.R +test_resources: + - type: r_script + path: run_test.R + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5 + + +engines: + - type: docker + image: eddelbuettel/r2u:24.04 + setup: + - type: docker + env: + - PIP_BREAK_SYSTEM_PACKAGES=1 + - RETICULATE_PYTHON=/usr/bin/python + - type: apt + packages: + - libhdf5-dev + - python3-pip + - python3-dev + - python-is-python3 + - type: python + user: true + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + - type: r + cran: [ testthat, anndata, hdf5r ] + +runners: + - type: executable + - type: nextflow + directives: + label: [singlecpu, lowmem] diff --git a/src/process_10xh5/filter_10xh5/run_test.R b/src/process_10xh5/filter_10xh5/run_test.R new file mode 100755 index 00000000..558c572d --- /dev/null +++ b/src/process_10xh5/filter_10xh5/run_test.R @@ -0,0 +1,35 @@ +library(testthat) +requireNamespace("hdf5r", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +library(anndata) + +cat("Run command\n") +input <- paste0( + meta$resources_dir, + "/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" +) +output <- "out.h5" + +system(paste0( + "./", meta$name, " ", + "--input '", input, "' ", + "--output ", output, " ", + "--min_library_size 1000 ", + "--min_cells_per_gene 300 ", + "--verbose" +)) + +sc <- reticulate::import("scanpy") +orig_ad <- sc$read_10x_h5(input) + +out_ad <- sc$read_10x_h5(output) + +# check if matrix is not empty +expect_gte(nrow(out_ad), 100) +expect_gte(ncol(out_ad), 100) + +# check if filtering has been performed +expect_lte(nrow(out_ad), nrow(orig_ad)) +expect_lte(ncol(out_ad), ncol(orig_ad)) + +cat(">> All tests passed successfully!\n") diff --git a/src/process_10xh5/filter_10xh5/script.R b/src/process_10xh5/filter_10xh5/script.R new file mode 100644 index 00000000..f97359c5 --- /dev/null +++ b/src/process_10xh5/filter_10xh5/script.R @@ -0,0 +1,95 @@ +## VIASH START +par <- list( + input = paste0( + "resources_test/pbmc_1k_protein_v3/", + "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" + ), + output = "output.h5", + min_library_size = 1000, + min_cells_per_gene = 300, + keep_feature_types = "Antibody Capture", + verbose = TRUE +) +## VIASH END + +if (par$verbose) cat("Loading dependencies\n") +requireNamespace("hdf5r", quietly = TRUE) + +if (par$verbose) cat("Opening h5 file\n") +h5 <- hdf5r::H5File$new(par$input, mode = "r") + +if (par$verbose) cat("Reading data in memory\n") +features__all_tag_keys <- h5[["matrix/features/_all_tag_keys"]][] + +features <- data.frame( + feature_type = h5[["matrix/features/feature_type"]][], + genome = h5[["matrix/features/genome"]][], + id = h5[["matrix/features/id"]][], + name = h5[["matrix/features/name"]][] +) + +mat <- Matrix::sparseMatrix( + i = h5[["matrix/indices"]][], + p = h5[["matrix/indptr"]][], + x = h5[["matrix/data"]][], + dims = h5[["matrix/shape"]][], + index1 = FALSE, + dimnames = list( + features$id, + h5[["matrix/barcodes"]][] + ) +) + +if (par$verbose) { + cat("Filtering out cells with library size < ", + par$min_library_size, "\n", + sep = "" + ) +} +library_size <- Matrix::colSums(mat) +mat2 <- mat[, library_size >= par$min_library_size, drop = FALSE] + +if (par$verbose) { + cat("Filtering genes with num cells < ", + par$min_cells_per_gene, "\n", + sep = "" + ) +} +num_cells <- Matrix::rowSums(mat2 > 0) +mat3 <- mat2[ + num_cells >= par$min_cells_per_gene | + features$feature_type %in% par$keep_feature_types, , + drop = FALSE +] +features2 <- features[match(rownames(mat3), features$id), , drop = FALSE] + +# helper fun +set_with_type <- function(path, value) { + orig_dtype <- h5[[path]]$get_type() + orig_chunk <- h5[[path]]$chunk_dims + if (is.na(orig_chunk)) orig_chunk <- "auto" + h5new$create_dataset(path, value, dtype = orig_dtype, chunk_dims = orig_chunk) +} + +# create new file +if (par$verbose) cat("Saving h5 file at '", par$output, "'\n", sep = "") +h5new <- hdf5r::H5File$new(par$output, mode = "w") +zz <- h5new$create_group("matrix") +zz <- h5new$create_group("matrix/features") + +set_with_type("matrix/features/feature_type", features2$feature_type) +set_with_type("matrix/features/genome", features2$genome) +set_with_type("matrix/features/id", features2$id) +set_with_type("matrix/features/name", features2$name) +set_with_type("matrix/features/_all_tag_keys", features__all_tag_keys) +set_with_type("matrix/indices", mat3@i) +set_with_type("matrix/indptr", mat3@p) +set_with_type("matrix/data", as.integer(mat3@x)) +set_with_type("matrix/shape", dim(mat3)) +set_with_type("matrix/barcodes", colnames(mat3)) + +for (attname in hdf5r::h5attr_names(h5)) { + h5new$create_attr(attname, hdf5r::h5attr(h5, attname)) +} +h5new$close_all() +h5$close_all() diff --git a/src/qc/calculate_atac_qc_metrics/config.vsh.yaml b/src/qc/calculate_atac_qc_metrics/config.vsh.yaml new file mode 100644 index 00000000..81e94530 --- /dev/null +++ b/src/qc/calculate_atac_qc_metrics/config.vsh.yaml @@ -0,0 +1,118 @@ +name: calculate_atac_qc_metrics +namespace: "qc" +scope: "public" +description: | + Add basic ATAC quality control metrics to an .h5mu file. + + The metrics are comparable to what scanpy.pp.calculate_qc_metrics output, + although they have slightly different names: + + Obs metrics (name in this component -> name in scanpy): + - n_features_per_cell -> n_genes_by_counts + - total_fragment_counts -> total_counts + +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--fragments_path" + type: file + description: | + Path to the fragments file. If not provided and not present in the input h5mu file, + the nucleosome signal and TSS enrichment score will not be calculated. + direction: input + required: false + example: fragments.tsv.gz + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "atac" + required: false + - name: "--layer" + description: | + Layer at `.layers` to use for the calculation. If not provided, `.X` is used. + type: string + example: "raw_counts" + required: false + - name: "--n_fragments_for_nucleosome_signal" + type: integer + description: | + Number of fragments to use per cell for nucleosome signal calculation. + Takes very long to calculate, for a test run lower value (e.g. 10e3) is recommended. + See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal + for more information + default: 10e4 + required: false + - name: "--nuc_signal_threshold" + type: double + description: | + Threshold for nucleosome signal. Cells with nucleosome signal above this threshold + will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS". + default: 2 + required: false + - name: "--n_tss" + type: integer + description: | + Number of the transcription start sites to calculate TSS enrichment score. + See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment + for more information + default: 3000 + required: false + - name: "--tss_threshold" + type: double + description: | + Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold + will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS". + default: 1.5 + required: false + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_atac_tiny_bcl/counts/ +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - pkg-config # Otherwise h5py installation fails, which is required for scanpy + - libhdf5-dev + - gcc + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/scanpy.yaml, .] + packages: + - muon~=0.1.5 + - pysam~=0.22.0 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [singlecpu, midmem] diff --git a/src/qc/calculate_atac_qc_metrics/script.py b/src/qc/calculate_atac_qc_metrics/script.py new file mode 100644 index 00000000..c8fa0aea --- /dev/null +++ b/src/qc/calculate_atac_qc_metrics/script.py @@ -0,0 +1,103 @@ +import sys + +import scanpy as sc +import muon as mu +from muon import atac as ac # the module containing function for scATAC data processing +import numpy as np + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "fragments_path": None, + "output": "foo.h5mu", + "modality": "atac", + "layer": None, + "n_fragments_for_nucleosome_signal": 10e4, + "n_tss": 3000, + "nuc_signal_threshold": 2, + "tss_threshold": 1.5, +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + atac = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info("Checking if QC columns are already calculated") + for col in ("n_features_per_cell", "total_fragment_counts"): + if col in atac.obs: + logger.warning(f"{col} is already in atac.obs, dropping") + atac.obs.drop(col, axis=1, inplace=True) + + logger.info("Calculating QC metrics") + sc.pp.calculate_qc_metrics( + atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"] + ) + + logger.debug("Putting QC columns to ATAC adata") + atac.obs.rename( + columns={ + "n_genes_by_counts": "n_features_per_cell", + "total_counts": "total_fragment_counts", + }, + inplace=True, + ) + + logger.debug("Adding log-transformed total fragment counts") + # log-transform total counts and add as column + atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"]) + + if par["fragments_path"] is not None: + logger.info("Trying to locate frafments") + ac.tl.locate_fragments(atac, fragments=par["fragments_path"]) + else: + logger.info("Skipping fragment location: `fragments_path` is not set") + + # Calculate the nucleosome signal across cells + if "files" in atac.uns and "fragments" in atac.uns["files"]: + logger.info("Trying to calculate nucleosome signal") + ac.tl.nucleosome_signal( + atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs + ) + atac.obs["nuc_signal_filter"] = [ + "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS" + for ns in atac.obs["nucleosome_signal"] + ] + else: + logger.info( + "Skipping nucleosome signal calculation: fragments information is not found" + ) + + # If interval information is available, calculate TSS enrichment + if "peak_annotation" in atac.uns.keys(): + tss = ac.tl.tss_enrichment( + atac, + features=atac.uns["peak_annotation"], + n_tss=par["n_tss"], + random_state=666, + ) + + tss.obs["tss_filter"] = [ + "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS" + for score in atac.obs["tss_score"] + ] + else: + logger.info( + "Skipping TSS enrichment calculation: genes intervals are not found" + ) + + logger.info("Writing output") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], atac, par["output_compression"] + ) + + +if __name__ == "__main__": + main() diff --git a/src/qc/calculate_atac_qc_metrics/test.py b/src/qc/calculate_atac_qc_metrics/test.py new file mode 100644 index 00000000..907ae6b8 --- /dev/null +++ b/src/qc/calculate_atac_qc_metrics/test.py @@ -0,0 +1,196 @@ +import sys +from pathlib import Path +import pytest + +import mudata as md +import numpy as np +import scanpy as sc +import muon as mu +import pandas as pd + +## VIASH START +meta = { + "executable": "./target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics", + "resources_dir": "./resources_test/cellranger_atac_tiny_bcl/counts/", + "config": "./src/qc/calculate_atac_qc_metrics/config.vsh.yaml", + "cpus": 2, +} +## VIASH END + + +@pytest.fixture +def synthetic_example(): + atac = sc.AnnData( + np.array([[0, 0, 0], [1, 0, 1], [10, 0, 0], [100, 0, 1], [1000, 0, 0]]) + ) + atac.obs_names = ["A", "B", "C", "D", "E"] + atac.var_names = ["x", "y", "z"] + + return md.MuData({"atac": atac}) + + +@pytest.fixture +def example_mudata(tmp_path, synthetic_example): + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def example_mudata_with_layer(tmp_path, synthetic_example): + synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod[ + "atac" + ].X.copy() + synthetic_example.mod["atac"].X = np.random.normal( + size=synthetic_example.mod["atac"].X.shape + ) + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def neurips_mudata(tmp_path): + """From the `NeurIPS Multimodal Single-Cell Integration Challenge + ` + + Link is taken from the Moscot repository: + https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 + + """ + adata = sc.read( + "../data/neurips_data.h5ad", + backup_url="https://figshare.com/ndownloader/files/37993503", + ) + + mdata = md.MuData({"atac": adata}) + mdata_path = tmp_path / "neurips.h5mu" + mdata.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def tiny_atac_mudata(tmp_path): + resources_dir = Path(meta["resources_dir"]) + mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5") + mu.atac.tl.locate_fragments( + mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz") + ) + assert "files" in mdata.mod["atac"].uns.keys() + assert "fragments" in mdata.mod["atac"].uns["files"].keys() + + # Read features annotation and save it to uns + peak_annotation = pd.read_csv( + resources_dir / "counts" / "peak_annotation.tsv", sep="\t" + ) + peak_annotation.columns = [ + "Chromosome", + "Start", + "End", + "gene", + "distance", + "peak_type", + ] + peak_annotation["gene"] = peak_annotation["gene"].astype(str) # Fixes saving error + mdata.mod["atac"].uns["peak_annotation"] = peak_annotation + + mdata_path = tmp_path / "tiny_atac.h5mu" + mdata.write(mdata_path) + + return mdata_path + + +@pytest.mark.parametrize( + "mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"] +) +def test_qc_columns_in_tables(run_component, request, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", + "--n_fragments_for_nucleosome_signal", + "100", + ] + + run_component(args) + assert output_path.is_file() + data_with_qc = md.read(output_path) + + for qc_metric in ( + "n_features_per_cell", + "total_fragment_counts", + "log_total_fragment_counts", + ): + assert qc_metric in data_with_qc.mod["atac"].obs + for qc_metric in ( + "n_cells_by_counts", + "mean_counts", + "pct_dropout_by_counts", + "total_counts", + ): + assert qc_metric in data_with_qc.mod["atac"].var + + # Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data) + if ( + "files" in data_with_qc.mod["atac"].uns + and "fragments" in data_with_qc.mod["atac"].uns["files"] + ): + assert "nucleosome_signal" in data_with_qc.mod["atac"].obs + + if "peak_annotation" in data_with_qc.mod["atac"].uns.keys(): + assert "tss_score" in data_with_qc.mod["atac"].obs + + +@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"]) +def test_calculations_correctness(request, run_component, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", + "--n_fragments_for_nucleosome_signal", + "100", + ] + + if mudata == "example_mudata_with_layer": + args.extend(["--layer", "atac_counts"]) + + run_component(args) + assert output_path.is_file() + data_with_qc = md.read(output_path) + + assert np.allclose( + data_with_qc.mod["atac"].obs["n_features_per_cell"], [0, 2, 1, 2, 1] + ) + assert np.allclose( + data_with_qc.mod["atac"].obs["total_fragment_counts"], [0, 2, 10, 101, 1000] + ) + assert np.allclose( + data_with_qc.mod["atac"].obs["log_total_fragment_counts"], + [-np.inf, np.log10(2), np.log10(10), np.log10(101), np.log10(1000)], + ) + + assert np.allclose(data_with_qc.mod["atac"].var["n_cells_by_counts"], [4, 0, 2]) + assert np.allclose(data_with_qc.mod["atac"].var["mean_counts"], [222.2, 0, 0.4]) + assert np.allclose( + data_with_qc.mod["atac"].var["pct_dropout_by_counts"], [20, 100, 60] + ) + assert np.allclose(data_with_qc.mod["atac"].var["total_counts"], [1111, 0, 2]) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/qc/calculate_qc_metrics/config.vsh.yaml b/src/qc/calculate_qc_metrics/config.vsh.yaml new file mode 100644 index 00000000..bb260783 --- /dev/null +++ b/src/qc/calculate_qc_metrics/config.vsh.yaml @@ -0,0 +1,153 @@ +name: calculate_qc_metrics +namespace: "qc" +scope: "public" +description: | + Add basic quality control metrics to an .h5mu file. + + The metrics are comparable to what scanpy.pp.calculate_qc_metrics output, + although they have slightly different names: + + Var metrics (name in this component -> name in scanpy): + - pct_dropout -> pct_dropout_by_{expr_type} + - num_nonzero_obs -> n_cells_by_{expr_type} + - obs_mean -> mean_{expr_type} + - total_counts -> total_{expr_type} + + Obs metrics: + - num_nonzero_vars -> n_genes_by_{expr_type} + - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var} + - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var} + - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type} + - total_counts -> total_{expr_type} + +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--layer" + description: | + Layer from modality to use as input data. If not provided the .X attribute is used. + type: string + example: "raw_counts" + required: false + - name: Metrics added to .obs + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. + type: string + multiple: True + example: "ercc,highly_variable,mitochondrial" + - name: "--var_qc_metrics_fill_na_value" + type: boolean + description: | + Fill any 'NA' values found in the columns specified with --var_qc_metrics to 'True' or 'False'. + as False. + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20;50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + required: false + - name: "--output_obs_num_nonzero_vars" + description: | + Name of column in .obs describing, for each observation, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each row the number of columns that contain data. + type: string + required: false + default: "num_nonzero_vars" + - name: "--output_obs_total_counts_vars" + description: | + Name of the column for .obs describing, for each observation (row), + the sum of the stored values in the columns. + type: string + required: false + default: total_counts + - name: Metrics added to .var + arguments: + - name: "--output_var_num_nonzero_obs" + description: | + Name of column describing, for each feature, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each column the number of rows that contain data. + type: string + required: false + default: "num_nonzero_obs" + - name: "--output_var_total_counts_obs" + description: | + Name of the column in .var describing, for each feature (column), + the sum of the stored values in the rows. + type: string + required: false + default: total_counts + - name: "--output_var_obs_mean" + type: string + description: | + Name of the column in .obs providing the mean of the values in each row. + default: "obs_mean" + required: false + - name: "--output_var_pct_dropout" + type: string + default: "pct_dropout" + description: | + Name of the column in .obs providing for each feature the percentage of + observations the feature does not appear on (i.e. is missing). Same as `--num_nonzero_obs` + but percentage based. + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + packages: + - scipy + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + test_setup: + - type: python + packages: + - scanpy +runners: + - type: executable + - type: nextflow + directives: + label: [singlecpu, midmem] diff --git a/src/qc/calculate_qc_metrics/script.py b/src/qc/calculate_qc_metrics/script.py new file mode 100644 index 00000000..7512b5c4 --- /dev/null +++ b/src/qc/calculate_qc_metrics/script.py @@ -0,0 +1,195 @@ +import sys +from mudata import read_h5ad +from scipy.sparse import issparse, csr_array +import numpy as np + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "foo.h5mu", + "modality": "rna", + "layer": None, + "top_n_vars": [10, 20, 50], + "var_qc_metrics": None, + "output_var_obs_mean": "obs_mean", + "output_var_total_counts_obs": "total_counts", + "output_var_num_nonzero_obs": "num_nonzero_obs", + "output_var_pct_dropout": "pct_dropout", + "output_obs_total_counts_vars": "total_counts", + "output_obs_num_nonzero_vars": "num_nonzero_vars", +} +meta = {"resources_dir": "."} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def count_nonzero(layer, axis): + """ + This method is the functional equivalent of the old .getnnz function from scirpy, + but that function was deprecated. So we use the nonzero function to mimic the old + behavior. + """ + axis ^= 1 + nonzero_counts = dict(zip(*np.unique(layer.nonzero()[axis], return_counts=True))) + nonzero_per_axis_item = { + row_index: nonzero_counts.get(row_index, 0) + for row_index in range(layer.shape[axis]) + } + return np.array(list(nonzero_per_axis_item.values()), dtype="int64") + + +def main(): + modality_data = read_h5ad(par["input"], mod=par["modality"]) + var = modality_data.var + layer = modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] + if not issparse(layer): + raise NotImplementedError("Expected layer to be in sparse format.") + layer = csr_array(layer) + layer.eliminate_zeros() + + var_columns_to_add = {} + + # var statistics + if par["output_var_obs_mean"]: + obs_mean = layer.mean(axis=0) + var_columns_to_add[par["output_var_obs_mean"]] = obs_mean + if par["output_var_total_counts_obs"]: + # from the np.sum documentation: + # Especially when summing a large number of lower precision floating point numbers, + # such as float32, numerical errors can become significant. In such cases it can + # be advisable to use dtype="float64" to use a higher precision for the output. + layer_with_type = layer + if np.issubdtype(layer.dtype, np.floating) and np.can_cast( + layer.dtype, np.float64, casting="safe" + ): + # 'safe' casting makes sure not to cast np.float128 or anything else to a lower precision dtype + layer_with_type = layer.astype(np.float64) + total_counts_obs = np.ravel(layer_with_type.sum(axis=0)) + var_columns_to_add[par["output_var_total_counts_obs"]] = total_counts_obs + + num_nonzero_obs = count_nonzero(layer, axis=0) + if par["output_var_num_nonzero_obs"]: + var_columns_to_add[par["output_var_num_nonzero_obs"]] = num_nonzero_obs + if par["output_var_pct_dropout"]: + var_columns_to_add[par["output_var_pct_dropout"]] = ( + 1 - num_nonzero_obs / layer.shape[0] + ) * 100 + + modality_data.var = modality_data.var.assign(**var_columns_to_add) + + # obs statistics + obs_columns_to_add = {} + total_counts_var = np.ravel(layer.sum(axis=1)) + + if par["output_obs_num_nonzero_vars"]: + num_nonzero_vars = count_nonzero(layer, axis=1) + obs_columns_to_add[par["output_obs_num_nonzero_vars"]] = num_nonzero_vars + + if par["output_obs_total_counts_vars"]: + obs_columns_to_add[par["output_obs_total_counts_vars"]] = total_counts_var + + top_metrics = {} + if par["top_n_vars"]: + par["top_n_vars"] = sorted(par["top_n_vars"]) + distributions = get_top_from_csr_matrix(layer, par["top_n_vars"]) + top_metrics = { + distribution_size: distribution * 100 + for distribution_size, distribution in zip( + par["top_n_vars"], distributions.T + ) + } + obs_columns_to_add |= { + f"pct_of_counts_in_top_{n_top}_vars": col + for n_top, col in top_metrics.items() + } + + if par["var_qc_metrics"]: + print(f"qc_metrics: {par['var_qc_metrics']}") + for qc_metric in par["var_qc_metrics"]: + if qc_metric not in var: + raise ValueError( + f"Value for --var_qc_metrics, '{qc_metric}' " + f"not found in .var for modality {par['modality']}" + ) + qc_column = var[qc_metric] + if qc_column.isna().any(): + if par["var_qc_metrics_fill_na_value"] is None: + raise ValueError( + f"The .var column '{qc_metric}', selected by '--var_qc_metrics', contains NA values. " + "It is ambiguous whether or not to include these values in the static calulation. " + "You can explicitly map the NA values to 'False' or 'True using '--var_qc_metrics_fill_na_value'" + ) + else: + qc_column = qc_column.fillna( + par["var_qc_metrics_fill_na_value"], inplace=False + ) + qc_column = qc_column.to_list() + if set(np.unique(qc_column)) - {True, False}: + raise ValueError( + f"Column {qc_metric} in .var for modality {par['modality']} " + f"must only contain boolean values" + ) + total_counts_qc_metric = np.ravel(layer[:, qc_column].sum(axis=1)) + obs_columns_to_add |= { + f"total_counts_{qc_metric}": total_counts_qc_metric, + f"pct_{qc_metric}": total_counts_qc_metric / total_counts_var * 100, + } + + modality_data.obs = modality_data.obs.assign(**obs_columns_to_add) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], + ) + + +def get_top_from_csr_matrix(array, top_n_genes): + # csr matrices stores a 3D matrix in a format such that data for individual cells + # are stored in 1 array. Another array (indptr) here stores the ranges of indices + # to select from the data-array (.e.g. data[indptr[0]:indptr[1]] for row 0) for each row. + # Another array 'indices' maps each element of data to a column + # (data and indices arrays have the same length) + top_n_genes = np.array(top_n_genes).astype(np.int64) + assert np.all(top_n_genes[:-1] <= top_n_genes[1:]), "top_n_genes must be sorted" + row_indices, data = array.indptr, array.data + number_of_rows, max_genes_to_parse = row_indices.size - 1, top_n_genes[-1] + top_data = np.zeros((number_of_rows, max_genes_to_parse), dtype=data.dtype) + # Loop over each row to create a dense matrix without the 0 counts, + # but not for the whole matrix, only store the genes up until + # the largest number of top n genes. + for row_number in range(number_of_rows): + row_start_index, row_end_index = ( + row_indices[row_number], + row_indices[row_number + 1], + ) + row_data = data[row_start_index:row_end_index] # all non-zero counts for an row + try: + # There are less genes with counts in the row than the + # maximum number of genes we would like to select + # all these genes are in the top genes, just store them + top_data[row_number, : row_end_index - row_start_index] = row_data + except ValueError: + # Store the counts for the top genes + top_data[row_number, :] = np.partition(row_data, -max_genes_to_parse)[ + -max_genes_to_parse: + ] + + # Partition works from smallest to largest, but we want largest + # so do smallest to largest first (but with reversed indices) + top_data = np.partition(top_data, max_genes_to_parse - top_n_genes) + # And then switch the order around + top_data = np.flip(top_data, axis=1) + + cumulative = top_data.cumsum(axis=1, dtype=np.float64)[:, top_n_genes - 1] + return cumulative / np.expand_dims(array.sum(axis=1), 1) + + +if __name__ == "__main__": + main() diff --git a/src/qc/calculate_qc_metrics/test.py b/src/qc/calculate_qc_metrics/test.py new file mode 100644 index 00000000..56c883ec --- /dev/null +++ b/src/qc/calculate_qc_metrics/test.py @@ -0,0 +1,386 @@ +import sys +import pytest +from pathlib import Path +import mudata as md +import anndata as ad +import numpy as np +import scanpy as sc +import pandas as pd +import scipy +import uuid +from pandas.testing import assert_series_equal +from itertools import product +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "./target/executable/qc/calculate_qc_metrics/calculate_qc_metrics", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "./src/qc/calculate_qc_metrics/config.vsh.yaml", + "cpus": 2, +} +## VIASH END + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_mudata_random(): + rng = np.random.default_rng(seed=1) + random_counts = scipy.sparse.random( + 50000, 100, density=0.8, format="csr", dtype=np.uint32, random_state=rng + ) + good_dtype = random_counts.astype(np.float32) + del random_counts + mod1 = ad.AnnData( + X=good_dtype, + obs=pd.DataFrame(index=pd.RangeIndex(50000)), + var=pd.DataFrame(index=pd.RangeIndex(100)), + ) + return md.MuData({"mod1": mod1}) + + +@pytest.fixture +def input_mudata(input_path): + mudata = md.read_h5mu(input_path) + # create a less sparse matrix to increase the variability in qc statistics + rng = np.random.default_rng() + random_counts = scipy.sparse.random( + *mudata["rna"].X.shape, density=0.8, format="csr", random_state=rng + ) + mudata["rna"].X = random_counts + return mudata + + +@pytest.fixture +def input_mudata_path(tmp_path, input_mudata): + output_path = tmp_path / f"{str(uuid.uuid4())}.h5mu" + input_mudata.write(output_path) + return output_path + + +@pytest.fixture( + params=product([True, False, np.nan, "random"], ["bool", pd.BooleanDtype()]) +) +def mudata_with_boolean_column(tmp_path, input_mudata, request): + requested_value, requested_type = request.param + input_var = input_mudata.mod["rna"].var + input_var["custom"] = requested_value + if requested_value == "random": + input_var["custom"] = np.random.choice( + [True, False], len(input_var), p=[0.20, 0.80] + ) + input_var["custom"] = input_var["custom"].astype(requested_type) + new_input_path = tmp_path / "input_with_custom_col.h5mu" + input_mudata.write(new_input_path) + return new_input_path + + +def test_add_qc(run_component, input_path, random_h5mu_path): + output_path = random_h5mu_path() + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--output_compression", + "gzip", + ] + ) + + assert output_path.is_file() + data_with_qc = md.read(output_path) + expected_obs_columns = [ + f"pct_of_counts_in_top_{top_n_var}_vars" for top_n_var in ("10", "20", "90") + ] + ["total_counts", "num_nonzero_vars"] + expected_var_columns = [ + "pct_dropout", + "num_nonzero_obs", + "obs_mean", + "total_counts", + ] + data_with_qc.mod["rna"].var = data_with_qc.mod["rna"].var.drop( + columns=expected_var_columns, errors="raise" + ) + data_with_qc.var = data_with_qc.var.drop( + columns=list(map("rna:{}".format, expected_var_columns)), errors="errors" + ) + data_with_qc.mod["rna"].obs = data_with_qc.mod["rna"].obs.drop( + columns=expected_obs_columns, errors="raise" + ) + data_with_qc.obs = data_with_qc.obs.drop( + columns=list(map("rna:{}".format, expected_obs_columns)), errors="errors" + ) + assert_annotation_objects_equal(input_path, data_with_qc) + + +@pytest.mark.parametrize( + "optional_parameter,annotation_matrix,arg_value", + [ + ("--output_obs_num_nonzero_vars", "obs", "lorem"), + ("--output_obs_total_counts_vars", "obs", "ipsum"), + ("--output_var_num_nonzero_obs", "var", "dolor"), + ("--output_var_total_counts_obs", "var", "amet"), + ("--output_var_obs_mean", "var", "sit"), + ("--output_var_pct_dropout", "var", "elit"), + ], +) +def test_qc_metrics_set_output_column( + run_component, + input_mudata_path, + optional_parameter, + annotation_matrix, + arg_value, + random_h5mu_path, +): + output_h5mu = random_h5mu_path() + args = [ + "--input", + input_mudata_path, + "--output", + output_h5mu, + "--modality", + "rna", + "--output_compression", + "gzip", + optional_parameter, + arg_value, + ] + + run_component(args) + default_obs_columns = { + "output_obs_num_nonzero_vars": "num_nonzero_vars", + "output_obs_total_counts_vars": "total_counts", + } + default_var_columns = { + "output_var_num_nonzero_obs": "num_nonzero_obs", + "output_var_total_counts_obs": "total_counts", + "output_var_obs_mean": "obs_mean", + "output_var_pct_dropout": "pct_dropout", + } + defaults = {"var": default_var_columns, "obs": default_obs_columns} + defaults[annotation_matrix].update({optional_parameter.strip("-"): arg_value}) + assert output_h5mu.is_file() + data_with_qc = md.read(output_h5mu) + for attribute_name in ("var", "obs"): + setattr( + data_with_qc.mod["rna"], + attribute_name, + getattr(data_with_qc.mod["rna"], attribute_name).drop( + columns=list(defaults[attribute_name].values()), errors="raise" + ), + ) + setattr( + data_with_qc, + attribute_name, + getattr(data_with_qc, attribute_name).drop( + columns=list( + map("rna:{}".format, list(defaults[attribute_name].values())) + ), + errors="raise", + ), + ) + assert_annotation_objects_equal(input_mudata_path, data_with_qc) + + +@pytest.mark.parametrize( + "optional_parameter,annotation_matrix,expected_missing,", + [ + ("--var_qc_metrics", "obs", "total_counts_.*|pct_*"), + ("--top_n_vars", "obs", "pct_of_counts_in_top_.*"), + ("--output_obs_num_nonzero_vars", "obs", "num_nonzero_vars"), + ("--output_obs_total_counts_vars", "obs", "total_counts"), + ("--output_var_num_nonzero_obs", "var", "num_nonzero_obs"), + ("--output_var_total_counts_obs", "var", "total_counts"), + ("--output_var_obs_mean", "var", "obs_mean"), + ("--output_var_pct_dropout", "var", "pct_dropout"), + ], +) +def test_qc_metrics_optional( + run_component, + input_mudata_path, + optional_parameter, + annotation_matrix, + expected_missing, +): + args = [ + "--input", + input_mudata_path, + "--output", + "foo.h5mu", + "--modality", + "rna", + "--output_compression", + "gzip", + ] + if optional_parameter not in ["--var_qc_metrics", "--top_n_vars"]: + args.extend([optional_parameter, ""]) + + run_component(args) + assert Path("foo.h5mu").is_file() + data_with_qc = md.read("foo.h5mu") + matrix = getattr(data_with_qc.mod["rna"], annotation_matrix) + assert matrix.filter(regex=expected_missing, axis=1).empty + + +def test_calculcate_qc_var_qc_metrics( + run_component, mudata_with_boolean_column, random_h5mu_path +): + output_path = random_h5mu_path() + input_data = md.read_h5mu(mudata_with_boolean_column) + args = [ + "--input", + str(mudata_with_boolean_column), + "--output", + str(output_path), + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--var_qc_metrics", + "custom", + ] + if input_data.mod["rna"].var["custom"].isna().any(): + args.extend(["--var_qc_metrics_fill_na_value", "True"]) + + run_component(args) + assert output_path.is_file() + data_with_qc = md.read(output_path)["rna"] + for qc_metric in ("pct_custom", "total_counts_custom"): + assert qc_metric in data_with_qc.obs + # Do a percentage calculation based on indexes + # and compare it to the calculations from the component + custom_column_true = data_with_qc.var["custom"].fillna(True) + gene_counts = sc.get.obs_df(data_with_qc, keys=data_with_qc.var_names.to_list()) + gene_counts_custom = gene_counts.loc[:, custom_column_true] + sum_custom_column = gene_counts_custom.sum(axis=1) + sum_all = gene_counts.sum(axis=1) + percentage = (sum_custom_column / sum_all) * 100 + pd.testing.assert_series_equal( + percentage, + data_with_qc.obs["pct_custom"], + check_exact=False, # Comparing floats + check_names=False, + ) + assert (data_with_qc.obs["pct_custom"] <= 100).all() + + +def test_compare_scanpy( + run_component, mudata_with_boolean_column, input_mudata, random_h5mu_path +): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + str(mudata_with_boolean_column), + "--output", + str(output_path), + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--var_qc_metrics", + "custom", + "--var_qc_metrics_fill_na_value", + "False", + ] + ) + assert output_path.is_file() + + component_data = md.read(output_path) + rna_mod = component_data.mod["rna"] + + # Replicate --var_qc_metrics_fill_na_value False + # Scanpy also does not work with pd.BooleanDtype() + # So cast from 'boolean' to 'bool' + input_mudata.mod["rna"].var["custom"] = ( + input_mudata.mod["rna"].var["custom"].fillna(False).astype("bool") + ) + sc.pp.calculate_qc_metrics( + input_mudata.mod["rna"], + expr_type="counts", + var_type="genes", + qc_vars=["custom"], + percent_top=[10, 20, 90], + use_raw=False, + inplace=True, + log1p=False, + ) + scanpy_var = input_mudata.mod["rna"].var + component_var = rna_mod.var + + vars_to_compare = { + "pct_dropout": "pct_dropout_by_counts", + "num_nonzero_obs": "n_cells_by_counts", + "obs_mean": "mean_counts", + "total_counts": "total_counts", + } + for from_var, to_var in vars_to_compare.items(): + assert_series_equal( + component_var[from_var], + scanpy_var[to_var], + check_names=False, + check_dtype=False, + ) + + scanpy_obs = input_mudata.mod["rna"].obs + component_obs = rna_mod.obs + obs_to_compare = { + "num_nonzero_vars": "n_genes_by_counts", + "pct_custom": "pct_counts_custom", + "total_counts_custom": "total_counts_custom", + "total_counts": "total_counts", + } + obs_to_compare |= { + f"pct_of_counts_in_top_{i}_vars": f"pct_counts_in_top_{i}_genes" + for i in (10, 20, 90) + } + for from_obs, to_obs in obs_to_compare.items(): + assert_series_equal( + component_obs[from_obs], + scanpy_obs[to_obs], + check_names=False, + check_dtype=False, + ) + + +def test_total_counts_less_precision_dtype( + run_component, input_mudata_random, random_h5mu_path +): + input_path = random_h5mu_path() + input_mudata_random.write(input_path) + output_path = random_h5mu_path() + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "mod1", + ] + ) + output_data = md.read_h5mu(output_path) + matrix_good_type = input_mudata_random.mod["mod1"].X + var_names = input_mudata_random.var_names + obs_names = input_mudata_random.obs_names + del input_mudata_random + input_df = pd.DataFrame( + matrix_good_type.todense(), columns=var_names, index=obs_names + ) + total_sums_manual = input_df.to_numpy().sum(axis=0, dtype=np.float128) + total_counts = output_data.mod["mod1"].var["total_counts"] + np.testing.assert_allclose(total_sums_manual, total_counts.to_numpy()) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/qc/fastqc/config.vsh.yaml b/src/qc/fastqc/config.vsh.yaml new file mode 100644 index 00000000..1bce6130 --- /dev/null +++ b/src/qc/fastqc/config.vsh.yaml @@ -0,0 +1,52 @@ +name: "fastqc" +namespace: "qc" +scope: "public" +description: | + Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. This component can take one or more files (by means of shell globbing) or a complete directory. +arguments: + - name: "--mode" + alternatives: [ "-m" ] + type: string + choices: [ files, dir ] + default: files + description: The mode in which the component works. Can be either files or dir. + - name: "--input" + alternatives: [ "-i" ] + type: file + required: true + description: Directory containing input fastq files. + example: fastq_dir/ + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + required: true + description: Output directory to write reports to. + example: qc/ + - name: "--threads" + alternatives: ["-t"] + type: integer + required: false + description: | + Specifies the number of files which can be processed simultaneously. Each thread will be allocated 250MB of + memory. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: [ fastqc ] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowcpu, midmem] diff --git a/src/qc/fastqc/script.sh b/src/qc/fastqc/script.sh new file mode 100644 index 00000000..70b915cd --- /dev/null +++ b/src/qc/fastqc/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eo pipefail + +mkdir -p "$par_output" + +if [ "$par_mode" == "dir" ]; then + par_input="$par_input/*.fastq.gz" +fi + +eval fastqc ${par_threads:+--threads $par_threads} -o "$par_output" "$par_input" diff --git a/src/qc/fastqc/test.sh b/src/qc/fastqc/test.sh new file mode 100644 index 00000000..59aec79f --- /dev/null +++ b/src/qc/fastqc/test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + + + +echo ">>> Testing files mode" + +sample="tinygex_S1_L001_I1_001" + +# don't specify the mode 'files' as this is the default +./fastqc --input "cellranger_tiny_fastq/$sample.fastq.gz" --output filemode-report --threads 2 + +echo ">> Checking whether output dir exists" +[[ ! -d filemode-report ]] && echo "Output dir could not be found!" && exit 1 + +echo ">> Checking if the correct files are present" +[[ ! -f filemode-report/"$sample"_fastqc.html ]] && echo "Report file missing" && exit 1 +[[ ! -f filemode-report/"$sample"_fastqc.zip ]] && echo "Zip file missing" && exit 1 + +echo ">>> Testing dir mode" + +./fastqc -m dir --input "cellranger_tiny_fastq/" --output dirmode-report + +echo ">> Checking whether output dir exists" +[[ ! -d dirmode-report ]] && echo "Output dir could not be found!" && exit 1 + +echo ">> Checking if sufficient files are present" +# each fastq files generates one html and one zip file +nr_fastqs=`ls cellranger_tiny_fastq/*.fastq.gz | wc -l` +nr_htmlfiles=`ls dirmode-report/*.html | wc -l` +nr_zipfiles=`ls dirmode-report/*.zip | wc -l` + +[[ ! $nr_fastqs == $nr_htmlfiles ]] && echo "Html files are missing" && exit 1 +[[ ! $nr_fastqs == $nr_htmlfiles ]] && echo "Zip files are missing" && exit 1 + +# print final message +echo ">>> Test finished successfully" + +# do not remove this +# as otherwise your test might exit with a different exit code +exit 0 diff --git a/src/qc/multiqc/config.vsh.yaml b/src/qc/multiqc/config.vsh.yaml new file mode 100644 index 00000000..2e0fc5e8 --- /dev/null +++ b/src/qc/multiqc/config.vsh.yaml @@ -0,0 +1,47 @@ +name: "multiqc" +namespace: "qc" +scope: "public" +description: | + MultiQC aggregates results from bioinformatics analyses across many samples into a single report. + It searches a given directory for analysis logs and compiles a HTML report. It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. +arguments: + - name: "--input" + alternatives: [ "-i" ] + type: file + required: true + multiple: true + description: Inputs for MultiQC. + example: input.txt + - name: "--output" + alternatives: [ "-o" ] + type: file + direction: output + required: true + description: Create report in the specified output directory. + example: report +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/10x_5k_anticmv/fastqc/ + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: [ multiqc ] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] diff --git a/src/qc/multiqc/script.py b/src/qc/multiqc/script.py new file mode 100644 index 00000000..ec33db16 --- /dev/null +++ b/src/qc/multiqc/script.py @@ -0,0 +1,14 @@ +import subprocess + +## VIASH START +par = { + "input": [ + "resources_test/10x_5k_anticmv/fastqc/", + "resources_test/10x_5k_anticmv/fastqc/", + ], + "output": "output", +} +## VIASH END + +# Run MultiQC +subprocess.run(["multiqc", "-o", par["output"]] + par["input"]) diff --git a/src/qc/multiqc/test.py b/src/qc/multiqc/test.py new file mode 100644 index 00000000..b0579666 --- /dev/null +++ b/src/qc/multiqc/test.py @@ -0,0 +1,38 @@ +import sys +import pytest +import shutil + +## VIASH START +## VIASH END + +input_fastqc = meta["resources_dir"] + "/fastqc/" + + +def test_multiqc(run_component, tmp_path): + output_path = tmp_path / "output" + + run_component(["--input", input_fastqc, "--output", str(output_path)]) + + assert output_path.exists() + assert (output_path / "multiqc_report.html").is_file() + + +def test_multiple_inputs(run_component, tmp_path): + output_path = tmp_path / "output" + dir1 = tmp_path / "dir1" + dir2 = tmp_path / "dir2" + + # copy input to tmp_path + shutil.copytree(input_fastqc, dir1) + shutil.copytree(input_fastqc, dir2) + + run_component( + ["--input", str(dir1), "--input", str(dir2), "--output", str(output_path)] + ) + + assert output_path.exists() + assert (output_path / "multiqc_report.html").is_file() + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/query/cellxgene_census/config.vsh.yaml b/src/query/cellxgene_census/config.vsh.yaml new file mode 100644 index 00000000..0790e71d --- /dev/null +++ b/src/query/cellxgene_census/config.vsh.yaml @@ -0,0 +1,140 @@ +name: cellxgene_census +namespace: query +scope: "public" +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. +authors: + - __merge__: /src/authors/matthias_beyens.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] + - __merge__: /src/authors/kai_waldrant.yaml + roles: [ contributor ] +argument_groups: + - name: Input database + description: "Open CellxGene Census by version or URI." + arguments: + - name: "--input_uri" + type: string + description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." + required: false + example: "s3://bucket/path" + - name: "--census_version" + description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + type: string + example: "stable" + required: false + - name: "--add_dataset_metadata" + type: boolean_true + description: "If true, the experiment metadata will be added to the cell metadata. More specifically: `collection_id`, `collection_name`, `collection_doi`, `dataset_title`." + - name: Cell query + description: Arguments related to the query. + arguments: + - name: "--species" + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. + required: true + example: "homo_sapiens" + - name: "--obs_value_filter" + type: string + description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." + required: true + example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" + - name: Filter cells by grouping + description: "Filter groups with fewer than X number of cells." + arguments: + - name: "--cell_filter_grouping" + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] + multiple: true + - name: "--cell_filter_minimum_count" + type: integer + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--cell_filter_min_genes" + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: "--cell_filter_min_counts" + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: "--gene_filter_min_cells" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + info: + label: "CellxGene dataset" + summary: A dataset queried from the CellxGene Census platform + description: | + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). + slots: + __merge__: file_output_slots.yaml + - name: "--output_modality" + description: "Which modality to store the output in." + type: string + default: "rna" + required: false + - name: "--output_layer_counts" + description: "Which layer to store the raw counts in. If not provided, the .X layer will be used." + type: string + required: false + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: python:3.11 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - cellxgene-census + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] \ No newline at end of file diff --git a/src/query/cellxgene_census/file_output_slots.yaml b/src/query/cellxgene_census/file_output_slots.yaml new file mode 100644 index 00000000..6dfbeeab --- /dev/null +++ b/src/query/cellxgene_census/file_output_slots.yaml @@ -0,0 +1,159 @@ +mod: + - name: rna + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: true + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: true + + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: true + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: true + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. + required: true + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: true + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: true + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: true + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: true + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: true + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: true + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: true + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: true + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: true + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: true + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: true + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: true + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: true + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: true + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + required: true + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: true diff --git a/src/query/cellxgene_census/script.py b/src/query/cellxgene_census/script.py new file mode 100644 index 00000000..2aefe388 --- /dev/null +++ b/src/query/cellxgene_census/script.py @@ -0,0 +1,191 @@ +import sys +import cellxgene_census +import scanpy as sc +import mudata as mu + +## VIASH START +par = { + "input_uri": None, + "census_version": "stable", + "species": "mus_musculus", + "obs_value_filter": "dataset_id == '49e4ffcc-5444-406d-bdee-577127404ba8'", + "cell_filter_grouping": None, + "cell_filter_minimum_count": None, + "output": "output.h5ad", + "output_modality": "rna", + "output_compression": "gzip", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +from setup_logger import setup_logger + +logger = setup_logger() + + +def connect_census(uri, census_version): + """ + Connect to CellxGene Census or user-provided TileDBSoma object + """ + ver = census_version or "stable" + logger.info( + "Connecting to CellxGene Census at %s", + f"'{uri}'" if uri else f"version '{ver}'", + ) + return cellxgene_census.open_soma(uri=uri, census_version=ver) + + +def get_anndata(census_connection, par): + logger.info( + "Getting gene expression data based on `%s` query.", par["obs_value_filter"] + ) + return cellxgene_census.get_anndata( + census=census_connection, + obs_value_filter=par["obs_value_filter"], + organism=par["species"], + ) + + +def add_cellcensus_metadata_obs(census_connection, adata): + logger.info("Adding additional metadata to gene expression data.") + census_datasets = ( + census_connection["census_info"]["datasets"].read().concat().to_pandas() + ) + + adata.obs.dataset_id = adata.obs.dataset_id.astype("category") + + dataset_info = ( + census_datasets[ + census_datasets.dataset_id.isin(adata.obs.dataset_id.cat.categories) + ][ + [ + "collection_id", + "collection_name", + "collection_doi", + "dataset_id", + "dataset_title", + ] + ] + .reset_index(drop=True) + .astype("category") + ) + + adata.obs = adata.obs.merge(dataset_info, on="dataset_id", how="left") + + +def filter_min_cells_per_group(adata, par): + n_cells_before, _ = adata.shape + cell_count = adata.obs.groupby(par["cell_filter_grouping"])[ + "soma_joinid" + ].transform("count") + adata = adata[cell_count >= par["cell_filter_minimum_count"]] + n_cells_after, _ = adata.shape + logger.info( + "Removed %s cells based on %s cell_filter_minimum_count of %s cell_filter_grouping." + % ( + (n_cells_before - n_cells_after), + par["cell_filter_minimum_count"], + par["cell_filter_grouping"], + ) + ) + return adata + + +def filter_by_counts(adata, par): + logger.info("Remove cells with few counts and genes with few counts.") + n_cells_before, n_genes_before = adata.shape + # remove cells with few counts and genes with few counts + scanpy_proc = { + par["cell_filter_min_counts"]: (sc.pp.filter_cells, "min_counts"), + par["cell_filter_min_genes"]: (sc.pp.filter_cells, "min_genes"), + par["gene_filter_min_counts"]: (sc.pp.filter_genes, "min_counts"), + par["gene_filter_min_cells"]: (sc.pp.filter_genes, "min_cells"), + } + for threshold, (func, arg) in scanpy_proc.items(): + if threshold: + func(adata, **{arg: threshold}) + n_cells_after, n_genes_after = adata.shape + logger.info( + "Removed %s cells and %s genes.", + (n_cells_before - n_cells_after), + (n_genes_before - n_genes_after), + ) + + +def move_x_to_layers(adata, layer_name): + logger.info(f"Move .X to .layers['{layer_name}']") + adata.layers[layer_name] = adata.X + adata.X = None + + +def print_unique(adata, column): + unique_values = adata.obs[column].unique().astype(str) + formatted = "', '".join(unique_values[:50]) + if len(unique_values) > 50: + formatted += ", ..." + logger.info(f"Unique {column}: ['{formatted}']") + + +def print_summary(adata): + logger.info(f"Resulting dataset: {adata}") + + logger.info("Summary of dataset:") + for field in adata.obs.columns: + print_unique(adata, field) + + +def write_anndata(adata, par): + logger.info("Writing MuData object to '%s'", par["output"]) + + mdata = mu.MuData({par["output_modality"]: adata}) + + mdata.write_h5mu(par["output"], compression=par["output_compression"]) + + +def main(par, meta): + # check arguments + if (par["cell_filter_grouping"] is None) != ( + par["cell_filter_minimum_count"] is None + ): + raise NotImplementedError( + "You need to specify either both or none of the following parameters: cell_filter_grouping, cell_filter_minimum_count" + ) + + with connect_census( + uri=par["input_uri"], census_version=par["census_version"] + ) as conn: + adata = get_anndata(conn, par) + + if par["add_dataset_metadata"]: + add_cellcensus_metadata_obs(conn, adata) + + print(f"AnnData: {adata}", flush=True) + + if par["cell_filter_grouping"] is not None: + adata = filter_min_cells_per_group(adata, par) + + # remove cells with few counts and genes with few counts + filter_by_counts(adata, par) + + # logger.log(f"Filtered AnnData: {adata}") + print(f"Filtered AnnData: {adata}", flush=True) + + # use feature_id as var_names + adata.var_names = adata.var["feature_id"] + + # move .X to .layers["counts"] + if par["output_layer_counts"]: + move_x_to_layers(adata, par["output_layer_counts"]) + + # print summary + print_summary(adata) + + # write output to file + write_anndata(adata, par) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/query/cellxgene_census/test.py b/src/query/cellxgene_census/test.py new file mode 100644 index 00000000..2c4718fd --- /dev/null +++ b/src/query/cellxgene_census/test.py @@ -0,0 +1,168 @@ +import sys +import os +import pytest +import mudata as md +import numpy as np + +## VIASH START +meta = { + "resources_dir": "./resources_test/", + "executable": "./target/executable/query/cellxgene_census", + "config": "/home/di/code/openpipeline/src/query/cellxgene_census/config.vsh.yaml", +} +## VIASH END + + +def test_cellxgene_extract_metadata_expression(run_component, tmp_path): + output_file = tmp_path / "output.h5mu" + + run_component( + [ + "--obs_value_filter", + "is_primary_data == True " + "and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] " + "and suspension_type == 'cell'", + "--species", + "homo_sapiens", + "--add_dataset_metadata", + "--output", + output_file, + ] + ) + + # check whether file exists + assert os.path.exists(output_file), "Output file does not exist" + + mdata = md.read(output_file) + + assert "rna" in mdata.mod, "Output should contain 'rna' modality." + assert mdata.mod["rna"].n_obs > 0, "Expected at least one cell." + assert mdata.mod["rna"].n_vars > 0, "Expected at least one gene." + + ## check obs + obs = mdata.mod["rna"].obs + + expected_obs = [ + "dataset_id", + "assay", + "assay_ontology_term_id", + "cell_type", + "cell_type_ontology_term_id", + "development_stage", + "development_stage_ontology_term_id", + "disease", + "disease_ontology_term_id", + "donor_id", + "is_primary_data", + # "organism", "organism_ontology_term_id", # ← missing?? + "self_reported_ethnicity", + "self_reported_ethnicity_ontology_term_id", + "sex", + "sex_ontology_term_id", + "suspension_type", + "tissue", + "tissue_ontology_term_id", + "tissue_general", + "tissue_general_ontology_term_id", + "soma_joinid", + "collection_id", + "collection_name", + "collection_doi", + "dataset_title", + ] + for exp_obs in expected_obs: + assert exp_obs in obs.columns, f"Expected column '{exp_obs}' not found in .obs" + + assert np.all(np.isin(obs["is_primary_data"], [True])) + + ## check var + var = mdata.mod["rna"].var + expected_var = ["feature_id", "feature_name", "soma_joinid"] + for exp_var in expected_var: + assert exp_var in var.columns, f"Expected column '{exp_var}' not found in .var" + + ## check layers + assert mdata.mod["rna"].X is not None, "Expected count matrix in .X" + + +def test_cellxgene_extract_metadata_expression_with_layer(run_component, tmp_path): + output_file = tmp_path / "output.h5mu" + + run_component( + [ + "--obs_value_filter", + "is_primary_data == True " + "and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] " + "and suspension_type == 'cell'", + "--species", + "homo_sapiens", + "--add_dataset_metadata", + "--output", + output_file, + "--output_layer_counts", + "counts", + ] + ) + + # check whether file exists + assert os.path.exists(output_file), "Output file does not exist" + + mdata = md.read(output_file) + + assert "rna" in mdata.mod, "Output should contain 'rna' modality." + assert mdata.mod["rna"].n_obs > 0, "Expected at least one cell." + assert mdata.mod["rna"].n_vars > 0, "Expected at least one gene." + + ## check obs + obs = mdata.mod["rna"].obs + + expected_obs = [ + "dataset_id", + "assay", + "assay_ontology_term_id", + "cell_type", + "cell_type_ontology_term_id", + "development_stage", + "development_stage_ontology_term_id", + "disease", + "disease_ontology_term_id", + "donor_id", + "is_primary_data", + # "organism", "organism_ontology_term_id", # ← missing?? + "self_reported_ethnicity", + "self_reported_ethnicity_ontology_term_id", + "sex", + "sex_ontology_term_id", + "suspension_type", + "tissue", + "tissue_ontology_term_id", + "tissue_general", + "tissue_general_ontology_term_id", + "soma_joinid", + "collection_id", + "collection_name", + "collection_doi", + "dataset_title", + ] + for exp_obs in expected_obs: + assert exp_obs in obs.columns, f"Expected column '{exp_obs}' not found in .obs" + + assert np.all(np.isin(obs["is_primary_data"], [True])) + + ## check var + var = mdata.mod["rna"].var + expected_var = ["feature_id", "feature_name", "soma_joinid"] + for exp_var in expected_var: + assert exp_var in var.columns, f"Expected column '{exp_var}' not found in .var" + + ## check layers + layers = mdata.mod["rna"].layers + expected_layers = ["counts"] + for exp_layer in expected_layers: + assert exp_layer in layers.keys(), ( + f"Expected layer '{exp_layer}' not found in .layers" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/reference/build_bdrhap_reference/config.vsh.yaml b/src/reference/build_bdrhap_reference/config.vsh.yaml new file mode 100644 index 00000000..30b83823 --- /dev/null +++ b/src/reference/build_bdrhap_reference/config.vsh.yaml @@ -0,0 +1,134 @@ +name: build_bdrhap_reference +namespace: reference +scope: "public" +description: | + The Reference Files Generator creates an archive containing Genome Index + and Transcriptome annotation files needed for the BD Rhapsody Sequencing + Analysis Pipeline. The app takes as input one or more FASTA and GTF files + and produces a compressed archive in the form of a tar.gz file. The + archive contains: + + - STAR index + - Filtered GTF file +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --genome_fasta + required: true + description: Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + example: genome_sequence.fa.gz + multiple: true + info: + config_key: Genome_fasta + - type: file + name: --gtf + required: true + description: | + File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or + 'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons + must have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or + 'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively. + example: transcriptome_annotation.gtf.gz + multiple: true + info: + config_key: Gtf + - type: file + name: --extra_sequences + description: | + File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes). + GTF lines for these sequences will be automatically generated and combined with the main GTF. + required: false + multiple: true + info: + config_key: Extra_sequences + - name: Outputs + arguments: + - type: file + name: --reference_archive + direction: output + required: true + description: | + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an + input in the BD Rhapsody Sequencing Analysis Pipeline. + example: reference.tar.gz + - name: Arguments + arguments: + - type: string + name: --mitochondrial_contigs + description: | + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are + identified as 'nuclear fragments' in the ATACseq analysis pipeline. + required: false + multiple: true + default: [chrM, chrMT, M, MT] + info: + config_key: Mitochondrial_contigs + - type: boolean_true + name: --filtering_off + description: | + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features + having the following attribute values are kept: + + - protein_coding + - lncRNA + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + info: + config_key: Filtering_off + - type: boolean_true + name: --wta_only_index + description: Build a WTA only index, otherwise builds a WTA + ATAC index. + info: + config_key: Wta_Only + - type: string + name: --extra_star_params + description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + required: false + info: + config_key: Extra_STAR_params + +resources: + - type: python_script + path: script.py + - path: make_rhap_reference_2.2.1_nodocker.cwl +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/reference_gencodev41_chr1/reference.fa.gz + - path: /resources_test/reference_gencodev41_chr1/reference.gtf.gz + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps, seqkit] + - type: python + packages: [cwlref-runner, cwl-runner] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/src/reference/build_bdrhap_reference/script.py b/src/reference/build_bdrhap_reference/script.py new file mode 100644 index 00000000..609a11f1 --- /dev/null +++ b/src/reference/build_bdrhap_reference/script.py @@ -0,0 +1,177 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +par = { + "genome_fasta": ["resources_test/reference_gencodev41_chr1/reference.fa.gz"], + "gtf": ["resources_test/reference_gencodev41_chr1/reference.gtf.gz"], + "extra_sequences": None, + "mitochondrial_contigs": None, + "filtering_off": False, + "wta_only_index": False, + "extra_star_params": "--genomeSAindexNbases 4", + "reference_archive": "output.tar.gz", +} +meta = { + "config": "target/nextflow/reference/build_bdrhap_reference/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None, +} +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".gz"), ( + "Output reference_archive must end with .tar.gz." + ) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [ + strip_margin("""\ + |#!/usr/bin/env cwl-runner + | + |""") + ] + + config_key_value_pairs = [] + for argument in config["arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\ + | class: File + | location: "{par_value}" + |""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\ + |{config_key}: {par_value} + |""") + ) + + ## Write config to file + return "".join(content_list) + + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file = os.path.join( + meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl" + ) + + return os.path.abspath(cwl_file) + + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"] + ) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file, + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + shutil.move( + os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"] + ) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/reference/build_bdrhap_reference/test.sh b/src/reference/build_bdrhap_reference/test.sh new file mode 100644 index 00000000..a63ada85 --- /dev/null +++ b/src/reference/build_bdrhap_reference/test.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +seqkit subseq -r 1:50000 "$meta_resources_dir/reference.fa.gz" | gzip > "$tmpdir/reference_small.fa.gz" +zcat "$meta_resources_dir/reference.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz" + + +echo "> Running $meta_name, writing to $tmpdir." +$meta_executable \ + --genome_fasta "$tmpdir/reference_small.fa.gz" \ + --gtf "$tmpdir/reference_small.gtf.gz" \ + --reference_archive "$tmpdir/myreference.tar.gz" \ + ---cpus 2 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1 + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/reference/build_cellranger_arc_reference/config.vsh.yaml b/src/reference/build_cellranger_arc_reference/config.vsh.yaml new file mode 100644 index 00000000..f119ac84 --- /dev/null +++ b/src/reference/build_cellranger_arc_reference/config.vsh.yaml @@ -0,0 +1,84 @@ +name: build_cellranger_arc_reference +namespace: reference +scope: "public" +description: Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome. +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ author ] +arguments: + # inputs + - type: file + name: --genome_fasta + required: true + description: Reference genome fasta. + example: genome_sequence.fa.gz + - type: file + name: --annotation_gtf + required: true + description: Reference annotation. + example: annotation.gtf.gz + - type: file + name: --motifs_file + required: false + description: Transcription factor motifs in JASPAR format. See https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references + example: JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified + - type: string + name: --non_nuclear_contigs + multiple: true + required: false + description: Name(s) of contig(s) that do not have any chromatin structure, for example, mitochondria or plastids. These contigs are excluded from peak calling since the entire contig will be "open" due to a lack of chromatin structure. Leave empty if there are no such contigs. + example: chrM + default: chrM + - type: file + name: --output + direction: output + required: true + description: Output folder + example: cellranger_reference + - type: string + name: --genome + required: true + default: "output" + description: Name of the genome. This will be the name of the intermediate output folder + example: GRCh38 + - type: string + name: --organism + required: false + description: Name of the organism. This is displayed in the web summary but is otherwise not used in the analysis. + - type: string + name: --subset_regex + description: Will subset the reference chromosomes using the given regex. + example: (ERCC-00002|chr1) +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/reference_gencodev41_chr1 +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger_arc:2.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + test_setup: + - type: docker + - type: apt + packages: [ git, wget ] + - type: docker + run: | + TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ + TARGETOS="${TARGETOS:-linux}" && \ + PATH="${PATH}:/usr/local/go/bin" && \ + wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \ + cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] \ No newline at end of file diff --git a/src/reference/build_cellranger_arc_reference/script.sh b/src/reference/build_cellranger_arc_reference/script.sh new file mode 100644 index 00000000..db812819 --- /dev/null +++ b/src/reference/build_cellranger_arc_reference/script.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz" +par_annotation_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" +par_output="gencode_v41_annotation_cellranger.tar.gz" +par_motifs_file="resources_test/reference_gencodev41_chr1/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" +par_genome="GRCh38" +par_organism="Homo_sapiens" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# just to make sure +echo "> Getting path of fasta file" +par_genome_fasta=`realpath $par_genome_fasta` +echo "> Getting path of annotation file" +par_annotation_gtf=`realpath $par_annotation_gtf` +echo "> Getting path of output file" +par_output=`realpath $par_output` +echo "> Getting path of motifs file" +par_motifs_file=`realpath $par_motifs_file` + +# process params +extra_params=( ) + +if [ ! -z "$meta_cpus" ]; then + extra_params+=( "nthreads: \"$meta_cpus"\" ) +fi +if [ ! -z "$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=`python -c "print(int('$meta_memory_gb') - 2)"` + extra_params+=( "memgb: \"$memory_gb"\" ) +fi + +echo "> Unzipping input files" +unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa" + +echo "> Building star index" +cd "$tmpdir" + +echo "> Building config" +config_in="${tmpdir}/config" + +# If non_nuclear_contigs is not set or bash thinks it is a flag, set it to an empty string +if [[ -z $par_non_nuclear_contigs || $par_non_nuclear_contigs == "--non_nuclear_contigs" ]]; then + non_nuclear_contigs="" +else + printf -v non_nuclear_contigs '"%s",' "${par_non_nuclear_contigs[@]}" + non_nuclear_contigs="[${non_nuclear_contigs%,}]" # remove trailing comma +fi + +echo """{ + ${par_organism:+organism: \"$par_organism\"} + genome: [\"${par_genome}\"] + input_fasta: [\""${tmpdir}/genome.fa"\"] + input_gtf: [\""${par_annotation_gtf}\""] + ${non_nuclear_contigs:+non_nuclear_contigs: "${non_nuclear_contigs}"} + input_motifs: \""$par_motifs_file"\" + $(printf "%s\n" "${extra_params[@]}") +}""" > "$config_in" + +echo "> Config content:" +cat ${config_in} + +echo "> Running cellranger" +cellranger-arc mkref --config=${config_in} + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "$par_output" -C "${tmpdir}/${par_genome}" . diff --git a/src/reference/build_cellranger_arc_reference/test.sh b/src/reference/build_cellranger_arc_reference/test.sh new file mode 100644 index 00000000..10ea117d --- /dev/null +++ b/src/reference/build_cellranger_arc_reference/test.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit subseq -r 1:50000 "$input" | gzip > "$output" + fi +} + +seqkit_head "$meta_resources_dir/reference_gencodev41_chr1/reference.fa.gz" "$tmpdir/reference_small.fa.gz" +zcat "$meta_resources_dir/reference_gencodev41_chr1/reference.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz" + +echo "> Running $meta_name, writing to $tmpdir." +$meta_executable \ + --genome_fasta "$tmpdir/reference_small.fa.gz" \ + --annotation_gtf "$tmpdir/reference_small.gtf.gz" \ + --motifs_file "$meta_resources_dir/reference_gencodev41_chr1/JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" \ + --output "$tmpdir/myreference.tar.gz" \ + --non_nuclear_contigs "" \ + --organism "Homo_sapiens" \ + --genome "GRCh38" \ + ---cpus ${meta_memory_gb:-1} \ + ---memory ${meta_memory_gb:-5}GB + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1 + +echo ">> Checking whether output tar file contains the expected files" +if tar -tzf "$tmpdir/myreference.tar.gz" | grep -q 'regions/tss.bed'; then + : +else + echo "regions/tss.bed not found in tar file with reference!" +fi + +if tar -tzf "$tmpdir/myreference.tar.gz" | grep -q 'regions/transcripts.bed'; then + : +else + echo "regions/transcripts.bed not found in tar file with reference!" +fi + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/reference/build_cellranger_reference/config.vsh.yaml b/src/reference/build_cellranger_reference/config.vsh.yaml new file mode 100644 index 00000000..7d13a07a --- /dev/null +++ b/src/reference/build_cellranger_reference/config.vsh.yaml @@ -0,0 +1,65 @@ +name: build_cellranger_reference +namespace: reference +scope: "public" +description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome. +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + # inputs + - type: file + name: --genome_fasta + required: true + description: Reference genome fasta. + example: genome_sequence.fa.gz + - type: file + name: --transcriptome_gtf + required: true + description: Reference transcriptome annotation. + example: transcriptome_annotation.gtf.gz + - type: string + name: "--reference_version" + required: false + description: "Optional reference version string to include with reference" + - type: file + name: --output + direction: output + required: true + description: Output folder + example: cellranger_reference +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/reference_gencodev41_chr1 + +engines: +- type: docker + image: ghcr.io/data-intuitive/cellranger:9.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + test_setup: + - type: apt + packages: [ git, wget ] + - type: docker + run: | + TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ + TARGETOS="${TARGETOS:-linux}" && \ + PATH="${PATH}:/usr/local/go/bin" && \ + wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \ + cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/reference/build_cellranger_reference/script.sh b/src/reference/build_cellranger_reference/script.sh new file mode 100644 index 00000000..ef4cfb49 --- /dev/null +++ b/src/reference/build_cellranger_reference/script.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz" +par_transcriptome_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" +par_output="gencode_v41_annotation_cellranger.tar.gz" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_genome_fasta=`realpath $par_genome_fasta` +par_transcriptome_gtf=`realpath $par_transcriptome_gtf` +par_output=`realpath $par_output` + + +echo "> Unzipping input files" +unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa" + +echo "> Building star index" +cd "$tmpdir" +cellranger mkref \ + --fasta "$tmpdir/genome.fa" \ + --genes "$par_transcriptome_gtf" \ + --genome output \ + ${par_reference_version:+--ref-version $par_reference_version} \ + ${meta_cpus:+--nthreads $meta_cpus} \ + ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itseld + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" . \ No newline at end of file diff --git a/src/reference/build_cellranger_reference/test.sh b/src/reference/build_cellranger_reference/test.sh new file mode 100644 index 00000000..5ea063cc --- /dev/null +++ b/src/reference/build_cellranger_reference/test.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing `basename $input`" + seqkit subseq -r 1:50000 "$input" | gzip > "$output" + fi +} + +seqkit_head "$meta_resources_dir/reference_gencodev41_chr1/reference.fa.gz" "$tmpdir/reference_small.fa.gz" +zcat "$meta_resources_dir/reference_gencodev41_chr1/reference.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz" + +echo "> Running $meta_name, writing to $tmpdir." +$meta_executable \ + --genome_fasta "$tmpdir/reference_small.fa.gz" \ + --transcriptome_gtf "$tmpdir/reference_small.gtf.gz" \ + --output "$tmpdir/myreference.tar.gz" \ + ---cpus ${meta_memory_gb:-1} \ + ---memory ${meta_memory_gb:-5}GB + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1 + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/reference/build_star_reference/config.vsh.yml b/src/reference/build_star_reference/config.vsh.yml new file mode 100644 index 00000000..bf258475 --- /dev/null +++ b/src/reference/build_star_reference/config.vsh.yml @@ -0,0 +1,82 @@ +name: build_star_reference +namespace: reference +scope: "public" +description: Create a reference for STAR from a set of fasta files. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: Input/Output + arguments: + - type: file + name: --genome_fasta + alternatives: --genomeFastaFiles + required: true + description: The fasta files to be included in the reference. Corresponds to the --genomeFastaFiles argument in the STAR command. + example: [ chr1.fasta, chr2.fasta ] + multiple: true + - name: --transcriptome_gtf + alternatives: --sjdbGTFfile + type: file + direction: input + description: | + Specifies the path to the file with annotated transcripts in the standard GTF + format. STAR will extract splice junctions from this file and use them to greatly improve + accuracy of the mapping. Corresponds to the --sjdbGTFfile argument in the STAR command. + required: false + - name: --output + alternatives: --genomeDir + type: file + description: Path to output directory. Corresponds to the --genomeDir argument in the STAR command. + example: /path/to/foo + direction: output + required: true + - name: "Genome indexing arguments" + arguments: + - type: integer + name: --genomeSAindexNbases + description: | + Length (bases) of the SA pre-indexing string. Typically between 10 and 15. + Longer strings will use much more memory, but allow faster searches. For small + genomes, the parameter {genomeSAindexNbases must be scaled down to + min(14, log2(GenomeLength)/2 - 1). + required: false + default: 14 +resources: + - type: python_script + path: script.py +test_resources: + - type: bash_script + path: test.sh + - path: /resources_test/cellranger_tiny_fastq +engines: + - type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.10b + - PACKAGES gcc g++ make wget zlib1g-dev unzip + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/reference/build_star_reference/script.py b/src/reference/build_star_reference/script.py new file mode 100644 index 00000000..f26dde57 --- /dev/null +++ b/src/reference/build_star_reference/script.py @@ -0,0 +1,153 @@ +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +par = { + "genome_fasta": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/fasta/genome.fa", + "transcriptome_gtf": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "star_reference_test", + "genomeSAindexNbases": 7, +} +meta = {"cpus": 8, "temp_dir": "/tmp"} +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the `processPar()` generator needs to be adapted +to_rename = { + "genome_fasta": "genomeFastaFiles", + "output": "genomeDir", + "transcriptome_gtf": "sjdbGTFfile", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the `to_rename` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["genomeDir"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeFastaFiles", "sjdbGTFfile"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "genomeGenerate" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) diff --git a/src/reference/build_star_reference/test.sh b/src/reference/build_star_reference/test.sh new file mode 100644 index 00000000..38849bec --- /dev/null +++ b/src/reference/build_star_reference/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_resources_dir="./resources_test" +meta_executable="./target/docker/mapping/star_build_reference/star_build_reference" +## VIASH END + +"$meta_executable" \ + --genome_fasta "$meta_resources_dir/cellranger_tiny_fastq/cellranger_tiny_ref/fasta/genome.fa" \ + --transcriptome_gtf "$meta_resources_dir/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" \ + --output "./star_reference_test" \ + --genomeSAindexNbases 7 \ + ---cpus 8 + +if [ ! -f "./star_reference_test/Genome" ]; then + echo "Genome file could not be found in the output directory"; + exit 1 +fi \ No newline at end of file diff --git a/src/reference/cellranger_mkgtf/config.vsh.yaml b/src/reference/cellranger_mkgtf/config.vsh.yaml new file mode 100644 index 00000000..f07441ea --- /dev/null +++ b/src/reference/cellranger_mkgtf/config.vsh.yaml @@ -0,0 +1,47 @@ +name: cellranger_mkgtf +namespace: reference +scope: "public" +description: Make a GTF file - filter by a specific attribute. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] +arguments: + # inputs + - type: file + name: --input_gtf + required: true + description: Reference GTF annotation. + example: transcriptome_annotation.gtf.gz + - type: file + name: --output_gtf + direction: output + required: true + description: Output GTF file. + example: output.gtf.gz + - type: string + name: --attribute + required: true + description: Key-value pair in attributes field to be kept in the GTF file of the format attribute:attribute_value. + example: [gene_type:transcribed_unprocessed_pseudogene,gene_type:miRNA] + multiple: true +resources: + - type: bash_script + path: script.sh +test_resources: + - type: python_script + path: test.py + - path: /resources_test/reference_gencodev41_chr1 +engines: + - type: docker + image: ghcr.io/data-intuitive/cellranger:9.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y pigz procps && rm -rf /var/lib/apt/lists/* + __merge__: [ /src/base/requirements/python_test_setup.yaml, . ] +runners: + - type: executable + - type: nextflow + directives: + label: [ midmem, lowcpu ] \ No newline at end of file diff --git a/src/reference/cellranger_mkgtf/script.sh b/src/reference/cellranger_mkgtf/script.sh new file mode 100644 index 00000000..90c75d15 --- /dev/null +++ b/src/reference/cellranger_mkgtf/script.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz" +par_output_gtf="gencode_v41_filtered.gtf.gz" +par_attribute="gene_type:transcribed_unprocessed_pseudogene" +## VIASH END + +# create temporary directory +echo $VIASH_TEMP +mkdir -p "$VIASH_TEMP" +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_input_gtf=`realpath $par_input_gtf` +par_output_gtf=`realpath $par_output_gtf` + +echo "> Unzipping input files" +unpigz -c "$par_input_gtf" > "$tmpdir/input_gtf.gtf" + +echo "${par_attribute}" + +echo "> Building gtf" +cd "$tmpdir" +# Start the cellranger mkgtf command +IFS=';' read -r -a attributes <<< "$par_attribute" +cmd="cellranger mkgtf \"$tmpdir/input_gtf.gtf\" \"$tmpdir/output.gtf\"" +# Append each key-value pair as a separate --attribute argument +for attribute in "${attributes[@]}"; do + cmd+=" --attribute=$attribute" +done +# Execute the command +eval $cmd + +echo "> Creating archive" +pigz -k "$tmpdir/output.gtf" +mv "$tmpdir/output.gtf.gz" "$par_output_gtf" \ No newline at end of file diff --git a/src/reference/cellranger_mkgtf/test.py b/src/reference/cellranger_mkgtf/test.py new file mode 100644 index 00000000..6cf2d1d4 --- /dev/null +++ b/src/reference/cellranger_mkgtf/test.py @@ -0,0 +1,88 @@ +import sys +import pytest +import gzip +import os +import re + +## VIASH START +meta = { + "name": "cellrnger_mkgtf", + "resources_dir": "resources_test/", + "executable": "target/docker/reference/cellranger_mkgtf/cellranger_mkgtf", + "config": "src/reference/cellranger_mkgtf/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def subset_input_gtf(random_path): + subset_input_path = random_path(extension="gtf.gz") + with gzip.open( + f"{meta['resources_dir']}/reference_gencodev41_chr1/reference.gtf.gz", "rt" + ) as f_in: + with gzip.open(subset_input_path, "wt") as f_out: + for line in f_in: + fields = line.split("\t") + if len(fields) >= 4 and int(fields[3]) < 50001: + f_out.write(line) + return subset_input_path + + +@pytest.mark.parametrize( + "attributes", [["miRNA"], ["transcribed_unprocessed_pseudogene", "miRNA"]] +) +def test_gene_type_column(run_component, subset_input_gtf, random_path, attributes): + output_gtf = random_path(extension="gtf.gz") + args = ["--input_gtf", subset_input_gtf, "--output_gtf", output_gtf, "--attribute"] + args.append(";".join([f"gene_type:{attribute}" for attribute in attributes])) + + print(args, flush=True) + run_component(args) + + assert os.path.isfile(output_gtf), "Output GTF could not be found." + + with gzip.open(output_gtf, "rt") as f: + unique_gene_types = { + match + for line in f + for match in re.findall(r'gene_type "([^"]*)"', line.split("\t")[8]) + } + assert set(attributes) == unique_gene_types, ( + "Output GTF does not contain exactly the expected gene types." + ) + + +def test_different_columns(run_component, subset_input_gtf, random_path): + output_gtf = random_path(extension="gtf.gz") + args = [ + "--input_gtf", + subset_input_gtf, + "--output_gtf", + output_gtf, + "--attribute", + "gene_type:transcribed_unprocessed_pseudogene;transcript_id:ENST00000456328.2", + ] + + run_component(args) + assert os.path.isfile(output_gtf), "Output GTF could not be found." + + with gzip.open(output_gtf, "rt") as f: + wrong_attributes_count = sum( + 1 + for line in f + if dict(re.findall(r'(\S+) "([^"]*)"', line.split("\t")[8])).get( + "gene_type" + ) + != "transcribed_unprocessed_pseudogene" + and dict(re.findall(r'(\S+) "([^"]*)"', line.split("\t")[8])).get( + "transcript_id" + ) + != "ENST00000456328.2" + ) + assert wrong_attributes_count == 0, ( + "Output GTF contains unexpected attribute values." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/reference/make_reference/config.vsh.yaml b/src/reference/make_reference/config.vsh.yaml new file mode 100644 index 00000000..62fb7cc1 --- /dev/null +++ b/src/reference/make_reference/config.vsh.yaml @@ -0,0 +1,64 @@ +name: make_reference +namespace: reference +scope: "public" +description: | + Preprocess and build a transcriptome reference. + + Example input files are: + - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz + - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz + - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +arguments: + # inputs + - type: file + name: --genome_fasta + required: true + description: "Reference genome fasta. Example: " + example: genome_fasta.fa.gz + - type: file + name: --transcriptome_gtf + required: true + description: Reference transcriptome annotation. + example: transcriptome.gtf.gz + - type: file + name: --ercc + description: ERCC sequence and annotation file. + example: ercc.zip + - type: string + name: --subset_regex + description: Will subset the reference chromosomes using the given regex. + example: (ERCC-00002|chr1) + - type: file + name: --output_fasta + direction: output + required: true + description: Output genome sequence fasta. + example: genome_sequence.fa.gz + - type: file + name: --output_gtf + direction: output + required: true + description: Output transcriptome annotation gtf. + example: transcriptome_annotation.gtf.gz +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: [ pigz, seqkit, curl, wget, unzip, file] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] \ No newline at end of file diff --git a/src/reference/make_reference/script.sh b/src/reference/make_reference/script.sh new file mode 100644 index 00000000..d0ba4bdf --- /dev/null +++ b/src/reference/make_reference/script.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_genome_fasta="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" +par_transcriptome_gtf="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" +par_ercc="https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" +# par_subset_regex='(ERCC-00002|chr1)' +par_output_fasta="temp/reference.fa.gz" +par_output_gtf="temp/reference.gtf.gz" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +echo "> Getting path of fasta file" +par_genome_fasta=$(realpath $par_genome_fasta) +echo "> Getting path of annotation file" +par_transcriptome_gtf=$(realpath $par_transcriptome_gtf) + +echo "> Processing genome sequence" +genome_fasta="$tmpdir/genome_sequence.fa" +# if genome is gzipped, extract. otherwise not +if file --mime-type "$par_genome_fasta" | grep -q gzip$; then + zcat "$par_genome_fasta" > "$genome_fasta" +else + cp "$par_genome_fasta" "$genome_fasta" +fi + +echo "> Processing transcriptome annotation" +transcriptome_gtf="$tmpdir/transcriptome_annotation.gtf" +# if transcriptome is gzipped, extract. otherwise not +if file --mime-type "$par_transcriptome_gtf" | grep -q gzip$; then + zcat "$par_transcriptome_gtf" > "$transcriptome_gtf" +else + cp "$par_transcriptome_gtf" "$transcriptome_gtf" +fi + +if [[ ! -z $par_ercc ]]; then + echo "> Processing ERCC sequences" + # wget "$par_ercc" -O "$tmpdir/ercc.zip" + # unzip "$tmpdir/ercc.zip" -d "$tmpdir" + unzip "$par_ercc" -d "$tmpdir" + cat "$tmpdir/ERCC92.fa" >> "$genome_fasta" + cat "$tmpdir/ERCC92.gtf" >> "$transcriptome_gtf" +fi + +# create output & filter reference if so desired +if [[ ! -z $par_subset_regex ]]; then + echo "> Subsetting reference with regex '$par_subset_regex'" + awk '{print $1}' "$genome_fasta" | seqkit grep -r -p "^$par_subset_regex\$" > "$tmpdir/genome_sequence_filtered.fa" + genome_fasta="$tmpdir/genome_sequence_filtered.fa" + grep -E "^$par_subset_regex[^A-Za-z0-9]" "$transcriptome_gtf" > "$tmpdir/transcriptome_annotation_filtered.gtf" + transcriptome_gtf="$tmpdir/transcriptome_annotation_filtered.gtf" + + echo + echo "Matched tags:" + cat "$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq + echo +fi + +echo "> Gzipping outputs" +pigz -c "$genome_fasta" > "$par_output_fasta" +pigz -c "$transcriptome_gtf" > "$par_output_gtf" + +# to do: re enable +# echo "> Sanity check of outputs" +# readarray -t fasta_tags < <( cat "$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq ) +# readarray -t transcriptome_tags < <( cat "$transcriptome_gtf" | cut -d$'\t' -f1 | sort | uniq | grep '^[^#]' ) +# [ "${fasta_tags[*]}" == "${transcriptome_tags[*]}" ] || { echo "Warning: fasta tags differ from transcriptome tags"; exit 1; } \ No newline at end of file diff --git a/src/reference/make_reference/test.sh b/src/reference/make_reference/test.sh new file mode 100644 index 00000000..f3e1c4d6 --- /dev/null +++ b/src/reference/make_reference/test.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# Fetch test data +echo ">> Fetching test data" + +wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz +wget https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.chr.gtf.gz +wget https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip + +# Test 1 +echo ">> Test1" +mkdir test1 +pushd test1 +fasta="myreference.fa.gz" +gtf="myreference.gtf.gz" + +"$meta_executable" \ + --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \ + --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \ + --ercc "../ERCC92.zip" \ + --subset_regex "(ERCC-00002|1)" \ + --output_fasta $fasta \ + --output_gtf $gtf + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f $fasta ]] && echo "Output fasta file could not be found!" && exit 1 +[[ ! -f $gtf ]] && echo "Output gtf file could not be found!" && exit 1 + +echo ">> Checking contents of fasta" +if ! zgrep -q '>1' $fasta; then + echo "Could not find chromosome '1' in output reference!" + exit 1 +fi +if ! zgrep -q '>ERCC-00002' $fasta; then + echo "Could not find ERCC-00002 in output reference!" + exit 1 +fi +popd + +# Test 2 +echo ">> Test 2" +mkdir test2 +pushd test2 +fasta="myreference.fa.gz" +gtf="myreference.gtf.gz" + +"$meta_executable" \ + --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \ + --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \ + --output_fasta $fasta \ + --output_gtf $gtf + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f $fasta ]] && echo "Output fasta file could not be found!" && exit 1 +[[ ! -f $gtf ]] && echo "Output gtf file could not be found!" && exit 1 + +echo ">> Checking contents of fasta" +if ! zgrep -q '>1' $fasta; then + echo "Could not find chromosome '1' in output reference!" + exit 1 +fi +if zgrep -q '>ERCC-00002' $fasta; then + echo "Should not find ERCC-00002 in output reference!" + exit 1 +fi +popd + +echo "> Test succeeded!" diff --git a/src/report/mermaid/config.vsh.yaml b/src/report/mermaid/config.vsh.yaml new file mode 100644 index 00000000..dbf43391 --- /dev/null +++ b/src/report/mermaid/config.vsh.yaml @@ -0,0 +1,65 @@ +name: mermaid +namespace: "report" +scope: "public" +description: | + Generates a network from mermaid code. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] + +arguments: + - name: "--input" + alternatives: [-i] + type: file + description: Input directory + direction: input + required: true + - name: "--output" + alternatives: ["-o"] + type: file + description: Generated network as output. + direction: output + required: true + - name: "--output_format" + type: string + description: | + Output format for the generated image. By default will be inferred from the extension + of the file specified with --output. + choices: ["svg", "png", "pdf"] + - name: "--width" + type: integer + description: "Width of the page" + default: 800 + - name: "--height" + type: integer + description: "Height of the page" + default: 600 + - name: "--background_color" + type: string + description: "Background color for pngs/svgs (not pdfs)" + default: "white" + example: ["#F0F0F0"] + +resources: + - type: bash_script + path: script.sh + - path: ./puppeteer-config.json + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: "node:20-bullseye" + setup: + - type: javascript + npm: + - "@mermaid-js/mermaid-cli" + - type: apt + packages: + - chromium + +runners: + - type: executable + - type: nextflow diff --git a/src/report/mermaid/puppeteer-config.json b/src/report/mermaid/puppeteer-config.json new file mode 100644 index 00000000..7b2851c2 --- /dev/null +++ b/src/report/mermaid/puppeteer-config.json @@ -0,0 +1,6 @@ +{ + "executablePath": "/usr/bin/chromium", + "args": [ + "--no-sandbox" + ] +} \ No newline at end of file diff --git a/src/report/mermaid/script.sh b/src/report/mermaid/script.sh new file mode 100644 index 00000000..ddc386d0 --- /dev/null +++ b/src/report/mermaid/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +mmdc -p "$meta_resources_dir/puppeteer-config.json" \ + -i "$par_input" \ + -o "$par_output" \ + --width "$par_width" \ + --height "$par_height" \ + ${par_background_color:+--backgroundColor $par_background_color} \ + ${output_format:+--outputFormat $par_output_format} + diff --git a/src/report/mermaid/test.sh b/src/report/mermaid/test.sh new file mode 100644 index 00000000..30fa80d2 --- /dev/null +++ b/src/report/mermaid/test.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +par_input="network.input" +par_output="output.png" + +echo "Creating input file" +cat > "$par_input" << HERE +graph LR; + A--> B & C & D; + B--> A & E; + C--> A & E; + D--> A & E; + E--> B & C & D; +HERE + +echo "Running command" +"$meta_executable" --input "$par_input" --output "$par_output" + +if [[ ! -f "$par_output" ]]; then + echo "Output does not exist" + exit +fi + +echo "Test succeeded!" \ No newline at end of file diff --git a/src/scgpt/binning/config.vsh.yaml b/src/scgpt/binning/config.vsh.yaml new file mode 100644 index 00000000..2d3fcdb0 --- /dev/null +++ b/src/scgpt/binning/config.vsh.yaml @@ -0,0 +1,93 @@ +name: binning +namespace: "scgpt" +scope: "public" +description: | + Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + direction: input + required: true + example: input.h5mu + description: | + Input h5mu file. + - name: "--modality" + description: + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: False + description: | + Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used. + - name: "--var_input" + type: string + default: "id_in_vocab" + description: | + The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. + - name: "--n_input_bins" + type: integer + default: 51 + required: False + min: 1 + description: | + The number of bins to discretize the data into. When no value is provided, data won't be binned. + + - name: Outputs + arguments: + - name: "--output" + direction: output + type: file + example: output.h5mu + required: true + description: | + The output h5mu file containing the binned data. + - name: "--output_obsm_binned_counts" + type: string + default: "binned_counts" + description: | + The name of the adata layer to write the binned data to. + - name: "--seed" + type: integer + description: | + Seed for random number generation. + __merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/subset_vars.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu + +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml ] +runners: + - type: executable + - type: nextflow + directives: + label: [ midcpu, midmem ] diff --git a/src/scgpt/binning/script.py b/src/scgpt/binning/script.py new file mode 100644 index 00000000..cac9312b --- /dev/null +++ b/src/scgpt/binning/script.py @@ -0,0 +1,129 @@ +import sys +import mudata as mu +import numpy as np +from scipy.sparse import csr_matrix +import warnings + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_genes_cross_checked.h5mu", + "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", + "modality": "rna", + "input_layer": None, + "output_obsm_binned_counts": "binned_counts", + "n_input_bins": 51, + "output_compression": None, + "var_input": "id_in_vocab", + "seed": 0, +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +if par["seed"]: + np.random.seed(par["seed"]) + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +logger.info("Subsetting data based on highly variable gene and/or cross-checked genes") +adata = subset_vars(adata, par["var_input"]) + +logger.info("Converting the input layer into a CSR matrix") +if not par["input_layer"] or par["input_layer"] == "X": + layer_data = adata.X +else: + layer_data = adata.layers[par["input_layer"]] +layer_data = csr_matrix(layer_data) + +if layer_data.min() < 0: + raise ValueError( + f"Assuming non-negative data, but got min value {layer_data.min()}." + ) + +n_bins = par["n_input_bins"] # NOTE: the first bin is always a spectial for zero +logger.info(f"Binning data into {par['n_input_bins']} bins.") + + +def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: + assert x.ndim == 1 and bins.ndim == 1 + + left_digits = np.digitize(x, bins) + right_difits = np.digitize(x, bins, right=True) + + rands = np.random.rand(len(x)) # uniform random numbers + + digits = rands * (right_difits - left_digits) + left_digits + digits = np.ceil(digits) + smallest_dtype = np.min_scalar_type( + digits.max().astype(np.uint) + ) # Already checked for non-negative values + digits = digits.astype(smallest_dtype) + + return digits + + +with warnings.catch_warnings(): + # Make sure warnings are displayed once. + warnings.simplefilter("once") + # layer_data.indptr.size is the number of rows in the sparse matrix + binned_rows = [] + bin_edges = [] + logger.info( + "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" + ) + for row_number in range(layer_data.indptr.size - 1): + row_start_index, row_end_index = ( + layer_data.indptr[row_number], + layer_data.indptr[row_number + 1], + ) + # These are all non-zero counts in the row + non_zero_row = layer_data.data[row_start_index:row_end_index] + if len(non_zero_row) == 0: + logger.warning( + "The input data contains all zero rows. Please make sure " + "this is expected. You can use the `filter_cell_by_counts` " + "arg to filter out all zero rows." + ) + + # Add binned_rows and bin_edges as all 0 + # np.stack will upcast the dtype later + binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8)) + bin_edges.append(np.array([0] * n_bins)) + continue + + # Binning of non-zero values + bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1)) + non_zero_digits = _digitize(non_zero_row, bins) + assert non_zero_digits.min() >= 1 + assert non_zero_digits.max() <= n_bins - 1 + binned_rows.append(non_zero_digits) + + bin_edges.append(np.concatenate([[0], bins])) + +# Create new CSR matrix +logger.info("Creating a new CSR matrix of the binned count values") +binned_counts = csr_matrix( + ( + np.concatenate(binned_rows, casting="same_kind"), + layer_data.indices, + layer_data.indptr, + ), + shape=layer_data.shape, +) + +# Set binned values and bin edges layers to adata object +input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts +input_adata.obsm["bin_edges"] = np.stack(bin_edges) + +# Write mudata output +logger.info("Writing output data") +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/binning/test.py b/src/scgpt/binning/test.py new file mode 100644 index 00000000..4cfad752 --- /dev/null +++ b/src/scgpt/binning/test.py @@ -0,0 +1,59 @@ +import pytest +import sys +import mudata as mu +from scipy.sparse import issparse + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/docker/scgpt/binning/binning", + "temp_dir": "tmp", + "config": "./target/docker/scgpt/binning/.config.vsh.yaml", +} +## VIASH END + + +def test_binning(run_component, tmp_path): + input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu" + output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu" + + run_component( + [ + "--input", + input_file_path, + "--modality", + "rna", + "--output_obsm_binned_counts", + "binned_counts", + "--n_input_bins", + "51", + "--var_input", + "filter_with_hvg", + "--output", + output_file_path, + ] + ) + + # Read output file + output_mdata = mu.read(output_file_path) + output_adata = output_mdata.mod["rna"] + + # Check presence of binning layers + assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), ( + "Binning obsm fields were not added." + ) + + # Check bin edges + bin_edges = output_adata.obsm["bin_edges"] + assert all(bin_edges[:, 0] == 0) + assert bin_edges.shape[1] == 51 + assert all(all(i >= 0) for i in bin_edges) + + # Check binned values + binned_values = output_adata.obsm["binned_counts"] + assert issparse(binned_values) + assert (binned_values.data <= 51).all(axis=None) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cell_type_annotation/config.vsh.yaml b/src/scgpt/cell_type_annotation/config.vsh.yaml new file mode 100644 index 00000000..165cb3b7 --- /dev/null +++ b/src/scgpt/cell_type_annotation/config.vsh.yaml @@ -0,0 +1,164 @@ +name: cell_type_annotation +namespace: "scgpt" +scope: "public" +description: | + Annotate gene expression data with cell type classes through the scGPT model. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + +argument_groups: + - name: Model input + arguments: + - name: "--model" + type: file + required: true + example: best_model.pt + description: | + The model file containing checkpoints and cell type label mapper. + - name: "--model_config" + type: file + required: true + example: args.json + description: | + The model configuration file. + - name: "--model_vocab" + type: file + required: true + example: vocab.json + description: | + Model vocabulary file directory. + - name: "--finetuned_checkpoints_key" + type: string + default: model_state_dict + description: | + Key in the model file containing the pretrained checkpoints. + - name: "--label_mapper_key" + type: string + default: id_to_class + description: | + Key in the model file containing the cell type class to label mapper dictionary. + + - name: Query input + arguments: + - name: "--input" + type: file + direction: input + required: true + example: scgpt_preprocess_ouput.h5mu + description: | + The input h5mu file containing of data that have been pre-processed (normalized, binned, genes cross-checked and tokenized). + - name: "--modality" + description: + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obs_batch_label" + type: string + required: false + description: | + The name of the adata.obs column containing the batch labels. Required if dsbn is set to true. + - name: "--obsm_gene_tokens" + type: string + default: "gene_id_tokens" + description: | + The key of the .obsm array containing the gene token ids + - name: "--obsm_tokenized_values" + type: string + default: values_tokenized + description: | + The key of the .obsm array containing the count values of the tokenized genes + + - name: Outputs + arguments: + - name: "--output" + type: file + direction: output + required: true + example: output.h5mu + description: | + The output mudata file. + - name: "--output_obs_predictions" + type: string + default: "scgpt_pred" + required: false + description: | + The name of the adata.obs column to write predicted cell type labels to. + - name: "--output_obs_probability" + type: string + default: "scgpt_probability" + required: false + description: | + The name of the adata.obs column to write the probabilities of the predicted cell type labels to. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--pad_token" + type: string + default: "" + required: false + description: | + The padding token used in the model. + - name: "--pad_value" + type: integer + default: -2 + required: false + description: | + The value of the padding. + - name: "--n_input_bins" + type: integer + default: 51 + required: false + description: | + The number of input bins. + - name: "--batch_size" + type: integer + default: 64 + required: false + description: | + The batch size. + - name: "--dsbn" + type: boolean + default: true + required: false + description: | + Whether to use domain-specific batch normalization. + - name: "--seed" + type: integer + description: | + Seed for random number generation. If not specified, no seed is used. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu + - path: /resources_test/scgpt/source/args.json + - path: /resources_test/scgpt/source/vocab.json + - path: /resources_test/scgpt/finetuned_model/best_model.pt + +engines: + - type: docker + image: nvcr.io/nvidia/pytorch:23.09-py3 + setup: + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + - type: python + packages: + - scgpt==0.2.1 + test_setup: + - type: python + __merge__: [/src/base/requirements/scanpy.yaml] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/cell_type_annotation/script.py b/src/scgpt/cell_type_annotation/script.py new file mode 100644 index 00000000..c3487712 --- /dev/null +++ b/src/scgpt/cell_type_annotation/script.py @@ -0,0 +1,255 @@ +import sys +import json +from multiprocessing import freeze_support +import os +import mudata as mu +from typing import Dict +import warnings +import torch +import numpy as np +from torch.nn import functional +from torch.utils.data import Dataset, DataLoader +from scgpt.model import TransformerModel +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.utils import set_seed +from tqdm import tqdm + +## VIASH START +par = { + "input": r"resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", + "modality": r"rna", + "model": r"resources_test/scgpt/finetuned_model/best_model.pt", + "model_config": r"resources_test/scgpt/source/args.json", + "model_vocab": r"resources_test/scgpt/source/vocab.json", + "obs_batch_label": r"sample", + "obsm_gene_tokens": r"gene_id_tokens", + "obsm_tokenized_values": r"values_tokenized", + "output": r"output.h5mu", + "output_compression": None, + "output_obs_predictions": r"predictions", + "output_obs_probability": r"probabilities", + "dsbn": True, + "seed": 0, + "pad_token": "", + "pad_value": -2, + "n_input_bins": 51, + "batch_size": 64, + "finetuned_checkpoints_key": "model_state_dict", + "label_mapper_key": "id_to_class", +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +class SeqDataset(Dataset): + def __init__(self, data: Dict[str, torch.Tensor]): + self.data = data + + def __len__(self): + return self.data["gene_ids"].shape[0] + + def __getitem__(self, idx): + return {k: v[idx] for k, v in self.data.items()} + + +def main(): + # Setting seed + if par["seed"]: + set_seed(par["seed"]) + + # Setting device + logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Read in data + logger.info("Reading in data") + mdata = mu.read(par["input"]) + input_adata = mdata.mod[par["modality"]] + adata = input_adata.copy() + + # Fetch batch ids for domain-specific batch normalization + if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." + ) + elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) + elif not par["dsbn"]: + # forward pass requires a tensor as input + batch_ids = np.zeros(adata.shape[0]) + + # Vocabulary configuration + logger.info("Loading model vocabulary") + special_tokens = [par["pad_token"], "", ""] + logger.info(f"Loading model vocab from {par['model_vocab']}") + vocab_file = par["model_vocab"] + vocab = GeneVocab.from_file(vocab_file) + [vocab.append_token(s) for s in special_tokens if s not in vocab] + vocab.set_default_index(vocab[par["pad_token"]]) + ntokens = len(vocab) + + # Model configuration + logger.info("Loading model and configurations") + model_config_file = par["model_config"] + with open(model_config_file, "r") as f: + model_configs = json.load(f) + embsize = model_configs["embsize"] + nhead = model_configs["nheads"] + d_hid = model_configs["d_hid"] + nlayers = model_configs["nlayers"] + + # Ensure the provided model has the correct architecture + logger.info("Loading model") + model_file = par["model"] + model_dict = torch.load(model_file, map_location=device) + for k, v in { + "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], + "--label_mapper_key": par["label_mapper_key"], + }.items(): + if v not in model_dict.keys(): + raise KeyError( + f"The key '{v}' provided for '{k}' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) + pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] + + # Label mapper configuration + logger.info("Loading label mapper") + label_mapper = model_dict[par["label_mapper_key"]] + cell_type_mapper = {int(k): v for k, v in label_mapper.items()} + n_cls = len(cell_type_mapper) + + # Model instatiation + logger.info("Instantiating model") + model = TransformerModel( + ntokens, + d_model=embsize, # self.encoder (GenEncoder), self.value_encoder (ContinuousValueEncoder), self.transformerencoder(TransformerEncoderLayer) + nhead=nhead, # self.transformer_encoder(TransformerEncoderLayer) + d_hid=d_hid, # self.transformer_encoder(TransformerEncoderLayer) + nlayers=nlayers, # self.transformer_encoder(TransformerEncoderLayer), self.cls_decoder + nlayers_cls=3, # self.cls_decoder + n_cls=n_cls, # self.cls_decoder + vocab=vocab, + dropout=0.2, # self.transformer_encoder + pad_token=par["pad_token"], + pad_value=par["pad_value"], + do_mvc=False, + do_dab=False, + use_batch_labels=par["dsbn"], + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", + n_input_bins=par["n_input_bins"], + cell_emb_style="cls", # required for cell-type annotation + use_fast_transformer=False, # TODO: parametrize when GPU is available + fast_transformer_backend="flash", # TODO: parametrize when GPU is available + pre_norm=False, # TODO: parametrize when GPU is available + ) + + # Load model params + logger.info(f"Loading model params from {model_file}") + try: + model.load_state_dict(pretrained_dict) + except RuntimeError: + logger.info("only load params that are in the model and match the size") + model_dict = model.state_dict() + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if k in model_dict and v.shape == model_dict[k].shape + } + for k, v in pretrained_dict.items(): + logger.info(f"Loading params {k} with shape {v.shape}") + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + + model.to(device) + + # Load tokenized gene data + logger.info("Loading data for inference") + for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + }.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + + input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] + input_values = adata.obsm[par["obsm_tokenized_values"]] + + data_pt = { + "gene_ids": input_gene_ids, + "values": input_values, + "batch_labels": torch.from_numpy(batch_ids).long(), + } + + data_loader = DataLoader( + dataset=SeqDataset(data_pt), + batch_size=par["batch_size"], + num_workers=min(os.cpu_count(), par["batch_size"] // 2), + pin_memory=True, + ) + + # Inference + logger.info("Predicting cell type classes") + model.eval() + predictions = [] + probabilities = [] + with torch.no_grad(): + for batch_data in tqdm(data_loader): + input_gene_ids = batch_data["gene_ids"].to(device) + input_values = batch_data["values"].to(device) + batch_labels = batch_data["batch_labels"].to(device) + + src_key_padding_mask = input_gene_ids.eq(vocab[par["pad_token"]]) + with torch.cuda.amp.autocast(enabled=False): + output_dict = model( + input_gene_ids, + input_values, + src_key_padding_mask=src_key_padding_mask, + batch_labels=batch_labels if par["dsbn"] else None, + CLS=True, # Return celltype classification objective output + CCE=False, + MVC=False, + ECS=False, + ) + output_values = output_dict["cls_output"] + + preds = output_values.argmax(1).cpu().numpy() + predictions.append(preds) + + probs = functional.softmax(output_values, dim=1).max(1)[0] + probabilities.append(probs.cpu().numpy()) + + predictions = np.concatenate(predictions, axis=0) + probabilities = np.concatenate(probabilities, axis=0) + + # Assign cell type labels to predicted classes + logger.info("Assigning cell type predictions and probabilities") + adata.obs["scgpt_class_pred"] = predictions + adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( + lambda x: cell_type_mapper[x] + ) + adata.obs[par["output_obs_probability"]] = probabilities + + # Write output + logger.info("Writing output data") + mdata.mod[par["modality"]] = adata + mdata.write(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + freeze_support() + warnings.filterwarnings("ignore") + main() diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py new file mode 100644 index 00000000..2d2eb9bd --- /dev/null +++ b/src/scgpt/cell_type_annotation/test.py @@ -0,0 +1,215 @@ +import pytest +from mudata import read_h5mu +import sys +import subprocess +import re + + +input_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" +ft_model = f"{meta['resources_dir']}/best_model.pt" +model_config = f"{meta['resources_dir']}/args.json" +model_vocab = f"{meta['resources_dir']}/vocab.json" + + +def test_cell_type_inference(run_component, tmp_path): + output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" + + args = [ + "--input", + input_path, + "--output", + output_annotation_file, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--obs_batch_label", + "sample", + "--dsbn", + "True", + ] + run_component(args) + + output_mudata = read_h5mu(output_annotation_file) + output_adata = output_mudata.mod["rna"] + assert "scgpt_pred" in output_adata.obs.keys(), ( + "scgpt_pred is not present in anndata obs keys" + ) + assert "scgpt_probability" in output_adata.obs.keys(), ( + "scgpt_probability is not present in anndata obs keys" + ) + + # run withou dsbn + output_annotation_file_without_dsbn = ( + tmp_path / "Kim2020_Lung_subset_annotated_no_dsbn.h5mu" + ) + args = [ + "--input", + input_path, + "--output", + output_annotation_file_without_dsbn, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "False", + ] + run_component(args) + # Read output file + output_mdata_no_dsbn = read_h5mu(output_annotation_file_without_dsbn) + output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] + + # Assert that embeddings without dsbn are different + assert not ( + output_adata.obs["scgpt_pred"].astype(str) + == output_adata_no_dsbn.obs["scgpt_pred"].astype(str) + ).all(), "Cell type predictions with and without dsbn are the same" + + +def test_annotation_dsbn_without_batch_labels(run_component, tmp_path): + output_annotation_labels_without_dsbn = ( + tmp_path / "Kim2020_Lung_subset_annotated_labels_without_dsbn.h5mu" + ) + + args = [ + "--input", + input_path, + "--output", + output_annotation_labels_without_dsbn, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--dsbn", + "True", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: When dsbn is set to True, you are required to provide batch labels \(obs_batch_labels\)\.", + err.value.stdout.decode("utf-8"), + ) + + +def test_annotation_non_existing_keys(run_component, tmp_path): + output_annotation_dummy_values = ( + tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" + ) + + # Test for non-existing tokenized values key + args = [ + "--input", + input_path, + "--output", + output_annotation_dummy_values, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "dummy_values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "True", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', + err.value.stdout.decode("utf-8"), + ) + + +def test_checkpoint_architecture(run_component, tmp_path): + output_dummy_model_key = tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" + + # Test for non-existing model file keys + args = [ + "--input", + input_path, + "--output", + output_dummy_model_key, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "dummy_checkpoints_key", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "True", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r'KeyError: "The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper."', + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cross_check_genes/config.vsh.yaml b/src/scgpt/cross_check_genes/config.vsh.yaml new file mode 100644 index 00000000..c51bd606 --- /dev/null +++ b/src/scgpt/cross_check_genes/config.vsh.yaml @@ -0,0 +1,99 @@ +name: cross_check_genes +namespace: "scgpt" +scope: "public" +description: | + Cross-check genes with pre-trained scGPT model. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + direction: input + required: true + example: input.h5mu + description: | + The input h5mu file containing of pre-processed data. + - name: "--modality" + type: string + default: "rna" + required: false + description: | + The modality key of the MuData object containing the RNA AnnData object. + - name: "--vocab_file" + type: file + direction: input + required: true + example: resources_test/scgpt/vocab.json + description: | + Model vocabulary file path. + - name: "--input_var_gene_names" + type: string + example: "gene_name" + required: false + description: | + The name of the adata.var column containing gene names. By default the .var index will be used. + - name: "--var_input" + type: string + required: false + description: ".var column containing highly variable genes. If provided, will only cross-check HVG filtered genes with model vocabulary." + - name: Outputs + arguments: + - name: "--output" + type: file + direction: output + required: true + example: output.h5mu + description: | + The output cross-checked anndata file. + - name: "--output_var_filter" + type: string + default: "id_in_vocab" + description: In which .var slot to store a boolean array corresponding to which observations should be filtered out based on HVG and model vocabulary. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--pad_token" + type: string + default: "" + required: false + description: | + The padding token used in the model. +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu + - path: /resources_test/scgpt/source/vocab.json + +engines: + - type: docker + image: nvcr.io/nvidia/pytorch:23.09-py3 + setup: + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + - type: python + packages: + - scgpt==0.2.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ lowmem, lowcpu ] \ No newline at end of file diff --git a/src/scgpt/cross_check_genes/script.py b/src/scgpt/cross_check_genes/script.py new file mode 100644 index 00000000..c181d815 --- /dev/null +++ b/src/scgpt/cross_check_genes/script.py @@ -0,0 +1,70 @@ +import sys +import mudata as mu +from scgpt.tokenizer.gene_tokenizer import GeneVocab + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu", + "output": "output.h5mu", + "modality": "rna", + "input_var_gene_names": None, + "output_var_filter": "id_in_vocab", + "pad_token": "", + "var_input": "filter_with_hvg", + "vocab_file": "resources_test/scgpt/source/vocab.json", + "output_compression": None, +} + +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# Read in data +logger.info(f"Reading {par['input']}") +mudata = mu.read_h5mu(par["input"]) +adata = mudata.mod[par["modality"]].copy() + +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +if not par["input_var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +elif par["input_var_gene_names"] not in adata.var.columns: + raise ValueError( + f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." + ) +else: + genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() + +# Cross-check genes with pre-trained model +logger.info(f"Loading model vocab from {par['vocab_file']}") +vocab_file = par["vocab_file"] +vocab = GeneVocab.from_file(vocab_file) +[vocab.append_token(s) for s in special_tokens if s not in vocab] + +if par["var_input"]: + logger.info("Filtering genes based on model vocab and HVG") + filter_with_hvg = adata.var[par["var_input"]].tolist() + gene_filter_mask = [ + 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) + ] + logger.info( + f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" + ) +else: + logger.info("Filtering genes based on model vocab") + gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] + logger.info( + f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" + ) + +logger.info(f"Writing to {par['output']}") +adata.var[par["output_var_filter"]] = gene_filter_mask +adata.var[par["output_var_filter"]] = adata.var[par["output_var_filter"]].astype("bool") +mudata.mod[par["modality"]] = adata +mudata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/cross_check_genes/test.py b/src/scgpt/cross_check_genes/test.py new file mode 100644 index 00000000..bb8c53a1 --- /dev/null +++ b/src/scgpt/cross_check_genes/test.py @@ -0,0 +1,96 @@ +import pytest +import subprocess +from mudata import read_h5mu +import re +import sys + +## VIASH START +meta = { + "executable": "./target/docker/scgpt/cross_check/cross_check", + "resources_dir": "./resources_test/scgpt/", + "config": "./src/scgpt/cross_check/config.vsh.yaml", +} +## VIASH END + +input_path = meta["resources_dir"] + "/Kim2020_Lung_subset_preprocessed.h5mu" +vocab_path = meta["resources_dir"] + "/vocab.json" + + +def test_cross_check(run_component, random_path): + output_path = random_path(extension="h5mu") + args = [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--vocab_file", + vocab_path, + "--output_compression", + "gzip", + ] + run_component(args) + + output_mudata = read_h5mu(output_path) + + # Check added columns + assert {"gene_name", "id_in_vocab"}.issubset( + set(output_mudata.mod["rna"].var.columns) + ), "Gene columns were not added." + # Check if genes were filtered + assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( + output_mudata.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered." + + output_hvg_path = random_path(extension="h5mu") + args_hvg = [ + "--input", + input_path, + "--output", + output_hvg_path, + "--modality", + "rna", + "--var_input", + "filter_with_hvg", + "--vocab_file", + vocab_path, + "--output_compression", + "gzip", + ] + + run_component(args_hvg) + + output_mudata_hvg = read_h5mu(output_hvg_path) + # Check if genes were filtered based on HVG + assert sum(output_mudata_hvg.mod["rna"].var["id_in_vocab"]) != len( + output_mudata_hvg.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered." + assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( + output_mudata_hvg.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered based on HVG." + + +def test_cross_check_invalid_gene_layer_raises(run_component, random_path): + output_path = random_path(extension="h5mu") + args = [ + "--input", + input_path, + "--output", + output_path, + "--vocab_file", + vocab_path, + "--input_var_gene_names", + "dummy_var", + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: Gene name column 'dummy_var' not found in .mod\['rna'\]\.obs\.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/embedding/config.vsh.yaml b/src/scgpt/embedding/config.vsh.yaml new file mode 100644 index 00000000..fd238696 --- /dev/null +++ b/src/scgpt/embedding/config.vsh.yaml @@ -0,0 +1,147 @@ +name: embedding +namespace: scgpt +scope: "public" +description: | + Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + direction: input + required: true + example: input.h5mu + description: | + The input h5mu file containing tokenized gene and count data. + - name: "--modality" + description: | + Which modality from the input MuData file to process. + + type: string + default: "rna" + required: false + - name: "--model" + type: file + direction: input + required: true + example: best_model.pt + description: | + Path to scGPT model file. + - name: "--model_vocab" + type: file + direction: input + required: true + example: vocab.json + description: | + Path to scGPT model vocabulary file. + - name: "--model_config" + type: file + direction: input + required: true + example: args.json + description: | + Path to scGPT model config file. + - name: "--obsm_gene_tokens" + type: string + default: "gene_id_tokens" + description: | + The key of the .obsm array containing the gene token ids + example: values.pt + - name: "--obsm_tokenized_values" + type: string + default: values_tokenized + description: | + The key of the .obsm array containing the count values of the tokenized genes + - name: "--obsm_padding_mask" + type: string + default: padding_mask + description: | + The key of the .obsm array containing the padding mask. + - name: "--var_gene_names" + type: string + description: | + The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. + - name: "--obs_batch_label" + type: string + description: | + The name of the adata.obs column containing the batch labels. Must be provided when 'dsbn' is set to True. + - name: "--finetuned_checkpoints_key" + type: string + required: false + example: model_state_dict + description: | + Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. + + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + description: | + Path to output anndata file containing pre-processed data as well as scGPT embeddings. + direction: output + example: output.h5mu + - name: "--obsm_embeddings" + type: string + default: "X_scGPT" + description: | + The name of the adata.obsm array to which scGPT embeddings will be written. + __merge__: [., /src/base/h5_compression_argument.yaml] + - name: Arguments + arguments: + - name: "--pad_token" + type: string + default: "" + description: | + The token to be used for padding. + - name: "--pad_value" + type: integer + default: -2 + description: | + The value of the padding token. + - name: "--dsbn" + type: boolean + default: true + description: | + Whether to apply domain-specific batch normalization for generating embeddings. When set to True, 'obs_batch_labels' must be set as well. + - name: "--batch_size" + type: integer + default: 64 + description: | + The batch size to be used for inference + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/scgpt/source + - path: /resources_test/scgpt/finetuned_model + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu + +engines: + - type: docker + image: nvcr.io/nvidia/pytorch:23.09-py3 + setup: + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + - type: python + packages: + - scgpt==0.2.1 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml ] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu, gpu ] diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py new file mode 100644 index 00000000..b78d42c3 --- /dev/null +++ b/src/scgpt/embedding/script.py @@ -0,0 +1,187 @@ +import sys +import numpy as np +import mudata as mu +import json +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.model import TransformerModel +from scgpt.utils.util import load_pretrained +import torch + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask", + "model": "resources_test/scgpt/source/best_model.pt", + "model_config": "resources_test/scgpt/source/args.json", + "model_vocab": "resources_test/scgpt/source/vocab.json", + "output": "Kim2020_Lung_embedded.h5ad", + "var_gene_names": "gene_name", + "obs_batch_label": "sample", + "obsm_embeddings": "X_scGPT", + "pad_token": "", + "pad_value": -2, + "batch_size": 64, + "modality": "rna", + "dsbn": True, + "n_input_bins": 51, +} +meta = { + "resources_dir": "src/utils", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + "--obsm_padding_mask": par["obsm_padding_mask"], +}.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + +all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] +all_values = adata.obsm[par["obsm_tokenized_values"]] +padding_mask = adata.obsm[par["obsm_padding_mask"]] + +# Fetch batch ids for domain-specific batch normalization +if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." + ) +elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) +elif not par["dsbn"] and par["obs_batch_label"]: + logger.info( + "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." + ) + +# Set padding specs +logger.info("Setting padding specs") +pad_token = par["pad_token"] +pad_value = par["pad_value"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +logger.info("Fetching gene names") +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Model files +logger.info("Loading model, vocab and configs") +model_config_file = par["model_config"] +model_file = par["model"] +vocab_file = par["model_vocab"] + +# Load vocab +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Load model configs +with open(model_config_file, "r") as f: + model_configs = json.load(f) +embsize = model_configs["embsize"] +nhead = model_configs["nheads"] +d_hid = model_configs["d_hid"] +nlayers = model_configs["nlayers"] + +# Instantiate model +logger.info("Initializing transformer model") +model = TransformerModel( + ntokens, + d_model=embsize, + nhead=nhead, + d_hid=d_hid, + nlayers=nlayers, + vocab=vocab, + dropout=0.5, # scGPT default, only relevant for fine-tuning applications + pad_token=pad_token, + pad_value=pad_value, + nlayers_cls=3, # only applicable for decoder-based operations + n_cls=1, # only applicable for decoder-based operations + do_mvc=False, # only applicable for decoder-based operations + ecs_threshold=0.8, # only applicable for decoder-based operations + do_dab=False, # only applicable for decoder-based operations + use_batch_labels=False, # only applicable for decoder-based operations + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", # scGPT default + explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported + use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported + # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported + pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported +) + + +logger.info("Loading model") +model_file = par["model"] +model_dict = torch.load(model_file, map_location=device) + +# Ensure the provided model has the correct architecture +finetuned_checkpoints_key = par.get("finetuned_checkpoints_key") +if finetuned_checkpoints_key: + try: + model_dict = model_dict[finetuned_checkpoints_key] + except KeyError as e: + raise ValueError( + f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) from e + +# Load model +load_pretrained(model, model_dict, verbose=False) + +# Embed tokenized data +logger.info("Converting tokenized input data to embeddings") +model.to(device) +model.eval() + +cell_embeddings = model.encode_batch( + torch.from_numpy(all_gene_ids), + torch.from_numpy(all_values).float(), + src_key_padding_mask=torch.from_numpy(padding_mask), + batch_size=par["batch_size"], + batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, + output_to_cpu=True, + time_step=0, + return_np=True, +) + +cell_embeddings = cell_embeddings / np.linalg.norm( + cell_embeddings, axis=1, keepdims=True +) + +# Write output +logger.info("Writing output data") +adata.obsm[par["obsm_embeddings"]] = cell_embeddings +mdata.mod[par["modality"]] = adata +mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py new file mode 100644 index 00000000..8ba9b190 --- /dev/null +++ b/src/scgpt/embedding/test.py @@ -0,0 +1,350 @@ +import pytest +import subprocess +import re +import sys +import mudata as mu +import numpy as np + + +## VIASH START +meta = { + "resources_dir": "resources_test", +} +## VIASH END + +input = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" +model_file = f"{meta['resources_dir']}/source/best_model.pt" +ft_model_file = f"{meta['resources_dir']}/finetuned_model/best_model.pt" +vocab_file = f"{meta['resources_dir']}/source/vocab.json" +model_config_file = f"{meta['resources_dir']}/source/args.json" +input_file = mu.read(input) + + +def test_integration_embedding(run_component, tmp_path): + output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + run_component( + [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + "--batch_size", + "4", + ] + ) + + # Read output file + output_mdata = mu.read(output_embedding_file) + output_adata = output_mdata.mod["rna"] + + # check that embedding obs is present + assert "X_scGPT" in output_adata.obsm.keys(), ( + "X_scGPT is not present in anndata obsm keys" + ) + + # check embedding size + assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( + "Embedding size does not equal 512" + ) + + # check embedding value range + assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( + "Embedding values are nan" + ) + assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( + "Range of embedding values is outside of [-1, 1]" + ) + + # Run embeddings without dsbn + output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + run_component( + [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "False", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file_without_dsbn, + "--batch_size", + "4", + ] + ) + + # Read output file + output_mdata_no_dsbn = mu.read(output_embedding_file_without_dsbn) + output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] + + # Assert that embeddings without dsbn are different + assert not ( + output_adata.obsm["X_scGPT"] == output_adata_no_dsbn.obsm["X_scGPT"] + ).all(), "Embeddings with and without dsbn are the same" + + +def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path): + output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + args = [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: When dsbn is set to True, you are required to provide batch labels \(input_obs_batch_labels\)\.", + err.value.stdout.decode("utf-8"), + ) + + +def test_integration_embedding_non_existing_keys(run_component, tmp_path): + output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + # Test for non-existing gene names key + args_1 = [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--var_gene_names", + "dummy_gene_name_key", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_1) + assert re.search( + r"KeyError: \'dummy_gene_name_key\'", err.value.stdout.decode("utf-8") + ) + + # Test for non-existing batch label key + args_2 = [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "dummy_batch_label_key", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_2) + assert re.search( + r"KeyError: \'dummy_batch_label_key\'", err.value.stdout.decode("utf-8") + ) + + # Test for non-existing tokenized values key + args_3 = [ + "--input", + input, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "dummy_values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args_3) + assert re.search( + r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', + err.value.stdout.decode("utf-8"), + ) + + +def test_finetuned_model(run_component, tmp_path): + output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + run_component( + [ + "--input", + input, + "--modality", + "rna", + "--model", + ft_model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--finetuned_checkpoints_key", + "model_state_dict", + "--output", + output_embedding_file, + "--batch_size", + "4", + ] + ) + + # Read output file + output_mdata = mu.read(output_embedding_file) + output_adata = output_mdata.mod["rna"] + + # check that embedding obs is present + assert "X_scGPT" in output_adata.obsm.keys(), ( + "X_scGPT is not present in anndata obsm keys" + ) + + # check embedding size + assert output_adata.obsm["X_scGPT"].shape[1] == 512, ( + "Embedding size does not equal 512" + ) + + # check embedding value range + assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), ( + "Embedding values are nan" + ) + assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), ( + "Range of embedding values is outside of [-1, 1]" + ) + + +def test_finetuned_model_architecture(run_component, tmp_path): + output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + + args = [ + "--input", + input, + "--modality", + "rna", + "--model", + ft_model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--finetuned_checkpoints_key", + "dummy_checkpoints_key", + "--output", + output_embedding_file, + ] + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component(args) + assert re.search( + r"ValueError: The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/pad_tokenize/config.vsh.yaml b/src/scgpt/pad_tokenize/config.vsh.yaml new file mode 100644 index 00000000..6efa9a01 --- /dev/null +++ b/src/scgpt/pad_tokenize/config.vsh.yaml @@ -0,0 +1,125 @@ +name: pad_tokenize +namespace: "scgpt" +scope: "public" +description: | + Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + direction: input + required: true + example: input.h5mu + description: | + The input h5mu file of pre-processed data. + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--model_vocab" + type: file + direction: input + required: true + example: vocab.json + description: | + Path to model vocabulary file. + - name: "--var_gene_names" + type: string + required: false + description: | + The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used. + - name: "--var_input" + type: string + default: "id_in_vocab" + description: | + The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes. + - name: "--input_obsm_binned_counts" + type: string + default: "binned_counts" + required: false + description: | + The name of the .obsm field containing the binned counts to be padded and tokenized. + + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + description: | + The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask. + direction: output + example: output.h5mu + - name: "--obsm_gene_tokens" + type: string + default: "gene_id_tokens" + description: | + The key of the .obsm array containing the gene token ids + example: values.pt + - name: "--obsm_tokenized_values" + type: string + default: values_tokenized + description: | + The key of the .obsm array containing the count values of the tokenized genes + - name: "--obsm_padding_mask" + type: string + default: padding_mask + description: | + The key of the .obsm array containing the padding mask. + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: Arguments + arguments: + - name: "--pad_token" + type: string + default: "" + required: false + description: | + Token used for padding. + - name: "--pad_value" + type: integer + default: -2 + required: false + description: | + The value of the padding token. + - name: "--max_seq_len" + type: integer + description: | + The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/subset_vars.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/scgpt/ + +engines: + - type: docker + image: nvcr.io/nvidia/pytorch:23.09-py3 + setup: + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + - type: python + packages: + - scgpt==0.2.1 + - ipython~=8.5.0 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/scgpt/pad_tokenize/script.py b/src/scgpt/pad_tokenize/script.py new file mode 100644 index 00000000..641e28a1 --- /dev/null +++ b/src/scgpt/pad_tokenize/script.py @@ -0,0 +1,111 @@ +import sys +import mudata as mu +import numpy as np +from scipy.sparse import issparse +from scgpt.tokenizer import tokenize_and_pad_batch +from scgpt.tokenizer.gene_tokenizer import GeneVocab + + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu", + "model_vocab": "resources_test/scgpt/source/vocab.json", + "output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", + "pad_token": "", + "pad_value": -2, + "modality": "rna", + "input_obsm_binned_counts": "binned_counts", + "max_seq_len": None, + "var_gene_names": None, + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask", + "output_compression": None, + "var_input": "id_in_vocab", +} +meta = {"resources_dir": "src/utils/"} + +# mdata = mu.read(par["input"]) +# mdata.mod["rna"].obsm["binned_counts"] = mdata.mod["rna"].layers["binned"] +# mdata.write_h5mu(par["input"]) +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +adata = subset_vars(adata, par["var_input"]) + +# Set padding specs +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] +pad_value = -2 + +logger.info("Fetching counts and gene names") +# Fetch counts +all_counts = ( + adata.obsm[par["input_obsm_binned_counts"]].toarray() + if issparse(adata.obsm[par["input_obsm_binned_counts"]]) + else adata.obsm[par["input_obsm_binned_counts"]] +) + +# Fetching gene names +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Fetch gene names and look up tokens in vocab +logger.info("Reading in vocab and fetching gene tokens") +vocab_file = par["model_vocab"] +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Fetch max seq len +if not par["max_seq_len"]: + max_seq_len = adata.var.shape[0] + 1 +else: + max_seq_len = par["max_seq_len"] + +# Tokenize and pad data +logger.info( + f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." +) +tokenized_data = tokenize_and_pad_batch( + all_counts, + gene_ids, + max_len=max_seq_len, + vocab=vocab, + pad_token=pad_token, + pad_value=pad_value, + append_cls=True, # append token at the beginning, + include_zero_gene=False, + return_pt=True, + mod_type=None, + vocab_mod=None, +) + +all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] +padding_mask = all_gene_ids.eq(vocab[pad_token]) + +logger.info("Writing output data") +input_adata.obsm[par["obsm_gene_tokens"]] = all_gene_ids.numpy() +input_adata.obsm[par["obsm_tokenized_values"]] = all_values.numpy() +input_adata.obsm[par["obsm_padding_mask"]] = padding_mask.numpy() + +mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py new file mode 100644 index 00000000..e79a9b6c --- /dev/null +++ b/src/scgpt/pad_tokenize/test.py @@ -0,0 +1,113 @@ +import pytest +import sys +import mudata as mu +from scgpt.tokenizer.gene_tokenizer import GeneVocab + +## VIASH START +meta = { + "resources_dir": "resources_test/scgpt", + "executable": "./target/docker/scgpt/integration_pad_tokenize/integration_pad_tokenize", + "temp_dir": "tmp", + "config": "./target/docker/scgpt/integration_pad_tokenize/.config.vsh.yaml", +} +## VIASH END + +input_file = ( + f"{meta['resources_dir']}/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu" +) +vocab_file = f"{meta['resources_dir']}/scgpt/source/vocab.json" +vocab = GeneVocab.from_file(vocab_file) + + +def test_integration_pad_tokenize(run_component, tmp_path): + output = tmp_path / "Kim2020_Lung_tokenized.h5mu" + + run_component( + [ + "--input", + input_file, + "--output", + output, + "--modality", + "rna", + "--var_input", + "scgpt_cross_checked_genes", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--pad_token", + "", + "--pad_value", + "-2", + "--input_obsm_binned_counts", + "binned_counts", + "--model_vocab", + vocab_file, + ] + ) + + output_file = mu.read(output) + output_adata = output_file.mod["rna"] + + gene_ids = output_adata.obsm["gene_id_tokens"] + values = output_adata.obsm["values_tokenized"] + padding_mask = output_adata.obsm["padding_mask"] + + # check output dimensions + ## nr of genes that are tokenized + assert gene_ids.shape[1] <= output_adata.var.shape[0] + 1, ( + "gene_ids shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" + ) + assert values.shape[1] <= output_adata.var.shape[0] + 1, ( + "values shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" + ) + assert padding_mask.shape[1] <= output_adata.var.shape[0] + 1, ( + "padding_mask shape[1] is higher than adata.var.shape[0] (n_hvg + 1)" + ) + + ## equal size of output tensors + assert gene_ids.shape == values.shape, ( + "gene_ids shape[1] does not match values shape[1]" + ) + assert gene_ids.shape == padding_mask.shape, ( + "gene_ids shape[1] does not match padding_mask shape[1]" + ) + + ## check values of output tensors + assert gene_ids.dtype == "int64", "tokenized gene_ids are not integers" + assert (gene_ids > 0).all(), "not all gene id tokens are higher than 0" + + assert values.dtype == "float32", "tokenized values are not floats" + assert (values >= -2).all(), "not all tokenized values are higher than/equal to -2" + + assert padding_mask.dtype == bool, "padding mask is not boolean" + + ## check cls token + assert (gene_ids[:, 0] == vocab[""]).all(), ( + "cls token was not correctly appended at the beginning of the gene_ids tensor" + ) + assert (values[:, 0] == 0).all(), ( + "cls token was not correctly appended at the beginning of the values tensors" + ) + + # check padding values + masked_gene_ids = gene_ids[padding_mask] + unmasked_gene_ids = gene_ids[~padding_mask] + assert all(masked_gene_ids == vocab[""]), ( + "masked gene_ids contain non-pad tokens" + ) + assert all(unmasked_gene_ids != vocab[""]), ( + "unmasked gene_ids contain pad tokens" + ) + + masked_values = values[padding_mask] + unmasked_values = values[~padding_mask] + assert all(masked_values == -2), "masked values contain non-pad values" + assert all(unmasked_values != -2), "unmasked values contain pad values" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/bpcells_regress_out/config.vsh.yaml b/src/transform/bpcells_regress_out/config.vsh.yaml new file mode 100644 index 00000000..cab5a81f --- /dev/null +++ b/src/transform/bpcells_regress_out/config.vsh.yaml @@ -0,0 +1,76 @@ +name: bpcells_regress_out +namespace: "transform" +scope: "public" +description: | + Regress out the effects of confounding variables using a linear least squares regression model with BPCells. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor, author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + - name: "--modality" + type: string + description: "The modality to run this component on." + default: "rna" + - name: "--obs_keys" + type: string + multiple: true + description: The .obs keys to regress on. + - name: "--input_layer" + type: string + required: false + example: X_normalized + description: | + The layer of the adata object to regress on. + If not provided, the X attribute of the adata object will be used. + - name: "--output_layer" + type: string + example: X_regressed + required: false + description: | + The layer of the adata object containing the regressed count data. + If not provided, the X attribute of the adata object will be used. +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: rocker/r2u:22.04 + setup: + - type: docker + env: + - RETICULATE_PYTHON=/usr/bin/python + - type: apt + packages: [ libhdf5-dev, python3, python3-pip, python3-dev, python-is-python3 ] + - type: r + cran: [ anndata, reticulate ] + github: bnprks/BPCells/r + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml ] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu] diff --git a/src/transform/bpcells_regress_out/script.R b/src/transform/bpcells_regress_out/script.R new file mode 100644 index 00000000..a98cdaaf --- /dev/null +++ b/src/transform/bpcells_regress_out/script.R @@ -0,0 +1,67 @@ +cat("Loading libraries\n") +library(glue) +library(BPCells) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +## VIASH START +par <- list( + input = paste0( + "resources_test/pbmc_1k_protein_v3", + "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + ), + output = "output.h5mu", + modality = "rna" +) +## VIASH END + +# Read the h5mu file and make var names unique +mdata <- mudata$read_h5mu(par$input) + +# Regress out +if (!is.null(par$obs_keys) && length(par$obs_keys) > 0) { + cat("Regress out variables ", par$obs_keys, " on modality ", + par$modality, "\n", + sep = "" + ) + + # Fetch modality AnnData and convert to an iterable matrix + adata <- mdata$mod[[par$modality]] + + # Fetch the input layer + mat <- + if (is.null(par$input_layer)) { + cat("Using .X as input layer\n") + adata$X + } else { + cat("Using .layers ", par$input_layer, " as input layer\n", sep = "") + adata$layers[[par$input_layer]] + } + + imat <- as(as(mat, "CsparseMatrix"), "IterableMatrix") + dimnames(imat) <- NULL + + # obs_keys is not NULL and not empty + latent_data <- as.data.frame(adata$obs[, par$obs_keys]) + + # Regress out using BPCells + regressed_data <- regress_out(imat, latent_data, prediction_axis = "col") + + # Convert iterable matrix back to C sparse matrix + rmat <- as(regressed_data, "dgCMatrix") + + # Assign regressed out data back to AnnData object + if (is.null(par$output_layer)) { + cat("Using .X as output layer\n") + adata$X <- rmat + } else { + cat("Using .layers ", par$output_layer, " as output layer\n", sep = "") + adata$layers[[par$output_layer]] <- rmat + } +} else { + cat("No obs_keys provided, skipping regression\n") +} + +# Write to output h5mu file +mdata$write(par$output, compression = par$output_compression) diff --git a/src/transform/bpcells_regress_out/test.py b/src/transform/bpcells_regress_out/test.py new file mode 100644 index 00000000..1c30f393 --- /dev/null +++ b/src/transform/bpcells_regress_out/test.py @@ -0,0 +1,127 @@ +import sys +import pytest +import mudata as mu +import numpy as np + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_data(input_path): + return mu.read_h5mu(input_path) + + +@pytest.fixture +def input_h5mu(input_data): + input_data.obs["var"] = np.random.rand(input_data.n_obs) + input_data.mod["rna"].obs["var"] = input_data.obs["var"] + input_data.mod["prot"].obs["var"] = input_data.obs["var"] + input_data.mod["rna"].layers["input"] = input_data.mod["rna"].X + return input_data + + +@pytest.fixture +def input_h5mu_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +@pytest.fixture +def output_h5mu_path(tmp_path): + return tmp_path / "output.h5mu" + + +def test_regress_out(run_component, input_h5mu_path, output_h5mu_path): + # execute command + cmd_pars = [ + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert output_h5mu_path.is_file(), "No output was created." + + mu_input = mu.read_h5mu(input_h5mu_path) + mu_output = mu.read_h5mu(output_h5mu_path) + + assert "rna" in mu_output.mod, 'Output should contain data.mod["prot"].' + assert "prot" in mu_output.mod, 'Output should contain data.mod["prot"].' + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + prot_in = mu_input.mod["prot"] + prot_out = mu_output.mod["prot"] + + assert rna_in.shape == rna_out.shape, "Should have same shape as before" + assert prot_in.shape == prot_out.shape, "Should have same shape as before" + + assert np.mean(rna_in.X) != np.mean(rna_out.X), "RNA expression should have changed" + assert np.mean(prot_in.X) == np.mean(prot_out.X), ( + "Protein expression should remain the same" + ) + + +def test_no_regress_out_without_obs_keys( + run_component, input_h5mu_path, output_h5mu_path +): + # execute command + cmd_pars = [ + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + mu_input = mu.read_h5mu(input_h5mu_path) + mu_output = mu.read_h5mu(output_h5mu_path) + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + + assert np.mean(rna_in.X) == np.mean(rna_out.X), ( + "RNA expression should remain the same" + ) + + +def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_path): + # execute command + cmd_pars = [ + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--input_layer", + "input", + "--output_layer", + "output", + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + mu_input = mu.read_h5mu(input_h5mu_path) + mu_output = mu.read_h5mu(output_h5mu_path) + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + + assert np.mean(rna_in.layers["input"]) != np.mean(rna_out.layers["output"]), ( + "RNA expression should have changed" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/clr/config.vsh.yaml b/src/transform/clr/config.vsh.yaml new file mode 100644 index 00000000..1f33d847 --- /dev/null +++ b/src/transform/clr/config.vsh.yaml @@ -0,0 +1,76 @@ +name: clr +namespace: "transform" +scope: "public" +description: | + Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017). +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "prot" + required: false + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + - name: "--input_layer" + type: string + description: "Input layer to use. By default, .X is used." + - name: "--output_layer" + type: string + description: Output layer to use. By default, use X. + required: false + - name: "--axis" + type: integer + description: | + Axis across which CLR is performed. If set to 0, CLR is performed across observations (cells). + If set to 1, CLR is performed across features (genes). + default: 0 + required: false + choices: [0, 1] +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - muon~=0.1.5 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowmem, midcpu] + diff --git a/src/transform/clr/script.py b/src/transform/clr/script.py new file mode 100644 index 00000000..d04604e7 --- /dev/null +++ b/src/transform/clr/script.py @@ -0,0 +1,48 @@ +import sys +from muon import prot as pt +from mudata import read_h5ad +from anndata import AnnData +from functools import partial +from operator import setitem + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "prot", + "output": "foo.h5mu", + "layer": None, +} +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + input_data = read_h5ad(par["input"], mod=par["modality"]) + if par["input_layer"]: + input_data = AnnData(X=input_data.layers[par["input_layer"]]) + # CLR always normalizes the .X layer, so we have to create an AnnData file with + # the input layer at .X + normalized_counts = pt.pp.clr(input_data, axis=par["axis"], inplace=False) + if not normalized_counts: + raise RuntimeError("CLR failed to return the requested output layer") + + output_layer_setter = ( + partial(setattr, input_data, "X") + if not par["output_layer"] + else partial(setitem, input_data.layers, par["output_layer"]) + ) + output_layer_setter(normalized_counts.X) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + input_data, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() diff --git a/src/transform/clr/test.py b/src/transform/clr/test.py new file mode 100644 index 00000000..96ce3dd5 --- /dev/null +++ b/src/transform/clr/test.py @@ -0,0 +1,134 @@ +import sys +import pytest +from mudata import read_h5mu +import numpy as np + +## VIASH START +meta = { + "executable": "target/executable/transform/clr/clr", + "resources_dir": "./resources_test/", + "cpus": 2, + "config": "./src/transform/clr/config.vsh.yaml", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +def test_clr(run_component, tmp_path): + output_file = tmp_path / "foo.h5mu" + + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + "--output_layer", + "clr", + ] + ) + assert output_file.is_file() + output_h5mu = read_h5mu(output_file) + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None + input = read_h5mu(input_file) + input_col = input.mod["prot"].X[:, 0].toarray() + result_col = output_h5mu.mod["prot"].layers["clr"][:, 0].toarray() + expected_col = np.log1p( + input_col / np.exp(np.log1p(input_col).sum(axis=0) / input_col.size) + ) + np.testing.assert_allclose(result_col, expected_col) + + +def test_clr_select_input_layer(run_component, tmp_path): + output_file = tmp_path / "foo.h5mu" + + input_data = read_h5mu(input_file) + input_data.mod["prot"].layers["test_layer"] = input_data.mod["prot"].X.copy() + input_data.mod["prot"].X = None + + temp_input_file = tmp_path / "temp.h5mu" + input_data.write(temp_input_file) + + run_component( + [ + "--input", + temp_input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + "--output_layer", + "clr", + "--input_layer", + "test_layer", + ] + ) + assert output_file.is_file() + output_h5mu = read_h5mu(output_file) + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None + + +def test_clr_output_to_x(run_component, tmp_path): + output_file = tmp_path / "foo.h5mu" + + original_x = read_h5mu(input_file).mod["prot"].X + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + ] + ) + assert output_file.is_file() + output_h5mu = read_h5mu(output_file) + assert "clr" not in output_h5mu.mod["prot"].layers + assert not np.all( + np.isclose( + original_x.toarray(), + output_h5mu.mod["prot"].X.toarray(), + rtol=1e-07, + atol=1e-07, + ) + ) + + +def test_clr_set_axis(run_component, tmp_path): + output_file = tmp_path / "foo.h5mu" + + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + "--output_layer", + "clr", + "--axis", + "1", + ] + ) + assert output_file.is_file() + output_h5mu = read_h5mu(output_file) + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None + input = read_h5mu(input_file) + input_row = input.mod["prot"].X[0].toarray() + result_row = output_h5mu.mod["prot"].layers["clr"][0].toarray() + expected_row = np.log1p( + input_row / np.exp(np.log1p(input_row).sum(axis=1) / input_row.size) + ) + np.testing.assert_allclose(result_row, expected_row) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/delete_layer/config.vsh.yaml b/src/transform/delete_layer/config.vsh.yaml new file mode 100644 index 00000000..2a380993 --- /dev/null +++ b/src/transform/delete_layer/config.vsh.yaml @@ -0,0 +1,65 @@ +name: delete_layer +namespace: "transform" +scope: "public" +description: | + Delete an anndata layer from one or more modalities. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--layer" + type: string + required: true + multiple: true + description: "Input layer to remove" + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + - name: "--missing_ok" + type: boolean_true + description: Do not raise an error if the layer does not exist for all modalities. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/compress_h5mu.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [midmem, singlecpu] diff --git a/src/transform/delete_layer/script.py b/src/transform/delete_layer/script.py new file mode 100644 index 00000000..89676bb9 --- /dev/null +++ b/src/transform/delete_layer/script.py @@ -0,0 +1,51 @@ +import sys +from mudata import read_h5ad +from pathlib import Path + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "output": "output.h5mu", + "modality": "rna", + "layer": ["log_normalized"], + "missing_ok": False, + "output_compression": "lzf", +} +meta = {"name": "delete_layer", "resources_dir": "resources_test"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + mod = read_h5ad(input_file, mod=mod_name) + for layer in par["layer"]: + if layer not in mod.layers: + if par["missing_ok"]: + continue + raise ValueError(f"Layer '{layer}' is not present in modality {mod_name}.") + logger.info("Deleting layer %s from modality %s.", layer, mod_name) + del mod.layers[layer] + + logger.info("Writing output to %s.", par["output"]) + + write_h5ad_to_h5mu_with_compression( + output_file, input_file, mod_name, mod, par["output_compression"] + ) + + logger.info("Finished.") + + +if __name__ == "__main__": + main() diff --git a/src/transform/delete_layer/test.py b/src/transform/delete_layer/test.py new file mode 100644 index 00000000..e3b559d1 --- /dev/null +++ b/src/transform/delete_layer/test.py @@ -0,0 +1,104 @@ +import sys +import pytest + +from mudata import read_h5mu +from subprocess import CalledProcessError +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "name": "./target/executable/transform/delete_layer/delete_layer", + "executable": "target/executable/transform/delete_layer/delete_layer", + "config": "src/transform/delete_layer/config.vsh.yaml", + "resources_dir": "./resources_test/", +} +## VIASH END + +input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.mark.parametrize("output_compression", [None, "gzip", "lzf"]) +def test_delete_layer(run_component, random_h5mu_path, output_compression): + temp_input = random_h5mu_path() + temp_output = random_h5mu_path() + + # create input file + input = read_h5mu(input_file) + new_layer = input.mod["rna"].X + input.mod["rna"].layers["test"] = new_layer + assert "test" in input.mod["rna"].layers.keys() + input.write_h5mu(temp_input) + + arguments = [ + "--input", + temp_input, + "--modality", + "rna", + "--layer", + "test", + "--output", + temp_output, + ] + if output_compression: + arguments.extend(["--output_compression", output_compression]) + + # run command + run_component(arguments) + + # check if output is correct + assert temp_output.is_file() + output = read_h5mu(temp_output) + assert "test" not in output.mod["rna"].layers.keys() + assert set(output.mod) == {"rna", "prot"} + + # Test that other data from input remains untouched + del input.mod["rna"].layers["test"] + assert_annotation_objects_equal(input, output) + + +def test_missing_layer_raises(run_component, random_h5mu_path): + output = random_h5mu_path() + with pytest.raises(CalledProcessError) as err: + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--layer", + "test", + "--output", + str(output), + ] + ) + assert not output.is_file() + assert "Layer 'test' is not present in modality rna." in err.value.stdout.decode( + "utf-8" + ) + + +def test_missing_layer_missing_ok(run_component, random_h5mu_path): + output = random_h5mu_path() + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--layer", + "test", + "--output", + str(output), + "--missing_ok", + ] + ) + assert output.is_file() + output_data = read_h5mu(output) + assert "test" not in output_data.mod["rna"].layers.keys() + + # Test that other data from input remains untouched + assert_annotation_objects_equal(input_file, output) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/log1p/config.vsh.yaml b/src/transform/log1p/config.vsh.yaml new file mode 100644 index 00000000..69112f9a --- /dev/null +++ b/src/transform/log1p/config.vsh.yaml @@ -0,0 +1,83 @@ +name: log1p +namespace: "transform" +scope: "public" +description: | + Logarithmize the data matrix. Computes X = log(X + 1), where log denotes the natural logarithm unless a different base is given. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is normalized" + + - name: "--output_layer" + type: string + description: Output layer to use. By default, use X. + required: false + + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + + # arguments + - name: "--base" + description: | + Base of the logarithm. Natural logarithm is used by default. + type: double + example: 2 + +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: run_test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [midmem, lowcpu] diff --git a/src/transform/log1p/run_test.py b/src/transform/log1p/run_test.py new file mode 100644 index 00000000..5103123c --- /dev/null +++ b/src/transform/log1p/run_test.py @@ -0,0 +1,119 @@ +from os import path +import mudata as mu +import numpy as np +import scanpy as sc +import pandas as pd +import sys +import pytest +import uuid +from operator import attrgetter + +## VIASH START +meta = { + "name": "lognorm", + "resources_dir": "resources_test/", + "config": "./src/transform/log1p/config.vsh.yaml", + "executable": "../../executable/docker/transform/log1p/log1p", +} + + +## VIASH END + + +@pytest.fixture +def input_data(): + return mu.read_h5mu( + f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + ).copy() + + +@pytest.fixture +def random_h5mu_path(tmp_path): + def wrapper(): + unique_filename = f"{str(uuid.uuid4())}.h5mu" + temp_file = tmp_path / unique_filename + return temp_file + + return wrapper + + +@pytest.mark.parametrize("output_layer", [None, "log_normalized"]) +@pytest.mark.parametrize("input_layer", [None, "normalized"]) +def test_1logp(run_component, input_data, output_layer, input_layer, random_h5mu_path): + output = random_h5mu_path() + if input_layer: + mod = input_data.mod["rna"] + mod.layers[input_layer] = mod.X.copy() + # Overwrite the original layer to make sure + # it is not accidentally used as input layer. + mod.X[:] = 0 + input_path = random_h5mu_path() + input_data.write(input_path) + run_args = [ + "--input", + input_path, + "--output", + output, + "--output_compresion", + "gzip", + ] + if output_layer: + run_args.extend(["--output_layer", output_layer]) + if input_layer: + run_args.extend(["--input_layer", input_layer]) + run_component(run_args) + get_output_layer = ( + attrgetter("X") + if not output_layer + else lambda x: getattr(x, "layers")[output_layer] + ) + + assert path.exists(output), "No output was created." + + mu_input = mu.read_h5mu(input_path) + mu_output = mu.read_h5mu(output) + + assert "rna" in mu_output.mod, 'Output should contain data.mod["prot"].' + assert "prot" in mu_output.mod, 'Output should contain data.mod["prot"].' + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + prot_in = mu_input.mod["prot"] + prot_out = mu_output.mod["prot"] + + assert rna_in.shape == rna_out.shape, "Should have same shape as before" + assert prot_in.shape == prot_out.shape, "Should have same shape as before" + input_layer_data = rna_in.X if not input_layer else rna_in.layers[input_layer] + assert np.mean(input_layer_data) != np.mean(get_output_layer(rna_out)), ( + "Expression should have changed" + ) + + nz_row, nz_col = input_layer_data.nonzero() + row_corr = np.corrcoef( + input_layer_data[nz_row[0], :].toarray().flatten(), + get_output_layer(rna_out)[nz_row[0], :].toarray().flatten(), + )[0, 1] + col_corr = np.corrcoef( + input_layer_data[:, nz_col[0]].toarray().flatten(), + get_output_layer(rna_out)[:, nz_col[0]].toarray().flatten(), + )[0, 1] + assert row_corr > 0.1 + assert col_corr > 0.1 + + assert "log1p" in rna_out.uns + + # Make sure that the original input layer has not been overwritten + layers_to_test = [None] + list(rna_in.layers.keys()) + for layer in layers_to_test: + if layer != output_layer: + in_data = sc.get.var_df( + rna_in, keys=rna_in.obs_names.to_list(), layer=layer + ) + out_data = sc.get.var_df( + rna_out, keys=rna_in.obs_names.to_list(), layer=layer + ) + pd.testing.assert_frame_equal(in_data, out_data) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/log1p/script.py b/src/transform/log1p/script.py new file mode 100644 index 00000000..78c962ca --- /dev/null +++ b/src/transform/log1p/script.py @@ -0,0 +1,50 @@ +import scanpy as sc +import mudata as mu +import anndata as ad +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "output.h5mu", + "base": None, + "modality": "rna", + "output_layer": "foo", +} +meta = {"name": "lognorm"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input mudata", par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) +assert data.var_names.is_unique, "Expected var_names to be unique." + +logger.info("Performing log transformation") +# Make our own copy with not a lot of data +# this avoid excessive memory usage and accidental overwrites +input_layer = data.layers[par["input_layer"]] if par["input_layer"] else data.X +data_for_scanpy = ad.AnnData(X=input_layer.copy()) +sc.pp.log1p( + data_for_scanpy, + base=par["base"], + layer=None, # use X + copy=False, +) # allow overwrites in the copy that was made + +# Scanpy will overwrite the input layer. +# So fetch input layer from the copy and use it to populate the output slot +if par["output_layer"]: + data.layers[par["output_layer"]] = data_for_scanpy.X +else: + data.X = data_for_scanpy.X +data.uns["log1p"] = data_for_scanpy.uns["log1p"].copy() + +logger.info("Writing to file %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) diff --git a/src/transform/move_layer/config.vsh.yaml b/src/transform/move_layer/config.vsh.yaml new file mode 100644 index 00000000..a7dd5e6c --- /dev/null +++ b/src/transform/move_layer/config.vsh.yaml @@ -0,0 +1,65 @@ +name: move_layer +namespace: "transform" +scope: "public" +description: "Move a data matrix stored at the .layers or .X attributes in a MuData object to another layer." +arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + description: | + Input layer to move to a new output location. If specified, will be used to select a key from .layers, + otherwise .X is used. + type: string + required: false + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + - name: "--output_layer" + description: | + Destination location for the layer. If not provided, .X will be used, + Otherwise, will be the key for the .layers attribute in the output MuData file. + type: string + required: false +__merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [ singlecpu, lowmem ] \ No newline at end of file diff --git a/src/transform/move_layer/script.py b/src/transform/move_layer/script.py new file mode 100644 index 00000000..e269f75f --- /dev/null +++ b/src/transform/move_layer/script.py @@ -0,0 +1,54 @@ +import sys +from mudata import read_h5ad +from functools import partial +from operator import setitem + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "foo.h5mu", + "modality": "rna", + "input_layer": None, + "output_layer": "test", + "output_compression": None, +} + +meta = {"resources_dir": "."} +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s mudata from file %s", par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + + +logger.info( + "Using input layer '%s'", "X" if not par["input_layer"] else par["input_layer"] +) +if par["input_layer"]: + data_to_write = mod_data.layers[par["input_layer"]].copy() + del mod_data.layers[par["input_layer"]] +else: + data_to_write = mod_data.X + mod_data.X = None + +output_layer_setter = ( + partial(setattr, mod_data, "X") + if not par["output_layer"] + else partial(setitem, mod_data.layers, par["output_layer"]) +) +output_layer_setter(data_to_write) + +logger.info( + "Writing output to file %s with compression %s", + par["output"], + par["output_compression"], +) + +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) diff --git a/src/transform/move_layer/test.py b/src/transform/move_layer/test.py new file mode 100644 index 00000000..5fddc26d --- /dev/null +++ b/src/transform/move_layer/test.py @@ -0,0 +1,101 @@ +import sys +import pytest +import pandas as pd +from anndata import AnnData +from mudata import MuData, read_h5mu + +## VIASH START +meta = { + "executable": "./target/executable/transform/move_layer/move_layer", + "config": "./src/transform/move_layer/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def test_mudata(random_h5mu_path): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + obsm = pd.DataFrame( + [["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"] + ) + ad1 = AnnData(df, obs=obs, var=var, uns={"obsm1": obsm}) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = AnnData(df, obs=obs2, var=var2) + + test_h5mu = random_h5mu_path() + mudata = MuData({"mod1": ad1, "mod2": ad2}) + mudata.write_h5mu(test_h5mu) + return test_h5mu + + +def test_move_layer(test_mudata, run_component, random_h5mu_path): + output_file = random_h5mu_path() + run_component( + [ + "--input", + test_mudata, + "--modality", + "mod1", + "--output_layer", + "test_layer", + "--output", + output_file, + ] + ) + assert output_file.is_file() + output_mudata = read_h5mu(output_file) + assert "test_layer" in output_mudata.mod["mod1"].layers + assert output_mudata.mod["mod1"].X is None + + +def test_move_layer_select_input_layer(test_mudata, run_component, random_h5mu_path): + output_file = random_h5mu_path() + run_component( + [ + "--input", + test_mudata, + "--modality", + "mod1", + "--output_layer", + "test_layer", + "--output", + output_file, + ] + ) + output_file_2 = random_h5mu_path() + run_component( + [ + "--input", + output_file, + "--modality", + "mod1", + "--input_layer", + "test_layer", + "--output_layer", + "test_layer2", + "--output", + output_file_2, + ] + ) + assert output_file_2.is_file() + output_mudata = read_h5mu(output_file_2) + assert "test_layer2" in output_mudata.mod["mod1"].layers + assert "test_layer" not in output_mudata.mod["mod1"].layers + assert output_mudata.mod["mod1"].X is None + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/normalize_total/config.vsh.yaml b/src/transform/normalize_total/config.vsh.yaml new file mode 100644 index 00000000..5fb4d870 --- /dev/null +++ b/src/transform/normalize_total/config.vsh.yaml @@ -0,0 +1,84 @@ +name: normalize_total +namespace: "transform" +scope: "public" +description: | + Normalize counts per cell. + + Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization. + + If exclude_highly_expressed=True, very highly expressed genes are excluded from the computation of the normalization factor (size factor) for each cell. This is meaningful as these can strongly influence the resulting normalized values for all other genes [Weinreb17]. +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ maintainer ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ contributor ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. By default, X is normalized" + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + + - name: "--output_layer" + type: string + description: Output layer to use. By default, use X. + required: false + + # arguments + - name: "--target_sum" + type: integer + description: If None, after normalization, each observation (cell) has a total count equal to the median of total counts for observations (cells) before normalization. + + - name: "--exclude_highly_expressed" + type: boolean_true + description: Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell. A gene is considered highly expressed, if it has more than max_fraction of the total counts in at least one cell. The not-excluded genes will sum up to target_sum. +__merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [midmem, lowcpu] diff --git a/src/transform/normalize_total/script.py b/src/transform/normalize_total/script.py new file mode 100644 index 00000000..c508e4d2 --- /dev/null +++ b/src/transform/normalize_total/script.py @@ -0,0 +1,53 @@ +import sys +import scanpy as sc +import mudata as mu + +## VIASH START +par = { + "input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu", + "output": "output.h5mu", + "target_sum": 10000, + "modality": "rna", + "exclude_highly_expressed": False, +} +meta = {"name": "lognorm"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +dat = mu.read_h5ad(par["input"], mod=par["modality"]) +assert dat.var_names.is_unique, "The var_names of the input modality must be be unique." + +logger.info(par) + +logger.info("Performing total normalization.") +if par["input_layer"] and par["input_layer"] not in dat.layers.keys(): + raise ValueError(f"Input layer {par['input_layer']} not found in {par['modality']}") +output_data = sc.pp.normalize_total( + dat, + layer=par["input_layer"], + target_sum=par["target_sum"], + copy=True if par["output_layer"] else False, +) + +if output_data: + result = ( + output_data.X + if not par["input_layer"] + else output_data.layers[par["input_layer"]] + ) + dat.layers[par["output_layer"]] = result + +logger.info( + "Writing to file to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], dat, par["output_compression"] +) diff --git a/src/transform/normalize_total/test.py b/src/transform/normalize_total/test.py new file mode 100644 index 00000000..010a3d48 --- /dev/null +++ b/src/transform/normalize_total/test.py @@ -0,0 +1,91 @@ +import sys +import pytest +import mudata as mu +import numpy as np +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "name": "lognorm", + "resources_dir": "resources_test/", + "config": "src/transform/normalize_total/config.vsh.yaml", + "executable": "target/docker/transform/normalize_total/normalize_total", +} +## VIASH END + +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +def test_run(run_component, random_h5mu_path): + output = random_h5mu_path() + cmd_pars = [ + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert output.is_file(), "No output was created." + + mu_input = mu.read_h5mu(input) + mu_output = mu.read_h5mu(output) + + assert "rna" in mu_output.mod, 'Output should contain data.mod["rna"].' + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + + assert rna_in.shape == rna_out.shape, "Should have same shape as before" + assert np.mean(rna_in.X) != np.mean(rna_out.X), "Expression should have changed" + + nz_row, nz_col = rna_in.X.nonzero() + row_corr = np.corrcoef( + rna_in.X[nz_row[0], :].toarray().flatten(), + rna_out.X[nz_row[0], :].toarray().flatten(), + )[0, 1] + col_corr = np.corrcoef( + rna_in.X[:, nz_col[0]].toarray().flatten(), + rna_out.X[:, nz_col[0]].toarray().flatten(), + )[0, 1] + assert row_corr > 0.1 + assert col_corr > 0.1 + + # Copy over X so that the rest of the objects can be compared + mu_input["rna"].X = mu_output["rna"].X + assert_annotation_objects_equal(mu_input, mu_output) + + +def test_target_sum(run_component, random_h5mu_path): + output = random_h5mu_path() + cmd_pars = [ + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", + "--target_sum", + "10000", + ] + run_component(cmd_pars) + + assert output.is_file(), "No output was created." + + mu_output = mu.read_h5mu(output) + mu_input = mu.read_h5mu(input) + rna_out = mu_output.mod["rna"] + + assert np.all(np.abs(rna_out.X.sum(axis=1) - 10000) < 1), ( + "Expression should have changed" + ) + + # Copy over X so that the rest of the object can be compared + mu_input["rna"].X = mu_output["rna"].X + assert_annotation_objects_equal(mu_input, mu_output) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/regress_out/config.vsh.yaml b/src/transform/regress_out/config.vsh.yaml new file mode 100644 index 00000000..a8b4ac6e --- /dev/null +++ b/src/transform/regress_out/config.vsh.yaml @@ -0,0 +1,72 @@ +name: regress_out +namespace: "transform" +scope: "public" +description: | + Regress out (mostly) unwanted sources of variation. + Uses simple linear regression. This is inspired by Seurat's regressOut function in R [Satija15]. + Note that this function tends to overcorrect in certain circumstances as described in issue theislab/scanpy#526. + See https://github.com/theislab/scanpy/issues/526. +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer, contributor ] +arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu + - name: "--modality" + type: string + description: "Which modality (one or more) to run this component on." + default: "rna" + - name: "--obs_keys" + type: string + multiple: true + description: Which .obs keys to regress on. + - name: "--input_layer" + type: string + required: false + example: X_normalized + description: | + The layer of the adata object to regress on. + If not provided, the X attribute of the adata object will be used. + - name: "--output_layer" + type: string + example: X_regressed + required: false + description: | + The layer of the adata object containing the regressed count data. + If not provided, the X attribute of the adata object will be used. +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: python:3.10-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu] diff --git a/src/transform/regress_out/script.py b/src/transform/regress_out/script.py new file mode 100644 index 00000000..957be276 --- /dev/null +++ b/src/transform/regress_out/script.py @@ -0,0 +1,47 @@ +import scanpy as sc +import mudata as mu +import anndata as ad +import multiprocessing +import sys + +## VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "output": "output.h5mu", + "modality": "rna", + "obs_keys": [], +} +meta = {"name": "lognorm"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) +mdata.var_names_make_unique() + +if par["obs_keys"] is not None and len(par["obs_keys"]) > 0: + mod = par["modality"] + data = mdata.mod[mod] + + # Copy required data from input data to new AnnData object to allow providing input and output layers + input_layer = data.X if not par["input_layer"] else data.layers[par["input_layer"]] + obs = data.obs.loc[:, par["obs_keys"]] + sc_data = ad.AnnData(X=input_layer.copy(), obs=obs) + + logger.info("Regress out variables on modality %s", mod) + sc.pp.regress_out( + sc_data, keys=par["obs_keys"], n_jobs=multiprocessing.cpu_count() - 1 + ) + + # Copy regressed data back to original input data + if par["output_layer"]: + data.layers[par["output_layer"]] = sc_data.X + else: + data.X = sc_data.X + +logger.info("Writing to file") +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/transform/regress_out/test.py b/src/transform/regress_out/test.py new file mode 100644 index 00000000..b2f3eeb4 --- /dev/null +++ b/src/transform/regress_out/test.py @@ -0,0 +1,103 @@ +import sys +import pytest +import mudata as mu +import numpy as np + +## VIASH START +meta = {"name": "lognorm", "resources_dir": "resources_test/"} +## VIASH END + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_data(input_path): + return mu.read_h5mu(input_path) + + +@pytest.fixture +def input_h5mu(input_data): + input_data.obs["var"] = np.random.rand(input_data.n_obs) + input_data.mod["rna"].obs["var"] = input_data.obs["var"] + input_data.mod["prot"].obs["var"] = input_data.obs["var"] + input_data.mod["rna"].layers["input"] = input_data.mod["rna"].X + return input_data + + +@pytest.fixture +def input_h5mu_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +@pytest.fixture +def output_h5mu_path(tmp_path): + return tmp_path / "output.h5mu" + + +def test_regress_out(run_component, input_h5mu_path, output_h5mu_path): + # execute command + cmd_pars = [ + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + assert output_h5mu_path.is_file(), "No output was created." + + mu_input = mu.read_h5mu(input_h5mu_path) + mu_output = mu.read_h5mu(output_h5mu_path) + + assert "rna" in mu_output.mod, 'Output should contain data.mod["prot"].' + assert "prot" in mu_output.mod, 'Output should contain data.mod["prot"].' + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + prot_in = mu_input.mod["prot"] + prot_out = mu_output.mod["prot"] + + assert rna_in.shape == rna_out.shape, "Should have same shape as before" + assert prot_in.shape == prot_out.shape, "Should have same shape as before" + + assert np.mean(rna_in.X) != np.mean(rna_out.X), "Expression should have changed" + + +def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_path): + # execute command + cmd_pars = [ + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--input_layer", + "input", + "--output_layer", + "output", + "--output_compression", + "gzip", + ] + run_component(cmd_pars) + + mu_input = mu.read_h5mu(input_h5mu_path) + mu_output = mu.read_h5mu(output_h5mu_path) + + rna_in = mu_input.mod["rna"] + rna_out = mu_output.mod["rna"] + + assert np.mean(rna_in.layers["input"]) != np.mean(rna_out.layers["output"]), ( + "RNA expression should have changed" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/scale/config.vsh.yaml b/src/transform/scale/config.vsh.yaml new file mode 100644 index 00000000..61998ba8 --- /dev/null +++ b/src/transform/scale/config.vsh.yaml @@ -0,0 +1,71 @@ +name: scale +namespace: "transform" +scope: "public" +description: | + Scale data to unit variance and zero mean. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file. + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: List of modalities to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: "string" + description: "Input layer with data to scale. Uses .X by default" + required: false + - name: "--output_layer" + type: string + description: "Output layer where scaled data will be stored. If not specified, .X will be used." + - name: "--max_value" + required: false + type: double + description: Clip (truncate) to this value after scaling. Does not clip by default. + - name: "--zero_center" + type: boolean_false + description: If set, omit zero-centering variables, which allows to handle sparse input efficiently. + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + default: output.h5mu +__merge__: [., /src/base/h5_compression_argument.yaml] + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu] diff --git a/src/transform/scale/script.py b/src/transform/scale/script.py new file mode 100644 index 00000000..eba6c1e8 --- /dev/null +++ b/src/transform/scale/script.py @@ -0,0 +1,57 @@ +import sys +from mudata import read_h5ad +import scanpy +from functools import partial +from operator import setitem + +## VIASH START +par = { + "input": "../../../resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3.h5mu", + "output": "output.h5mu", + "modality": "rna", + "max_value": None, + "zero_center": True, +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info( + "Reading modality %s from .h5mu file: %s", par["modality"], par["input"] + ) + data = read_h5ad(par["input"], mod=par["modality"]) + logger.info("Scaling modality %s.", par["modality"]) + scanpy_output = scanpy.pp.scale( + data, + layer=par["input_layer"], + zero_center=par["zero_center"], + max_value=par["max_value"], + copy=True, + ) + output_layer_setter = ( + partial(setattr, data, "X") + if not par["output_layer"] + else partial(setitem, data.layers, par["output_layer"]) + ) + output_layer_setter( + scanpy_output.X + if not par["input_layer"] + else scanpy_output.layers[par["input_layer"]] + ) + logger.info( + "Writing to %s with compression %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() diff --git a/src/transform/scale/test.py b/src/transform/scale/test.py new file mode 100644 index 00000000..83d0a372 --- /dev/null +++ b/src/transform/scale/test.py @@ -0,0 +1,195 @@ +import sys +import pytest +import numpy as np +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +## VIASH START +meta = { + "executable": "target/docker/scale/scale", + "resources_dir": "resources_test", + "config": "src/transform/scale/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + + +@pytest.fixture +def input_data(input_path): + return read_h5mu(input_path) + + +def test_scaling_input_layer( + run_component, input_data, write_mudata_to_file, random_h5mu_path +): + """ + The component must select the correct input layer. + """ + input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X.copy() + del input_data.mod["rna"].X + input_path = write_mudata_to_file(input_data) + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_file, + "--input_layer", + "test_layer", + "--ouput_compression", + "gzip", + ] + ) + + assert output_file.is_file() + output_data = read_h5mu(output_file) + output_x = output_data.mod["rna"].X + mean = np.mean(output_x, axis=0, dtype=np.float64) + variance = np.multiply(output_x, output_x).mean(axis=0, dtype=np.float64) - mean**2 + variance[variance == 0] = 1 + assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + # Remove the output layer in order to + # test equality for the other data slots + del output_data.mod["rna"].X + assert_annotation_objects_equal(input_path, output_data) + + +def test_scaling_output_layer(run_component, random_h5mu_path, input_path): + """ + Output data must create the specified output layer. + """ + output_file = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_file, + "--output_layer", + "scaled", + "--ouput_compression", + "gzip", + ] + ) + + assert output_file.is_file() + output_data = read_h5mu(output_file) + assert "scaled" in output_data.mod["rna"].layers + output_scaled = output_data.mod["rna"].layers["scaled"] + mean = np.mean(output_scaled, axis=0, dtype=np.float64) + variance = ( + np.multiply(output_scaled, output_scaled).mean(axis=0, dtype=np.float64) + - mean**2 + ) + variance[variance == 0] = 1 + assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + # Remove the output layer in order to + # test equality for the other data slots + del output_data.mod["rna"].layers["scaled"] + assert_annotation_objects_equal(input_path, output_data) + + +def test_scaling(run_component, random_h5mu_path, input_path): + """ + Output data must be centered around mean 0 and it has unit variance. + """ + output_file = random_h5mu_path() + + run_component( + ["--input", input_path, "--output", output_file, "--ouput_compression", "gzip"] + ) + + assert output_file.is_file() + output_data = read_h5mu(output_file) + output_x = output_data.mod["rna"].X + mean = np.mean(output_x, axis=0, dtype=np.float64) + variance = np.multiply(output_x, output_x).mean(axis=0, dtype=np.float64) - mean**2 + variance[variance == 0] = 1 + assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + # Restore the output layer in order to + # test equality for the other data slots + input_x = read_h5mu(input_path)["rna"].X + output_data["rna"].X = input_x + assert_annotation_objects_equal(input_path, output_data) + + +def test_scaling_noncenter(run_component, random_h5mu_path, input_path): + """ + Check if centering can be disabled. + """ + output_file = random_h5mu_path() + + run_component( + ["--input", input_path, "--output", str(output_file), "--zero_center", "false"] + ) + assert output_file.is_file() + output_data = read_h5mu(output_file) + output_x = output_data.mod["rna"].X + mean = np.mean(output_x, axis=0, dtype=np.float64) + assert not np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + # Restore the output layer in order to + # test equality for the other data slots + input_x = read_h5mu(input_path)["rna"].X + output_data["rna"].X = input_x + assert_annotation_objects_equal(input_path, output_data) + + +def test_scaling_maxvalue(run_component, random_h5mu_path, input_path): + """ + Check if output data is clipped when using --max_value + """ + output_file = random_h5mu_path() + + run_component( + ["--input", input_path, "--output", str(output_file), "--max_value", "0.5"] + ) + assert output_file.is_file() + output_data = read_h5mu(output_file) + output_x = output_data.mod["rna"].X + assert np.all(output_x <= 0.5) + # Restore the output layer in order to + # test equality for the other data slots + input_x = read_h5mu(input_path)["rna"].X + output_data["rna"].X = input_x + assert_annotation_objects_equal(input_path, output_data) + + +def test_scaling_modality(run_component, random_h5mu_path, input_path): + """ + Check if 'rna' modality remain untouched when using '--modality prot' argument. + """ + output_file = random_h5mu_path() + + run_component( + ["--input", input_path, "--output", str(output_file), "--modality", "prot"] + ) + assert output_file.is_file() + output_data = read_h5mu(output_file) + + output_prot = output_data.mod["prot"].X + mean = np.mean(output_prot, axis=0, dtype=np.float64) + variance = ( + np.multiply(output_prot, output_prot).mean(axis=0, dtype=np.float64) - mean**2 + ) + variance[variance == 0] = 1 + assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + # Restore the output layer in order to + # test equality for the other data slots + input_x = read_h5mu(input_path)["prot"].X + output_data["prot"].X = input_x + assert_annotation_objects_equal(input_path, output_data) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/tfidf/config.vsh.yaml b/src/transform/tfidf/config.vsh.yaml new file mode 100644 index 00000000..11820f13 --- /dev/null +++ b/src/transform/tfidf/config.vsh.yaml @@ -0,0 +1,104 @@ +name: tfidf +namespace: "transform" +scope: "public" +description: | + Perform TF-IDF normalization of the data (typically, ATAC). + + TF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis. + In the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells). + TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks) + across different cells while down-weighting regions that are commonly accessible across many cells. +authors: + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ maintainer ] +arguments: + # input + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "atac" + required: false + + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. By default, X is normalized" + + # output + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + + - name: "--output_layer" + type: string + description: Output layer to use. + default: "tfidf" + required: false + + # arguments + - name: "--scale_factor" + type: integer + description: Scale factor to multiply the TF-IDF matrix by. + default: 10000 + min: 1 + + - name: "--log_idf" + description: Whether to log-transform IDF term. + type: boolean + default: true + + - name: "--log_tf" + description: Whether to log-transform TF term. + type: boolean + default: true + + - name: "--log_tfidf" + description: Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False. + type: boolean + default: false + +__merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_atac_tiny_bcl/counts/ +engines: + - type: docker + image: python:3.10-slim-bullseye + setup: + - type: apt + packages: + - libhdf5-dev + - procps + - pkg-config # Otherwise h5py installation fails, which is required for scanpy + - gcc + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - muon~=0.1.5 + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, lowcpu] diff --git a/src/transform/tfidf/script.py b/src/transform/tfidf/script.py new file mode 100644 index 00000000..a4e9b341 --- /dev/null +++ b/src/transform/tfidf/script.py @@ -0,0 +1,52 @@ +import sys +import mudata +import muon + +## VIASH START +par = { + "input": "work/d9/3adbd080e0de618d44b59b1ec81685/run.output.h5mu", + "output": "output.h5mu", + "scale_factor": 10000, + "modality": "atac", + "input_layer": None, + "output_layer": None, + "output_compression": "gzip", + "log_idf": True, + "log_tf": True, + "log_tfidf": False, +} +meta = {"name": "tfidf"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mudata.read_h5ad(par["input"], mod=par["modality"]) + +logger.info(par) + +logger.info("Performing TF-IDF normalization") +input_data = adata.copy() + +muon.atac.pp.tfidf( + input_data, + log_tf=par["log_tf"], + log_idf=par["log_idf"], + log_tfidf=par["log_tfidf"], + scale_factor=par["scale_factor"], + inplace=True, + copy=False, + from_layer=par["input_layer"], + to_layer=par["output_layer"], +) + +adata.layers[par["output_layer"]] = input_data.layers[par["output_layer"]] + +logger.info("Writing to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) diff --git a/src/transform/tfidf/test.py b/src/transform/tfidf/test.py new file mode 100644 index 00000000..97772027 --- /dev/null +++ b/src/transform/tfidf/test.py @@ -0,0 +1,158 @@ +from pathlib import Path +import pytest +import sys + +import mudata as md +import numpy as np +import scanpy as sc +import muon as mu + +## VIASH START +meta = { + "executable": "./target/docker/transform/tfidf/tfidf", + "resources_dir": "./resources_test/cellranger_atac_tiny_bcl/counts/", + "config": "./src/transform/tfidf/config.vsh.yaml", + "cpus": 2, +} +## VIASH END + + +@pytest.fixture +def synthetic_example(): + atac = sc.AnnData( + np.array([[0, 0, 0], [1, 0, 1], [10, 0, 0], [100, 0, 1], [1000, 0, 0]]) + ) + atac.obs_names = ["A", "B", "C", "D", "E"] + atac.var_names = ["x", "y", "z"] + + return md.MuData({"atac": atac}) + + +@pytest.fixture +def example_mudata(tmp_path, synthetic_example): + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def example_mudata_with_layer(tmp_path, synthetic_example): + synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod[ + "atac" + ].X.copy() + synthetic_example.mod["atac"].X = np.random.normal( + size=synthetic_example.mod["atac"].X.shape + ) + mdata_path = tmp_path / "example.h5mu" + synthetic_example.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def neurips_mudata(tmp_path): + """From the `NeurIPS Multimodal Single-Cell Integration Challenge + ` + + Link is taken from the Moscot repository: + https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 + """ + adata = sc.read( + "../data/neurips_data.h5ad", + backup_url="https://figshare.com/ndownloader/files/37993503", + ) + + mdata = md.MuData({"atac": adata}) + mdata_path = tmp_path / "neurips.h5mu" + mdata.write(mdata_path) + + return mdata_path + + +@pytest.fixture +def tiny_atac_mudata(tmp_path): + resources_dir = Path(meta["resources_dir"]) + + mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5") + mdata_path = tmp_path / "tiny_atac.h5mu" + mdata.write(mdata_path) + + return mdata_path + + +@pytest.mark.parametrize( + "mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"] +) +def test_output_layer(run_component, request, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", + ] + + run_component(args) + assert output_path.is_file() + output_mdata = md.read(output_path) + + assert "tfidf" in output_mdata.mod["atac"].layers.keys() + + +@pytest.mark.parametrize("mudata", ["example_mudata"]) +def test_calculations_correctness(request, run_component, mudata, tmp_path): + input_path = request.getfixturevalue(mudata) + output_path = tmp_path / "foo.h5mu" + + args = [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", + ] + + run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"]) + assert output_path.is_file() + output_mdata = md.read(output_path) + + assert np.allclose( + output_mdata.mod["atac"].layers["tfidf_10000"].toarray(), + np.array( + [ + [np.nan, np.nan, np.nan], + [0.0382461, 0.0, 10.67027475], + [0.04135813, 0.0, 0.0], + [0.04131346, 0.0, 5.7693107], + [0.04135813, 0.0, 0.0], + ] + ), + equal_nan=True, + ) + + run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"]) + output_mdata = md.read(output_path) + + assert np.allclose( + output_mdata.mod["atac"].layers["tfidf_100"].toarray(), + np.array( + [ + [np.nan, np.nan, np.nan], + [0.01765529, 0.0, 4.92564555], + [0.02072352, 0.0, 0.0], + [0.02067929, 0.0, 0.86213192], + [0.02072352, 0.0, 0.0], + ] + ), + equal_nan=True, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/utils/compress_h5mu.py b/src/utils/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/src/utils/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/src/utils/cross_check_genes.py b/src/utils/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/src/utils/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/src/utils/find_non_ascii.sh b/src/utils/find_non_ascii.sh new file mode 100644 index 00000000..889f8c5c --- /dev/null +++ b/src/utils/find_non_ascii.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +find src -type f -exec grep --color='auto' -P -n "[\x80-\xFF]" {} + \ No newline at end of file diff --git a/src/utils/set_var_index.py b/src/utils/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/src/utils/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/src/utils/setup_logger.py b/src/utils/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/src/utils/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/src/utils/subset_vars.py b/src/utils/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/src/utils/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/src/velocity/scvelo/config.vsh.yaml b/src/velocity/scvelo/config.vsh.yaml new file mode 100644 index 00000000..8a0f87ae --- /dev/null +++ b/src/velocity/scvelo/config.vsh.yaml @@ -0,0 +1,126 @@ +name: scvelo +namespace: "velocity" +scope: "public" +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + description: "Input MuData file" + type: file + direction: input + required: true + - name: "--counts_layer" + type: string + description: "Name of the counts layer, if not specified, X is used." + required: false + - name: "--modality" + description: Input modality + required: true + type: string + - name: "--layer_spliced" + description: | + Name of the layer to store the spliced abundances in. Will be used as key in the .layer attribute of the + output MuData object. + type: string + required: false + default: "spliced" + - name: "--layer_unspliced" + description: | + Name of the layer to store the unspliced abundances in. + Will be used as key in the .layer attribute of the output MuData object. + type: string + required: false + default: "unspliced" + - name: "--layer_ambiguous" + description: | + Name of the layer to store the abundances in for which no fate was determined. + Will be used as key in the .layer attribute of the output MuData object. + type: string + required: false + default: "ambiguous" + - name: Outputs + arguments: + - name: "--output" + required: true + type: file + direction: output + description: "Output directory. If it does not exist, will be created." + - name: "--output_h5mu" + required: true + type: file + direction: output + description: "Output mudata file." + __merge__: [., /src/base/h5_compression_argument.yaml] + + - name: "Filtering and normalization" + description: Arguments for filtering, normalization an log transform (see scvelo.pp.filter_and_normalize function) + arguments: + - name: --min_counts + description: Minimum number of counts required for a gene to pass filtering (spliced). + type: integer + - name: --min_counts_u + description: Minimum number of counts required for a gene to pass filtering (unspliced). + type: integer + - name: --min_cells + description: Minimum number of cells expressed required to pass filtering (spliced). + type: integer + - name: --min_cells_u + description: Minimum number of cells expressed required to pass filtering (unspliced). + type: integer + - name: "--min_shared_counts" + description: "Minimum number of counts (both unspliced and spliced) required for a gene." + type: integer + - name: "--min_shared_cells" + description: "Minimum number of cells required to be expressed (both unspliced and spliced)." + type: integer + - name: "--n_top_genes" + description: "Number of genes to keep." + type: integer + - name: "--log_transform" + type: boolean + default: true + description: "Do not log transform counts." + - name: Fitting parameters + description: Arguments for fitting the data + arguments: + - name: --n_principal_components + description: Number of principal components to use for calculating moments. + type: integer + - name: --n_neighbors + description: | + Number of neighbors to use. First/second-order moments are computed for each + cell across its nearest neighbors, where the neighbor graph is obtained from + euclidean distances in PCA space. + type: integer + default: 30 +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/rna_velocity/velocyto_processed/velocyto.h5mu +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - git + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - scvelo~=0.3.3 + - scipy~=1.14.1 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [ highmem, highcpu ] \ No newline at end of file diff --git a/src/velocity/scvelo/script.py b/src/velocity/scvelo/script.py new file mode 100644 index 00000000..2a244944 --- /dev/null +++ b/src/velocity/scvelo/script.py @@ -0,0 +1,177 @@ +import sys +import mudata +import anndata +import tempfile +import shutil +from contextlib import redirect_stdout +from pathlib import Path +import matplotlib as mpl +import scvelo + +## VIASH START +from collections import defaultdict + + +def none_factory(): + return None + + +par = defaultdict( + none_factory, + { + "input": "resources_test/rna_velocity/velocyto_processed/velocyto.h5mu", + "modality": "velocyto", + "output": "./foo", + "output_h5mu": "output.h5mu", + "log_transform": True, + "n_neighbors": 30, + "layer_spliced": "velo_spliced", + "layer_unspliced": "velo_unspliced", + "layer_ambiguous": "velo_ambiguous", + }, +) + +meta = {"resources_dir": "src/utils", "temp_dir": "/tmp/"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + +mpl.rcParams["savefig.dpi"] = 150 + + +# Script must be wrapped into a main function because scvelo spawn subprocesses +# and this fails when the functions are not wrapped. +def main(): + # Create output directory + output_dir = Path(par["output"]) + output_dir.mkdir(parents=True, exist_ok=True) + scvelo.settings.figdir = str(output_dir) + + # Load the input data + adata_in = mudata.read_h5ad(par["input"], mod=par["modality"]) + + # Create a copy of the data as input + # as many scvelo functions do not take input layer arguments + layers_mapping = { + "spliced": par["layer_spliced"], + "unspliced": par["layer_unspliced"], + "ambiguous": par["layer_ambiguous"], + } + layer_data = { + default: (adata_in.layers.get(arg_val) if arg_val else adata_in.layers[default]) + for default, arg_val in layers_mapping.items() + } + adata = anndata.AnnData( + X=adata_in.X + if not par["counts_layer"] + else adata_in.layers[par["counts_layer"]], + layers=layer_data, + ) + + # Calculate the sample name + sample_name = par["output"].removesuffix(".h5mu") + sample_name = Path(sample_name).name + + # Read the input data + + # Save spliced vs unspliced proportions to file + with (output_dir / "proportions.txt").open("w") as target: + with redirect_stdout(target): + scvelo.utils.show_proportions(adata) + + # Plot piecharts of spliced vs unspliced proportions + scvelo.pl.proportions(adata, save=True, show=False) + + # Perform preprocessing + scvelo.pp.filter_and_normalize( + adata, + min_counts=par["min_counts"], + min_counts_u=par["min_counts_u"], + min_cells=par["min_cells"], + min_cells_u=par["min_cells_u"], + min_shared_counts=par["min_shared_counts"], + min_shared_cells=par["min_shared_cells"], + n_top_genes=par["n_top_genes"], + log=par["log_transform"], + ) + + # Fitting + scvelo.pp.moments( + adata, n_pcs=par["n_principal_components"], n_neighbors=par["n_neighbors"] + ) + + # Second step in velocyto calculations + # Velocity calculation and visualization + # From the scvelo manual: + # The solution to the full dynamical model is obtained by setting mode='dynamical', + # which requires to run scv.tl.recover_dynamics(adata) beforehand + scvelo.tl.recover_dynamics(adata) + scvelo.tl.velocity(adata, mode="dynamical") + scvelo.tl.velocity_graph(adata) + scvelo.pl.velocity_graph( + adata, save=str(output_dir / "scvelo_graph.pdf"), show=False + ) + + # Plotting + # TODO: add more here. + scvelo.pl.velocity_embedding_stream( + adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False + ) + + # Copy over slots to output + for slot in ("obs", "var"): + setattr( + adata_in, + slot, + getattr(adata_in, slot) + .assign(**getattr(adata, slot).to_dict()) + .convert_dtypes(), + ) + items_per_slot = { + "uns": ( + "recover_dynamics", + "velocity_params", + "velocity_graph", + "velocity_graph_neg", + ), + "varm": ("loss",), + "obsm": ("velocity_pca",), + "layers": ( + "Ms", + "Mu", + "fit_t", + "fit_tau", + "fit_tau_", + "velocity", + "velocity_u", + ), + } + for dict_slot, dict_items in items_per_slot.items(): + setattr( + adata_in, + dict_slot, + dict( + getattr(adata_in, dict_slot), + **{key_: getattr(adata, dict_slot)[key_] for key_ in dict_items}, + ), + ) + with tempfile.NamedTemporaryFile( + suffix=".h5mu", delete_on_close=False + ) as temp_h5mu: + shutil.copyfile(par["input"], temp_h5mu.name) + # Create output + mudata.write_h5ad(temp_h5mu.name, mod=par["modality"], data=adata_in) + compression = par["output_compression"] + + if compression: + compress_h5mu(temp_h5mu.name, par["output_h5mu"], compression=compression) + else: + shutil.move(temp_h5mu.name, par["output_h5mu"]) + + +if __name__ == "__main__": + main() diff --git a/src/velocity/scvelo/test.py b/src/velocity/scvelo/test.py new file mode 100644 index 00000000..ea761824 --- /dev/null +++ b/src/velocity/scvelo/test.py @@ -0,0 +1,77 @@ +import sys +import pytest +from mudata import read_h5mu + +## VIASH START +meta = { + "config": "src/velocity/scvelo/config.vsh.yaml", + "executable": "./target/executable/velocity/scvelo/scvelo", + "resources_dir": "resources_test/rna_velocity/velocyto_processed/", +} +## VIASH END + +input_h5mu = f"{meta['resources_dir']}/velocyto.h5mu" + + +def test_scvelo(run_component, tmp_path): + output_dir = tmp_path / "foo" + output_h5mu = tmp_path / "output.h5mu" + run_component( + [ + "--input", + input_h5mu, + "--modality", + "velocyto", + "--output", + str(output_dir), + "--output_h5mu", + output_h5mu, + "--layer_spliced", + "velo_spliced", + "--layer_unspliced", + "velo_unspliced", + "--layer_ambiguous", + "velo_ambiguous", + "--output_compression", + "gzip", + ] + ) + + assert output_dir.is_dir() + assert (output_dir / "scvelo_proportions.pdf").is_file() + assert (output_dir / "scvelo_embedding.pdf").is_file() + assert (output_dir / "scvelo_graph.pdf").is_file() + assert (output_dir / "proportions.txt").is_file() + assert output_h5mu.is_file() + + output_data = read_h5mu(output_h5mu) + items_per_slot = { + "uns": ( + "recover_dynamics", + "velocity_params", + "velocity_graph", + "velocity_graph_neg", + ), + "varm": ("loss",), + "obsm": ("velocity_pca",), + "layers": ( + "Ms", + "Mu", + "fit_t", + "fit_tau", + "fit_tau_", + "velocity", + "velocity_u", + ), + } + for dict_slot, dict_items in items_per_slot.items(): + for dict_item in dict_items: + assert ( + getattr(output_data.mod["velocyto"], dict_slot).get(dict_item) + is not None + ), f"Expected {dict_item} to be present in {dict_slot}" + del getattr(output_data.mod["velocyto"], dict_slot)[dict_item] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/velocity/velocyto/config.vsh.yaml b/src/velocity/velocyto/config.vsh.yaml new file mode 100644 index 00000000..66e562a8 --- /dev/null +++ b/src/velocity/velocyto/config.vsh.yaml @@ -0,0 +1,81 @@ +name: velocyto +namespace: "velocity" +scope: "public" +description: "Runs the velocity analysis on a BAM file, outputting a loom file." +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] +arguments: + - name: "--input" + alternatives: [-i] + type: file + direction: input + description: "Path to BAM file" + required: true + - name: "--transcriptome" + alternatives: [-t] + type: file + direction: input + description: "Path to GTF file" + required: true + - name: "--barcode" + alternatives: [-b] + type: file + direction: input + description: | + Valid barcodes file, to filter the bam. If --bcfile is not specified all the cell barcodes will be included. + Cell barcodes should be specified in the bcfile as the 'CB' tag for each read + required: false + - name: "--without_umi" + type: boolean_true + description: "foo" + - name: "--output" + alternatives: [-o] + required: true + type: file + direction: output + description: "Velocyto loom file" + - name: "--logic" + alternatives: ["-l"] + type: string + required: false + choices: ["Default", "Permissive10X", "Intermediate10X", "ValidatedIntrons10X", "Stricter10X", "ObservedSpanning10X", "Discordant10X", "SmartSeq2"] + default: Default + description: "The logic to use for the filtering." +resources: + - type: bash_script + path: script.sh +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + - path: /resources_test/rna_velocity + - path: /resources_test/reference_gencodev41_chr1 + +engines: + - type: docker + image: python:3.10-slim-bookworm + setup: + - type: apt + packages: + - procps + - build-essential + - file + - type: python + pip: + - numpy<2 + - Cython + - type: python + pip: + - velocyto + - type: apt + packages: + - samtools + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, lowcpu] \ No newline at end of file diff --git a/src/velocity/velocyto/script.sh b/src/velocity/velocyto/script.sh new file mode 100644 index 00000000..2420647c --- /dev/null +++ b/src/velocity/velocyto/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_input="resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" +par_transcriptome="resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf" +par_output="./foo/output.loom" +par_barcode="" +## VIASH END + +extra_params=( ) + +if [ ! -z "$par_barcode" ]; then + extra_params+=( "--bcfile=$par_barcode" ) +fi +if [ "$par_without_umi" == "true" ]; then + extra_params+=( "--without-umi" ) +fi +if [ ! -z "$meta_cpus" ]; then + extra_params+=( "--samtools-threads" "$meta_cpus" ) +fi +if [ ! -z "$meta_memory_mb" ]; then + extra_params+=( "--samtools-memory" "$meta_memory_mb" ) +fi + +output_dir=`dirname "$par_output"` +sample_id=`basename "$par_output" .loom` + +if (file `readlink -f "$par_transcriptome"` | grep -q compressed ) ; then + # create temporary directory + tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + zcat "$par_transcriptome" > "$tmpdir/genes.gtf" + par_transcriptome="$tmpdir/genes.gtf" +fi + +velocyto run \ + "$par_input" \ + "$par_transcriptome" \ + "${extra_params[@]}" \ + --outputfolder "$output_dir" \ + --sampleid "$sample_id" diff --git a/src/velocity/velocyto/test.py b/src/velocity/velocyto/test.py new file mode 100644 index 00000000..b21c416d --- /dev/null +++ b/src/velocity/velocyto/test.py @@ -0,0 +1,117 @@ +import pytest +import sys +import gzip +import shutil +import loompy +import pysam + +## VIASH START +meta = { + "name": "./target/executable/projection/velocyto/velocyto", + "resources_dir": "./resources_test/", +} +## VIASH END + +# input data for bd bam +input_bam_bd = f"{meta['resources_dir']}/rna_velocity/velocyto/compatible_bd_input.bam" +input_gtf_bd = f"{meta['resources_dir']}/reference_gencodev41_chr1/reference.gtf.gz" +input_barcodes_bd = f"{meta['resources_dir']}/rna_velocity/velocyto/barcodes.txt" + +# input data for 10x bam +input_bam_cellranger = ( + f"{meta['resources_dir']}/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" +) +input_gtf_cellranger = f"{meta['resources_dir']}/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" + + +def test_velocyto_cellranger(run_component, tmp_path): + """Check whether component accepts compressed gtf files""" + + output_file = tmp_path / "foo" / "velocyto.loom" + + run_component( + [ + "--input", + input_bam_cellranger, + "--transcriptome", + input_gtf_cellranger, + "--output", + str(output_file), + ] + ) + assert output_file.is_file() + + input_barcodes = set() + with pysam.AlignmentFile(input_bam_cellranger, "r") as input_bam: + for read in input_bam: + tags = dict(read.tags) + cell_barcode = tags.get("CB") + if cell_barcode: + input_barcodes.add(cell_barcode.removesuffix("-1")) + with loompy.connect(output_file) as ds: + result_barcodes = { + tag.removeprefix("velocyto:").removesuffix("x") for tag in ds.ca.CellID + } + assert result_barcodes.issubset(input_barcodes) + assert ds.ca.keys() == ["CellID"] + assert ds.ra.keys(), [ + "Accession", + "Chromosome", + "End", + "Gene", + "Start" == "Strand", + ] + rows, cols = ds.shape + assert rows > 0 + assert cols > 0 + + +def test_velocyto_bd_rhapsody(run_component, tmp_path): + """Check whether component also accepts uncompressed gtf files""" + + output_file = tmp_path / "foo" / "velocyto.loom" + transcriptome = tmp_path / "genes.gtf" + + with open(transcriptome, "wb") as gtf_uncompressed: + with gzip.open(input_gtf_bd, "rb") as gtf_compressed: + shutil.copyfileobj(gtf_compressed, gtf_uncompressed) + + run_component( + [ + "--input", + input_bam_bd, + "--transcriptome", + str(transcriptome), + "--output", + str(output_file), + "--barcode", + input_barcodes_bd, + ] + ) + assert output_file.is_file() + + input_barcodes = set() + with open(input_barcodes_bd, "r") as barcodes_file: + for barcode in barcodes_file: + input_barcodes.add(barcode.strip()) + + with loompy.connect(output_file) as ds: + result_barcodes = { + tag.removeprefix("velocyto:").removesuffix("x") for tag in ds.ca.CellID + } + assert result_barcodes.issubset(input_barcodes) + assert ds.ca.keys() == ["CellID"] + assert ds.ra.keys(), [ + "Accession", + "Chromosome", + "End", + "Gene", + "Start" == "Strand", + ] + rows, cols = ds.shape + assert rows > 0 + assert cols > 0 + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/velocity/velocyto_to_h5mu/config.vsh.yaml b/src/velocity/velocyto_to_h5mu/config.vsh.yaml new file mode 100644 index 00000000..6ad91a8f --- /dev/null +++ b/src/velocity/velocyto_to_h5mu/config.vsh.yaml @@ -0,0 +1,76 @@ +name: "velocyto_to_h5mu" +namespace: "convert" +scope: "public" +description: | + Convert a velocyto loom file to a h5mu file. + + If an input h5mu file is also provided, the velocity + h5ad object will get added to that h5mu instead. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input_loom" + type: file + description: Path to the input loom file. + example: input.loom + required: true + - name: "--input_h5mu" + type: file + description: If a MuData file is provided, + example: input.h5mu + - name: "--modality" + type: string + description: The name of the modality to operate on. + default: rna_velocity + - name: Outputs + arguments: + - name: "--output" + type: file + description: Path to the output MuData file. + example: output.h5mu + direction: output + - name: "--layer_spliced" + type: string + description: Output layer for the spliced reads. + default: velo_spliced + - name: "--layer_unspliced" + type: string + description: Output layer for the unspliced reads. + default: velo_unspliced + - name: "--layer_ambiguous" + type: string + description: Output layer for the ambiguous reads. + default: velo_ambiguous + __merge__: [., /src/base/h5_compression_argument.yaml] +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/cellranger_tiny_fastq + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - loompy + +runners: +- type: executable +- type: nextflow + directives: + label: [lowmem, lowcpu] \ No newline at end of file diff --git a/src/velocity/velocyto_to_h5mu/script.py b/src/velocity/velocyto_to_h5mu/script.py new file mode 100644 index 00000000..53a96c11 --- /dev/null +++ b/src/velocity/velocyto_to_h5mu/script.py @@ -0,0 +1,54 @@ +import sys +import anndata as ad +import mudata as mu +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.string_ = np.bytes_ +numpy_module.unicode_ = np.str_ +sys.modules["numpy"] = numpy_module + +## VIASH START +par = { + "input_loom": "resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom", + "input_h5mu": "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu", + "modality": "rna_velocity", + "output": "output.h5mu", + "layer_spliced": "velo_spliced", + "layer_unspliced": "velo_unspliced", + "layer_ambiguous": "velo_ambiguous", +} +## VIASH END + +print("Parameters:", par, flush=True) + +print("Reading AnnData from loom", flush=True) +adata_in = ad.read_loom(par["input_loom"]) +adata_in.var_names = adata_in.var["Accession"] + +print("Creating clean AnnData", flush=True) +adata = ad.AnnData( + X=adata_in.X, + obs=adata_in.obs[[]], + var=adata_in.var[[]], + layers={ + par["layer_spliced"]: adata_in.layers["spliced"], + par["layer_unspliced"]: adata_in.layers["unspliced"], + par["layer_ambiguous"]: adata_in.layers["ambiguous"], + }, +) + +if par["input_h5mu"]: + print("Received input h5mu to read", flush=True) + mdata = mu.read_h5mu(par["input_h5mu"]) + + print(f"Storing AnnData in modality {par['modality']}", flush=True) + mdata.mod[par["modality"]] = adata +else: + print("Creating h5mu from scratch", flush=True) + mdata = mu.MuData({par["modality"]: adata}) + +print("Resulting mudata:", mdata, flush=True) + +print("Writing h5mu to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/velocity/velocyto_to_h5mu/test.py b/src/velocity/velocyto_to_h5mu/test.py new file mode 100644 index 00000000..1ccfb6b9 --- /dev/null +++ b/src/velocity/velocyto_to_h5mu/test.py @@ -0,0 +1,56 @@ +import sys +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.string_ = np.bytes_ +numpy_module.unicode_ = np.str_ +sys.modules["numpy"] = numpy_module + +import subprocess +import pathlib +import mudata +import loompy + +## VIASH START +meta = { + "name": "./target/native/convert/from_velocyto_to_h5mu/from_velocyto_to_h5mu", + "resources_dir": "./resources_test/", +} +## VIASH END + +tiny_fastq = pathlib.Path(meta["resources_dir"]) / "cellranger_tiny_fastq" +input_loom = tiny_fastq / "velocyto.loom" +input_h5mu = tiny_fastq / "raw_dataset.h5mu" +output = pathlib.Path("output.h5mu") + +print(f"Running {meta['name']}", flush=True) +subprocess.run( + args=[ + meta["executable"], + "--input_loom", + input_loom, + "--input_h5mu", + input_h5mu, + "--output", + output, + "--output_compression", + "gzip", + ], + check=True, +) + +print("Checking whether output exists", flush=True) +assert output.is_file() + +print("Reading output file", flush=True) +output_data = mudata.read_h5mu(output) + +print("Checking contents", flush=True) +assert list(output_data.mod.keys()) == ["rna", "rna_velocity"] + +with loompy.connect(input_loom) as ds: + mshape = output_data.mod["rna_velocity"].shape[::-1] + lshape = ds.shape + assert mshape == lshape, ( + f"Expected mudata shape {mshape} to be the same the loom shape {lshape}" + ) diff --git a/src/workflows/.gitignore b/src/workflows/.gitignore new file mode 100644 index 00000000..1415cce4 --- /dev/null +++ b/src/workflows/.gitignore @@ -0,0 +1 @@ +*.dot \ No newline at end of file diff --git a/src/workflows/annotation/harmony_knn/config.vsh.yaml b/src/workflows/annotation/harmony_knn/config.vsh.yaml new file mode 100644 index 00000000..8c36bb88 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/config.vsh.yaml @@ -0,0 +1,201 @@ +name: "harmony_knn" +namespace: "workflows/annotation" +scope: "public" +description: "Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer." +info: + name: "Harmony integration followed by KNN label transfer" + test_dependencies: + - name: harmony_knn_test + namespace: test_workflows/annotation +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Query Input + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference. + example: input.h5mu + - name: "--modality" + description: Which modality to process. Should match the modality of the --reference dataset. + type: string + default: "rna" + required: false + - name: "--input_layer" + description: The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. + required: false + type: string + - name: "--input_obs_batch_label" + type: string + description: "The .obs field in the input (query) dataset containing the batch labels." + example: "sample" + required: true + - name: "--input_var_gene_names" + type: string + required: false + description: | + The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + - name: "--overwrite_existing_key" + type: boolean_true + description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. + + - name: Reference input + arguments: + - name: "--reference" + required: true + type: file + description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset. + example: reference.h5mu + - name: "--reference_layer" + description: The layer of the reference dataset to process if .X is not to be used. Should contain log normalized counts. + required: false + type: string + - name: "--reference_obs_target" + type: string + example: cell_type + required: true + description: The `.obs` key of the target cell type labels to transfer. + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The .var field in the reference dataset containing gene names; if not provided, the .var index will be used. + - name: "--reference_obs_batch_label" + type: string + description: "The .obs field in the reference dataset containing the batch labels." + example: "sample" + required: true + + - name: "HVG subset arguments" + arguments: + - name: "--n_hvg" + type: integer + default: 2000 + description: | + Number of highly variable genes to subset for. + + - name: PCA options + arguments: + - name: "--pca_num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. + + - name: Harmony integration options + arguments: + - name: "--harmony_theta" + type: double + description: | + Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --input_obs_batch_label. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + min: 0 + default: [2] + multiple: true + + - name: Leiden clustering options + arguments: + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + min: 0 + default: [1] + multiple: true + + - name: Neighbor classifier arguments + arguments: + - name: "--knn_weights" + type: string + default: "uniform" + choices: ["uniform", "distance"] + description: | + Weight function used in prediction. Possible values are: + `uniform` (all points in each neighborhood are weighted equally) or + `distance` (weight points by the inverse of their distance) + - name: "--knn_n_neighbors" + type: integer + default: 15 + min: 5 + required: false + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. + + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference. + example: output.h5mu + - name: "--output_obs_predictions" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the predicted cell labels. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix. + - name: "--output_obs_probability" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the probability of the predictions. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix. + - name: "--output_obsm_integrated" + type: string + default: "X_integrated_harmony" + required: false + description: "In which .obsm slot to store the integrated embedding." + - name: "--output_compression" + type: string + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + +dependencies: + - name: workflows/integration/harmony_leiden + alias: harmony_leiden_workflow + - name: labels_transfer/knn + - name: workflows/multiomics/split_h5mu + - name: dataflow/concatenate_h5mu + - name: dimred/pca + - name: feature_annotation/align_query_reference + - name: feature_annotation/highly_variable_features_scanpy + - name: transform/delete_layer + - name: workflows/multiomics/split_modalities + - name: dataflow/merge + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +runners: + - type: nextflow diff --git a/src/workflows/annotation/harmony_knn/integration_test.sh b/src/workflows/annotation/harmony_knn/integration_test.sh new file mode 100755 index 00000000..e8597bfe --- /dev/null +++ b/src/workflows/annotation/harmony_knn/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/annotation/harmony_knn/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ diff --git a/src/workflows/annotation/harmony_knn/main.nf b/src/workflows/annotation/harmony_knn/main.nf new file mode 100644 index 00000000..935ab3a2 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/main.nf @@ -0,0 +1,259 @@ +workflow run_wf { + take: + input_ch + + main: + + modalities_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Align query and reference datasets + | align_query_reference.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "reference": "reference", + "reference_layer": "reference_layer", + "reference_obs_batch": "reference_obs_batch_label", + "reference_obs_label": "reference_obs_target", + "reference_var_gene_names": "reference_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "overwrite_existing_key": "overwrite_existing_key" + ], + args: [ + "input_id": "query", + "reference_id": "reference", + "output_layer": "_counts", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "output_var_common_genes": "_common_vars" + ], + toState: [ + "input": "output_query", + "reference": "output_reference" + ] + ) + + | split_modalities.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + + rna_ch = modalities_ch + + | filter{id, state -> state.modality == "rna"} + + // Concatenate query and reference datasets prior to integration + // Only concatenate rna modality in this channel + | concatenate_h5mu.run( + fromState: { id, state -> [ + "input": [state.input, state.reference] + ] + }, + args: [ + "input_id": ["query", "reference"], + "modality": "rna", + "other_axis_mode": "move" + ], + toState: ["input": "output"] + ) + | view {"After concatenation: $it"} + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "modality": "modality", + "n_top_features": "n_hvg" + ], + args: [ + "layer": "_counts", + "var_input": "_common_vars", + "var_name_filter": "_common_hvg", + "obs_batch_key": "_sample_id" + ], + toState: [ + "input": "output" + ] + ) + | pca.run( + fromState: [ + "input": "input", + "modality": "modality", + "overwrite": "overwrite_existing_key", + "num_compontents": "pca_num_components" + ], + args: [ + "layer": "_counts", + "var_input": "_common_hvg", + "obsm_output": "X_pca_query_reference", + "varm_output": "pca_loadings_query_reference", + "uns_output": "pca_variance_query_reference", + ], + toState: [ + "input": "output" + ] + ) + | delete_layer.run( + key: "delete_aligned_lognormalized_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_counts", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Run harmony integration with leiden clustering + | harmony_leiden_workflow.run( + fromState: { id, state -> [ + "id": id, + "input": state.input, + "modality": state.modality, + "obsm_integrated": state.output_obsm_integrated, + "theta": state.harmony_theta, + "leiden_resolution": state.leiden_resolution, + ]}, + args: [ + "embedding": "X_pca_query_reference", + "uns_neighbors": "harmonypy_integration_neighbors", + "obsp_neighbor_distances": "harmonypy_integration_distances", + "obsp_neighbor_connectivities": "harmonypy_integration_connectivities", + "obs_cluster": "harmony_integration_leiden", + "obsm_umap": "X_leiden_harmony_umap", + "obs_covariates": "_sample_id" + ], + toState: ["input": "output"] + ) + | view {"After integration: $it"} + // Split integrated dataset back into a separate reference and query dataset + | split_h5mu.run( + fromState: [ + "input": "input", + "modality": "modality" + ], + args: [ + "obs_feature": "_dataset", + "output_files": "sample_files.csv", + "drop_obs_nan": "true", + "output": "ref_query" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ], + auto: [ publish: true ] + ) + | view {"After sample splitting: $it"} + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + if (workflow.stubRun) { + def output_files = outputDir.listFiles() + def new_state = state + [ + "input": output_files[0], + "reference": output_files[1], + ] + return [id, new_state] + } + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["input": integrated_query, "reference": integrated_reference] + [id, state + newKeys] + } + // remove keys from split files + | map {id, state -> + def keysToRemove = ["output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + // Perform KNN label transfer from integrated reference to integrated query + | knn.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_target", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output_compression": "output_compression", + "weights": "knn_weights", + "n_neighbors": "knn_n_neighbors", + "output": "workflow_output" + ], + toState: ["input": "output"] + // toState: {id, output, state -> ["output": output.output]} + ) + | view {"After processing RNA modality: $it"} + + other_mod_ch = modalities_ch + | filter{id, state -> state.modality != "rna"} + + output_ch = rna_ch.mix(other_mod_ch) + | groupTuple(by: 0, sort: "hash") + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "reference"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | merge.run( + fromState: ["input": "input"], + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/annotation/harmony_knn/nextflow.config b/src/workflows/annotation/harmony_knn/nextflow.config new file mode 100644 index 00000000..059100c4 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/harmony_knn/test.nf b/src/workflows/annotation/harmony_knn/test.nf new file mode 100644 index 00000000..cbe7833f --- /dev/null +++ b/src/workflows/annotation/harmony_knn/test.nf @@ -0,0 +1,66 @@ +nextflow.enable.dsl=2 + +include { harmony_knn } from params.rootDir + "/target/nextflow/workflows/annotation/harmony_knn/main.nf" +include { harmony_knn_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/main.nf" +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer: "log_normalized", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + input_obs_batch_label: "sample_id", + reference_layer: "log_normalized", + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + leiden_resolution: [1.0, 0.25] + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer: "log_normalized", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + input_obs_batch_label: "sample_id", + reference_layer: "log_normalized", + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | harmony_knn + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | harmony_knn_test.run( + fromState: [ + "input": "output" + ] + ) + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } + } \ No newline at end of file diff --git a/src/workflows/annotation/scanvi_scarches/config.vsh.yaml b/src/workflows/annotation/scanvi_scarches/config.vsh.yaml new file mode 100644 index 00000000..25529e1a --- /dev/null +++ b/src/workflows/annotation/scanvi_scarches/config.vsh.yaml @@ -0,0 +1,212 @@ +name: "scanvi_scarches" +namespace: "workflows/annotation" +scope: "public" +description: "Cell type annotation workflow using ScanVI with scArches for reference mapping." +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +info: + name: "scANVI - scArches workflow" + test_dependencies: + - name: scanvi_scarches_test + namespace: test_workflows/annotation + +argument_groups: + - name: Query Input + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference. + example: input.h5mu + - name: "--modality" + description: Which modality to process. Should match the modality of the --reference dataset. + type: string + default: "rna" + required: false + - name: "--layer" + type: string + description: Which layer to use for integration if .X is not to be used. Should match the layer of the --reference dataset. + - name: "--input_obs_batch_label" + type: string + description: "The .obs field in the input (query) dataset containing the batch labels." + example: "sample" + required: true + - name: "--input_obs_size_factor" + type: string + required: false + description: | + Key in adata.obs for size factor information. Instead of using library size as a size factor, + the provided size factor column will be used as offset in the mean of the likelihood. + Assumed to be on linear scale. + - name: "--input_var_gene_names" + type: string + required: false + description: ".var column containing gene names. By default, use the index." + + - name: Reference input + arguments: + - name: "--reference" + required: true + type: file + description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset. + example: reference.h5mu + - name: "--reference_obs_target" + type: string + example: cell_type + required: true + description: The `.obs` key containing the target labels. + - name: "--reference_obs_batch_label" + type: string + description: "The .obs field in the reference dataset containing the batch labels." + example: "sample" + required: true + - name: "--reference_obs_size_factor" + type: string + required: false + description: | + Key in adata.obs for size factor information. Instead of using library size as a size factor, + the provided size factor column will be used as offset in the mean of the likelihood. + Assumed to be on linear scale. + - name: "--unlabeled_category" + type: string + default: "Unknown" + description: | + Value in the --reference_obs_batch_label field that indicates unlabeled observations + - name: "--reference_var_hvg" + type: string + required: false + description: ".var column containing highly variable genes. If not provided, genes will not be subset." + - name: "--reference_var_gene_names" + type: string + required: false + description: ".var column containing gene names. By default, use the index." + + - name: scVI, scANVI and scArches training options + arguments: + - name: "--early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + + - name: Leiden clustering options + arguments: + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + min: 0 + default: [1] + multiple: true + + - name: Neighbor classifier arguments + arguments: + - name: "--knn_weights" + type: string + default: "uniform" + choices: ["uniform", "distance"] + description: | + Weight function used in prediction. Possible values are: + `uniform` (all points in each neighborhood are weighted equally) or + `distance` (weight points by the inverse of their distance) + - name: "--knn_n_neighbors" + type: integer + default: 15 + min: 5 + required: false + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. + + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference. + example: output.h5mu + - name: "--output_obs_predictions" + type: string + default: "scanvi_pred" + description: "In which .obs slot to store the predicted labels." + - name: "--output_obs_probability" + type: string + default: "scanvi_probabilities" + description: "In which. obs slot to store the probabilities of the predicted labels." + - name: "--output_obsm_integrated" + type: string + default: "X_integrated_scanvi" + description: "In which .obsm slot to store the integrated embedding." + - name: "--output_compression" + type: string + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + - name: "--output_model" + type: file + direction: output + description: Path to the resulting scANVI model that was updated with query data. + +dependencies: + - name: integrate/scvi + - name: annotate/scanvi + - name: integrate/scarches + - name: workflows/multiomics/neighbors_leiden_umap + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/annotation/scanvi_scarches/integration_test.sh b/src/workflows/annotation/scanvi_scarches/integration_test.sh new file mode 100755 index 00000000..5321ac04 --- /dev/null +++ b/src/workflows/annotation/scanvi_scarches/integration_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/annotation/scanvi_scarches/test.nf \ + -entry test_wf \ + -resume \ + -profile no_publish,docker \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/annotation/scanvi_scarches/main.nf b/src/workflows/annotation/scanvi_scarches/main.nf new file mode 100644 index 00000000..e14c767f --- /dev/null +++ b/src/workflows/annotation/scanvi_scarches/main.nf @@ -0,0 +1,123 @@ +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output, "workflow_output_model": state.output_model] + [id, new_state] + } + + // Integrate & generate scvi model from the reference data + | scvi.run( + fromState: [ + "input": "reference", + "modality": "modality", + "input_layer": "layer", + "obs_batch": "reference_obs_batch_label", + "var_input": "reference_var_hvg", + "var_gene_names": "reference_var_gene_names", + "obs_size_factor": "reference_obs_size_factor", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + ], + args: [ + "obsm_output": "X_integrated_scvi" + ], + toState: [ + "reference": "output", + "output_scvi_model": "output_model" + ] + ) + + // Create scanvi model from the scvi reference model and integrate reference data + | scanvi.run( + fromState: [ + "input": "reference", + "modality": "modality", + "input_layer": "layer", + "var_input": "reference_var_hvg", + "var_gene_names": "reference_var_gene_names", + "obs_labels": "reference_obs_target", + "unlabeled_category": "unlabeled_category", + "scvi_model": "output_scvi_model", + "obsm_output": "output_obsm_integrated", + "obs_output_predictions": "output_obs_predictions", + "obs_output_probabilities": "output_obs_probability", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + ], + toState: [ + "reference": "output", + "scanvi_model": "output_model" + ] + ) + + // Update scANVI reference model with query data and integrate+annotate query data + | scarches.run( + fromState: [ + "input": "input", + "layer": "layer", + "modality": "modality", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "input_obs_size_factor": "input_obs_size_factor", + "reference": "scanvi_model", + "obsm_output": "output_obsm_integrated", + "obs_output_predictions": "output_obs_predictions", + "obs_output_probabilities": "output_obs_probability", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + "output": "workflow_output", + "model_output": "workflow_output_model" + ], + toState: [ + "input": "output", + "output_model": "model_output" + ] + ) + | view {"After scarches: $it"} + // Perform neighbors calculation, leiden clustering and umap dimred on the integrated query data + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "output_obsm_integrated", + "output": "workflow_output", + "leiden_resolution": "leiden_resolution" + ], + args: [ + "uns_neighbors": "scanvi_integration_neighbors", + "obsp_neighbor_distances": "scanvi_integration_distances", + "obsp_neighbor_connectivities": "scanvi_integration_connectivities", + "obs_cluster": "scanvi_integration_leiden", + "obsm_umap": "X_scanvi_umap" + ], + toState: [ "output": "output" ] + ) + | setState(["output", "output_model"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/annotation/scanvi_scarches/nextflow.config b/src/workflows/annotation/scanvi_scarches/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/annotation/scanvi_scarches/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/annotation/scanvi_scarches/test.nf b/src/workflows/annotation/scanvi_scarches/test.nf new file mode 100644 index 00000000..423a43dc --- /dev/null +++ b/src/workflows/annotation/scanvi_scarches/test.nf @@ -0,0 +1,64 @@ +nextflow.enable.dsl=2 + +include { scanvi_scarches } from params.rootDir + "/target/nextflow/workflows/annotation/scanvi_scarches/main.nf" +include { scanvi_scarches_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_obs_batch_label: "sample_id", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + reference_var_gene_names: "ensemblid", + max_epochs: 10, + obsm_embedding: "X_pca" + ] + ]) + | map{ state -> [state.id, state] } + | scanvi_scarches + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.containsKey("output_model") : "Output should contain key 'output_model'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + // check output_model + assert state.containsKey("output_model") : "Output should contain key 'output_model'." + assert state.output_model.isDirectory() : "'output_model' should be a directory." + assert state.output_model.toString().endsWith("_model") : "Model output directory should end with '_model'. Found: ${state.output_model}" + def modelFile = state.output_model.resolve("model.pt") + assert modelFile.isFile(), "Model output directory should contain a model.pt file." + + "Output: $output" + } + | scanvi_scarches_test.run( + fromState: [ + "input": "output" + ] + ) + + output_ch + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 event" + assert output_list.collect{it[0]} == ["simple_execution_test"] + } + } \ No newline at end of file diff --git a/src/workflows/annotation/scgpt_annotation/config.vsh.yaml b/src/workflows/annotation/scgpt_annotation/config.vsh.yaml new file mode 100644 index 00000000..81321b48 --- /dev/null +++ b/src/workflows/annotation/scgpt_annotation/config.vsh.yaml @@ -0,0 +1,211 @@ +name: "scgpt_annotation" +namespace: "workflows/annotation" +scope: "public" +description: | + Cell type annotation workflow using scGPT. + The workflow takes a pre-processed h5mu file as query input, and performs + - subsetting for HVG + - cross-checking of genes with the model vocabulary + - binning of gene counts + - padding and tokenizing of genes + - transformer-based cell type prediction + Note that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model. +info: + name: "scGPT Annotation" + test_dependencies: + - name: scgpt_annotation_test + namespace: test_workflows/annotation + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ contributor ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: "Query input" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + type: file + required: true + description: Path to the input file. + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: False + description: | + The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. + - name: "--input_var_gene_names" + type: string + required: false + description: | + The .var field in the input (query) containing gene names; if not provided, the var index will be used. + - name: "--input_obs_batch_label" + type: string + required: true + description: | + The .obs field in the input (query) dataset containing the batch labels. + + - name: "Model input" + arguments: + - name: "--model" + type: file + required: true + example: best_model.pt + description: | + The scGPT model file. + Must be a fine-tuned model that contains keys for checkpoints (--finetuned_checkpoints_key) and cell type label mapper(--label_mapper_key). + - name: "--model_config" + type: file + required: true + example: args.json + description: | + The scGPT model configuration file. + - name: "--model_vocab" + type: file + required: true + example: vocab.json + description: | + The scGPT model vocabulary file. + - name: "--finetuned_checkpoints_key" + type: string + default: model_state_dict + description: | + Key in the model file containing the pre-trained checkpoints. + - name: "--label_mapper_key" + type: string + default: id_to_class + description: | + Key in the model file containing the cell type class to label mapper dictionary. + + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Output file path + example: output.h5mu + - name: "--output_compression" + type: string + example: "gzip" + required: false + choices: ["gzip", "lzf"] + description: | + The compression algorithm to use for the output h5mu file. + - name: "--output_obs_predictions" + type: string + default: "scgpt_pred" + required: false + description: | + The name of the adata.obs column to write predicted cell type labels to. + - name: "--output_obs_probability" + type: string + default: "scgpt_probability" + required: false + description: | + The name of the adata.obs column to write predicted cell type labels to. + + - name: "Padding arguments" + arguments: + - name: "--pad_token" + type: string + default: "" + required: false + description: | + Token used for padding. + - name: "--pad_value" + type: integer + default: -2 + required: false + description: | + The value of the padding token. + + - name: "HVG subset arguments" + arguments: + - name: "--n_hvg" + type: integer + default: 1200 + description: | + Number of highly variable genes to subset for. + - name: "--hvg_flavor" + type: string + choices: ["cell_ranger", "seurat"] + default: "cell_ranger" + description: | + Method to be used for identifying highly variable genes. + Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). + + - name: "Tokenization arguments" + arguments: + - name: "--max_seq_len" + type: integer + required: false + description: | + The maximum sequence length of the tokenized data. + + - name: "Embedding arguments" + arguments: + - name: --dsbn + type: boolean + default: true + description: | + Apply domain-specific batch normalization + - name: "--batch_size" + type: integer + default: 64 + min: 1 + description: | + The batch size to be used for embedding inference. + + - name: "Binning arguments" + arguments: + - name: "--n_input_bins" + type: integer + default: 51 + required: False + min: 1 + description: | + The number of bins to discretize the data into; When no value is provided, data won't be binned. + - name: "--seed" + type: integer + min: 0 + required: false + description: | + Seed for random number generation used for binning. If not set, no seed is used. + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/scgpt + +dependencies: + - name: scgpt/cross_check_genes + - name: scgpt/binning + - name: feature_annotation/highly_variable_features_scanpy + - name: filter/do_filter + - name: scgpt/pad_tokenize + - name: scgpt/cell_type_annotation + alias: scgpt_celltype_annotation + +runners: + - type: nextflow diff --git a/src/workflows/annotation/scgpt_annotation/integration_test.sh b/src/workflows/annotation/scgpt_annotation/integration_test.sh new file mode 100755 index 00000000..108a4c4d --- /dev/null +++ b/src/workflows/annotation/scgpt_annotation/integration_test.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow run . \ + -main-script src/workflows/annotation/scgpt_annotation/test.nf \ + -profile docker,no_publish \ + -entry test_wf \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/annotation/scgpt_annotation/main.nf b/src/workflows/annotation/scgpt_annotation/main.nf new file mode 100644 index 00000000..010ed7b1 --- /dev/null +++ b/src/workflows/annotation/scgpt_annotation/main.nf @@ -0,0 +1,112 @@ +workflow run_wf { + + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Annotate the mudata object with highly variable genes. + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "layer": "input_layer", + "modality": "modality", + "n_top_features": "n_hvg", + "flavor": "hvg_flavor" + ], + args: [ + "var_name_filter": "scgpt_filter_with_hvg" + ], + toState: ["input": "output"] + ) + // Check whether the genes are part of the provided vocabulary. + // Subsets for genes present in vocab only. + | cross_check_genes.run( + fromState: [ + "input": "input", + "modality": "modality", + "vocab_file": "model_vocab", + "input_var_gene_names": "input_var_gene_names", + "output": "output", + "pad_token": "pad_token" + ], + args: [ + "var_input": "scgpt_filter_with_hvg", + "output_var_filter": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // Bins the data into a fixed number of bins. + | binning.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "n_input_bins": "n_input_bins", + "output": "output", + "seed": "seed" + ], + args: [ + "output_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // Padding and tokenization of gene count values. + | pad_tokenize.run( + fromState: [ + "input": "input", + "modality": "modality", + "model_vocab": "model_vocab", + "var_gene_names": "input_var_gene_names", + "pad_token": "pad_token", + "pad_value": "pad_value", + "max_seq_len": "max_seq_len", + "output": "output" + ], + args: [ + "input_obsm_binned_counts": "binned_counts", + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask", + "var_input": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // scGPT decoder-based cell type annotation. + | scgpt_celltype_annotation.run( + fromState: [ + "model": "model", + "model_vocab": "model_vocab", + "model_config": "model_config", + "label_mapper_key": "label_mapper_key", + "finetuned_checkpoints_key": "finetuned_checkpoints_key", + "input": "input", + "modality": "modality", + "obs_batch_label": "input_obs_batch_label", + "pad_token": "pad_token", + "pad_value": "pad_value", + "n_input_bins": "n_input_bins", + "dsbn": "dsbn", + "batch_size": "batch_size", + "seed": "seed", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output": "workflow_output", + "output_compression": "output_compression" + ], + args: [ + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized" + ], + toState: {id, output, state -> ["output": output.output]} + ) + + emit: + output_ch +} diff --git a/src/workflows/annotation/scgpt_annotation/nextflow.config b/src/workflows/annotation/scgpt_annotation/nextflow.config new file mode 100644 index 00000000..059100c4 --- /dev/null +++ b/src/workflows/annotation/scgpt_annotation/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/scgpt_annotation/test.nf b/src/workflows/annotation/scgpt_annotation/test.nf new file mode 100644 index 00000000..5e86cc16 --- /dev/null +++ b/src/workflows/annotation/scgpt_annotation/test.nf @@ -0,0 +1,58 @@ +nextflow.enable.dsl=2 + +include { scgpt_annotation } from params.rootDir + "/target/nextflow/workflows/annotation/scgpt_annotation/main.nf" +include { scgpt_annotation_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + resources_test = file(params.resources_test) + scgpt_test_resources = resources_test / "scgpt" + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: scgpt_test_resources.resolve("test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), + model: scgpt_test_resources.resolve("finetuned_model/best_model.pt"), + model_config: scgpt_test_resources.resolve("source/args.json"), + model_vocab: scgpt_test_resources.resolve("source/vocab.json"), + input_layer: "log_normalized", + input_obs_batch_label: "sample", + // change default to reduce resource requirements + n_hvg: 400, + seed: 1 + ] + ]) + | map{ state -> [state.id, state] } + | scgpt_annotation + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | scgpt_annotation_test.run( + fromState: [ + "input": "output" + ], + args: [ + "n_hvg": 400 + ] + ) + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 event" + assert output_list.collect{it[0]} == ["simple_execution_test"] + } + +} diff --git a/src/workflows/annotation/scvi_knn/config.vsh.yaml b/src/workflows/annotation/scvi_knn/config.vsh.yaml new file mode 100644 index 00000000..b37003bf --- /dev/null +++ b/src/workflows/annotation/scvi_knn/config.vsh.yaml @@ -0,0 +1,229 @@ +name: "scvi_knn" +namespace: "workflows/annotation" +scope: "public" +description: | + "Cell type annotation workflow that performs scVI integration of reference and query dataset followed by KNN label transfer. + The query and reference datasets are expected to be pre-processed in the same way, for example with the `process_samples` workflow of OpenPipeline. + Note that this workflow will integrate the reference dataset from scratch and integrate the query dataset in the same embedding space. + The workflow does not currently output the trained SCVI reference model." +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +info: + name: "scVI Annotation" + test_dependencies: + - name: scvi_knn_test + namespace: test_workflows/annotation + +argument_groups: + - name: Query Input + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Input dataset consisting of the (unlabeled) query observations. + example: input.h5mu + - name: "--modality" + description: Which modality to process. Should match the modality of the --reference dataset. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + example: counts + description: The layer of the input dataset containing the raw counts if .X is not to be used. + - name: "--input_layer_lognormalized" + type: string + example: log_normalized + description: The layer of the input dataset containing the lognormalized counts if .X is not to be used. + - name: "--input_obs_batch_label" + type: string + description: "The .obs field in the input (query) dataset containing the batch labels." + example: "sample" + required: true + - name: "--input_var_gene_names" + type: string + required: false + description: | + The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + - name: "--overwrite_existing_key" + type: boolean_true + description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. + + - name: Reference input + arguments: + - name: "--reference" + required: true + type: file + description: Reference dataset consisting of the labeled observations. + example: reference.h5mu + - name: "--reference_layer" + type: string + example: counts + description: The layer of the reference dataset containing the raw counts if .X is not to be used. + - name: "--reference_layer_lognormalized" + type: string + example: log_normalized + description: The layer of the reference dataset containing the lognormalized counts if .X is not to be used. + - name: "--reference_obs_target" + type: string + example: cell_type + required: true + description: The `.obs` key(s) of the target labels to transfer. + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The .var field in the reference dataset containing gene names; if not provided, the .var index will be used. + - name: "--reference_obs_batch_label" + type: string + description: "The .obs field in the reference dataset containing the batch labels." + example: "sample" + required: true + + - name: "HVG subset arguments" + arguments: + - name: "--n_hvg" + type: integer + default: 2000 + description: | + Number of highly variable genes to subset for. + + - name: scVI integration options + arguments: + - name: "--scvi_early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--scvi_early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--scvi_early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--scvi_early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "--scvi_max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--scvi_reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--scvi_lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--scvi_lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + + - name: Leiden clustering options + arguments: + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + min: 0 + default: [1] + multiple: true + + - name: Neighbor classifier arguments + arguments: + - name: "--knn_weights" + type: string + default: "uniform" + choices: ["uniform", "distance"] + description: | + Weight function used in prediction. Possible values are: + `uniform` (all points in each neighborhood are weighted equally) or + `distance` (weight points by the inverse of their distance) + - name: "--knn_n_neighbors" + type: integer + default: 15 + min: 5 + required: false + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. + + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference. + example: output.h5mu + - name: "--output_obs_predictions" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the predicted cell labels. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix. + - name: "--output_obs_probability" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the probability of the predictions. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix. + - name: "--output_obsm_integrated" + type: string + default: "X_integrated_scvi" + required: false + description: "In which .obsm slot to store the integrated embedding." + +dependencies: + - name: workflows/integration/scvi_leiden + alias: scvi_leiden_workflow + - name: labels_transfer/knn + - name: workflows/multiomics/split_h5mu + - name: dataflow/concatenate_h5mu + - name: feature_annotation/align_query_reference + - name: feature_annotation/highly_variable_features_scanpy + - name: transform/delete_layer + - name: workflows/multiomics/split_modalities + - name: dataflow/merge + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/scgpt + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +runners: + - type: nextflow diff --git a/src/workflows/annotation/scvi_knn/integration_test.sh b/src/workflows/annotation/scvi_knn/integration_test.sh new file mode 100755 index 00000000..405a12ac --- /dev/null +++ b/src/workflows/annotation/scvi_knn/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/annotation/scvi_knn/test.nf \ + -entry test_wf \ + -profile no_publish,docker \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/annotation/scvi_knn/main.nf b/src/workflows/annotation/scvi_knn/main.nf new file mode 100644 index 00000000..2a673f56 --- /dev/null +++ b/src/workflows/annotation/scvi_knn/main.nf @@ -0,0 +1,266 @@ +workflow run_wf { + take: + input_ch + + main: + modalities_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Allign query and reference datasets + | align_query_reference.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "input_layer_lognormalized": "input_layer_lognormalized", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "reference": "reference", + "reference_layer": "reference_layer", + "reference_layer_lognormalized": "reference_layer_lognormalized", + "reference_obs_batch": "reference_obs_batch_label", + "reference_obs_label": "reference_obs_target", + "reference_var_gene_names": "reference_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "overwrite_existing_key": "overwrite_existing_key" + ], + args: [ + "input_id": "query", + "reference_id": "reference", + "output_layer": "_counts", + "output_layer_lognormalized": "_log_normalized", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "output_var_common_genes": "_common_vars", + "align_layers_lognormalized_counts": "true" + ], + toState: [ + "input": "output_query", + "reference": "output_reference" + ] + ) + + | split_modalities.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + rna_ch = modalities_ch + + | filter{id, state -> state.modality == "rna"} + + // Concatenate query and reference datasets prior to integration + // Only concatenate rna modality in this channel + | concatenate_h5mu.run( + fromState: { id, state -> + ["input": [state.input, state.reference]] + }, + args: [ + "input_id": ["query", "reference"], + "modality": "rna", + "other_axis_mode": "move" + ], + toState: ["input": "output"] + ) + + // Calculate HVG across query and reference + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "modality": "modality", + "n_top_features": "n_hvg" + ], + args: [ + "layer": "_log_normalized", + "var_input": "_common_vars", + "var_name_filter": "_common_hvg", + "obs_batch_key": "_sample_id" + ], + toState: [ + "input": "output" + ] + ) + | delete_layer.run( + key: "delete_aligned_lognormalized_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_log_normalized", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Run scvi integration with leiden clustering + | scvi_leiden_workflow.run( + fromState: { id, state -> [ + "id": id, + "input": state.input, + "modality": state.modality, + "obsm_output": state.output_obsm_integrated, + "leiden_resolution": state.leiden_resolution, + "early_stopping": state.scvi_early_stopping, + "early_stopping_monitor": state.scvi_early_stopping_monitor, + "early_stoping_patience": state.scvi_early_stoping_patience, + "early_stopping_min_delta": state.scvi_early_stopping_min_delta, + "max_epochs": state.scvi_max_epochs, + "reduce_lr_on_plateau": state.scvi_reduce_lr_on_plateau, + "lr_factor": state.scvi_lr_factor, + "lr_patience": state.scvi_lr_patience + ]}, + args: [ + "var_input": "_common_hvg", + "uns_neighbors": "scvi_integration_neighbors", + "obsp_neighbor_distances": "scvi_integration_distances", + "obsp_neighbor_connectivities": "scvi_integration_connectivities", + "obs_cluster": "scvi_integration_leiden", + "obsm_umap": "X_leiden_scvi_umap", + "obs_batch": "_sample_id", + "layer": "_counts" + ], + toState: ["input": "output"] + ) + | delete_layer.run( + key: "delete_aligned_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_counts", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Split integrated dataset back into a separate reference and query dataset + | split_h5mu.run( + fromState: [ + "input": "input", + "modality": "modality" + ], + args: [ + "obs_feature": "_dataset", + "output_files": "sample_files.csv", + "drop_obs_nan": "true", + "output": "ref_query" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ] + ) + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + if (workflow.stubRun) { + def output_files = outputDir.listFiles() + def new_state = state + [ + "input": output_files[0], + "reference": output_files[1], + ] + return [id, new_state] + } + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["input": integrated_query, "reference": integrated_reference] + [id, state + newKeys] + } + // remove keys from split files + | map {id, state -> + def keysToRemove = ["output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + + // Perform KNN label transfer from reference to query + | knn.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_target", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "weights": "knn_weights", + "n_neighbors": "knn_n_neighbors", + "output": "workflow_output" + ], + args:[ + "output_compression": "gzip" + ], + toState: ["input": "output"] + // toState: {id, output, state -> ["output": output.output]} + ) + | view {"After processing RNA modality: $it"} + + + other_mod_ch = modalities_ch + | filter{id, state -> state.modality != "rna"} + + output_ch = rna_ch.mix(other_mod_ch) + | groupTuple(by: 0, sort: "hash") + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "reference"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | merge.run( + fromState: ["input": "input"], + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/annotation/scvi_knn/nextflow.config b/src/workflows/annotation/scvi_knn/nextflow.config new file mode 100644 index 00000000..059100c4 --- /dev/null +++ b/src/workflows/annotation/scvi_knn/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/scvi_knn/test.nf b/src/workflows/annotation/scvi_knn/test.nf new file mode 100644 index 00000000..8e676823 --- /dev/null +++ b/src/workflows/annotation/scvi_knn/test.nf @@ -0,0 +1,69 @@ +nextflow.enable.dsl=2 + +include { scvi_knn } from params.rootDir + "/target/nextflow/workflows/annotation/scvi_knn/main.nf" +include { scvi_knn_test } from params.rootDir + "/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer_lognormalized: "log_normalized", + input_obs_batch_label: "sample_id", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_layer_lognormalized: "log_normalized", + reference_var_gene_names: "ensemblid", + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + leiden_resolution: [1.0, 0.25] + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer_lognormalized: "log_normalized", + input_obs_batch_label: "sample_id", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_layer_lognormalized: "log_normalized", + reference_obs_batch_label: "donor_assay", + reference_var_gene_names: "ensemblid", + reference_obs_target: "cell_type", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | scvi_knn + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | scvi_knn_test.run( + fromState: [ + "input": "output" + ] + ) + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } + } + \ No newline at end of file diff --git a/src/workflows/gdo/gdo_singlesample/config.vsh.yaml b/src/workflows/gdo/gdo_singlesample/config.vsh.yaml new file mode 100644 index 00000000..5681723b --- /dev/null +++ b/src/workflows/gdo/gdo_singlesample/config.vsh.yaml @@ -0,0 +1,76 @@ +name: "gdo_singlesample" +namespace: "workflows/gdo" +scope: "public" +description: "Processing unimodal single-sample guide-derived oligonucleotide (GDO) data." +info: + name: "GDO Singlesample" + test_dependencies: +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "Input" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + type: file + required: true + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + type: string + required: false + description: "Input layer to start from. By default, .X will be used." + - name: "Output" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Filtering options" + arguments: + - name: "--min_counts" + example: 200 + type: integer + description: Minimum number of counts captured per cell. + - name: "--max_counts" + example: 5000000 + type: integer + description: Maximum number of counts captured per cell. + + - name: "--min_guides_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--max_guides_per_cell" + example: 1500000 + type: integer + description: Maximum of non-zero values per cell. + + - name: "--min_cells_per_guide" + example: 3 + type: integer + description: Minimum of non-zero values per gene. + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +dependencies: + - name: filter/filter_with_counts + - name: filter/do_filter +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/10x_5k_lung_crispr +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/gdo/gdo_singlesample/integration_test.sh b/src/workflows/gdo/gdo_singlesample/integration_test.sh new file mode 100755 index 00000000..6a2ab79f --- /dev/null +++ b/src/workflows/gdo/gdo_singlesample/integration_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/gdo/gdo_singlesample/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + diff --git a/src/workflows/gdo/gdo_singlesample/main.nf b/src/workflows/gdo/gdo_singlesample/main.nf new file mode 100644 index 00000000..055bb102 --- /dev/null +++ b/src/workflows/gdo/gdo_singlesample/main.nf @@ -0,0 +1,51 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // cell filtering + | filter_with_counts.run( + key: "gdo_filter_with_counts", + fromState: { id, state -> + [ + "input": state.input, + "modality": "gdo", + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_guides_per_cell, + "max_genes_per_cell": state.max_guides_per_cell, + "min_cells_per_gene": state.min_cells_per_guide, + "layer": state.layer, + ] + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "gdo_do_filter", + fromState : { id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def newState = [ + "input": state.input, + "obs_filter": "filter_with_counts", + "modality": "gdo", + "var_filter": "filter_with_counts", + "output_compression": "gzip", + "output": state.workflow_output + ] + return newState + }, + toState: ["output": "output"] + ) + | setState(["output"]) + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/gdo/gdo_singlesample/nextflow.config b/src/workflows/gdo/gdo_singlesample/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/gdo/gdo_singlesample/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/gdo/gdo_singlesample/test.nf b/src/workflows/gdo/gdo_singlesample/test.nf new file mode 100644 index 00000000..430d6d4b --- /dev/null +++ b/src/workflows/gdo/gdo_singlesample/test.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +include { gdo_singlesample } from params.rootDir + "/target/nextflow/workflows/gdo/gdo_singlesample/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"), + min_counts: 3, + max_counts: 10000000, + min_guides_per_cell: 2, + max_guides_per_cell: 10000000, + min_cells_per_guide: 2, + output: "simple_execution_test.final.h5mu" + ] + ]) + | map{ state -> [state.id, state] } + | gdo_singlesample + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + | toSortedList{a, b -> a[0] <=> b[0]} + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain two events" + println "output_list: $output_list" + assert output_list.collect{it[0]} == ["simple_execution_test"] : "Output ID should be same as input ID" + assert (output_list.collect({it[1].output.getFileName().toString()}) as Set).equals(["simple_execution_test.final.h5mu"] as Set) + + } +} diff --git a/src/workflows/ingestion/bd_rhapsody/config.vsh.yaml b/src/workflows/ingestion/bd_rhapsody/config.vsh.yaml new file mode 100644 index 00000000..627794bd --- /dev/null +++ b/src/workflows/ingestion/bd_rhapsody/config.vsh.yaml @@ -0,0 +1,352 @@ +name: "bd_rhapsody" +namespace: "workflows/ingestion" +scope: "public" +info: + name: "BD Rhapsody" + test_dependencies: + - name: bd_rhapsody_test + namespace: test_workflows/ingestion +description: | + BD Rhapsody Sequence Analysis CWL pipeline v2.2.1 + + This pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported + sequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome + mRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and + ATAC-Seq + + The CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML. + +authors: + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ maintainer ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + +argument_groups: +- name: Inputs + arguments: + - name: "--reads" + type: file + description: | + Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: + + - WTA mRNA + - Targeted mRNA + - AbSeq + - Sample Multiplexing + - VDJ + + You may specify as many R1/R2 read pairs as you want. + required: false + multiple: true + example: + - WTALibrary_S1_L001_R1_001.fastq.gz + - WTALibrary_S1_L001_R2_001.fastq.gz + info: + config_key: Reads + - name: "--reads_atac" + type: file + description: | + Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. + You may specify as many R1/R2/I2 files as you want. + required: false + multiple: true + example: + - ATACLibrary_S2_L001_R1_001.fastq.gz + - ATACLibrary_S2_L001_R2_001.fastq.gz + - ATACLibrary_S2_L001_I2_001.fastq.gz + info: + config_key: Reads_ATAC +- name: References + description: | + Assay type will be inferred from the provided reference(s). + Do not provide both reference_archive and targeted_reference at the same time. + + Valid reference input combinations: + - reference_archive: WTA only + - reference_archive & abseq_reference: WTA + AbSeq + - reference_archive & supplemental_reference: WTA + extra transgenes + - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes + - reference_archive: WTA + ATAC or ATAC only + - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes + - targeted_reference: Targeted only + - targeted_reference & abseq_reference: Targeted + AbSeq + - abseq_reference: AbSeq only + + The reference_archive can be generated with the `reference/build_bdrhap_reference` component. + Alternatively, BD also provides standard references which can be downloaded from these locations: + + - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz + - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz + + arguments: + - name: "--reference_archive" + type: file + description: | + Path to Rhapsody WTA Reference in the tar.gz format. + + Structure of the reference archive: + + - `BD_Rhapsody_Reference_Files/`: top level folder + - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate` + - GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf" + example: "RhapRef_Human_WTA_2023-02.tar.gz" + required: false + info: + config_key: Reference_Archive + - name: "--targeted_reference" + type: file + description: | + Path to the targeted reference file in FASTA format. + example: "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + multiple: true + info: + config_key: Targeted_Reference + - name: "--abseq_reference" + type: file + description: Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. + example: "AbSeq_reference.fasta" + multiple: true + info: + config_key: AbSeq_Reference + - name: "--supplemental_reference" + type: file + alternatives: [-s] + description: Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment. + example: "supplemental_reference.fasta" + multiple: true + info: + config_key: Supplemental_Reference +- name: Outputs + description: Outputs + arguments: + - name: "--output" + type: file + direction: output + description: "The processed output file in h5mu format." + example: output.h5mu + required: true + - name: "--output_raw" + type: file + direction: output + alternatives: [-o] + description: "The unprocessed output directory containing all the outputs from the pipeline." + required: true + example: output_dir/ +- name: Putative Cell Calling Settings + arguments: + - name: "--cell_calling_data" + type: string + description: | + Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC + + For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. + + For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. + + The default data for putative cell calling, will be determined the following way: + + - If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC + - If only ATAC Reads exist: ATAC + - Otherwise: mRNA + choices: [mRNA, AbSeq, ATAC, mRNA_and_ATAC] + example: mRNA + info: + config_key: Cell_Calling_Data + - name: "--cell_calling_bioproduct_algorithm" + type: string + description: | + Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_Bioproduct_Algorithm + - name: "--cell_calling_atac_algorithm" + type: string + description: | + Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_ATAC_Algorithm + - name: "--exact_cell_count" + type: integer + description: | + Set a specific number of cells as putative, based on those with the highest error-corrected read count + example: 10000 + min: 1 + info: + config_key: Exact_Cell_Count + - name: "--expected_cell_count" + type: integer + description: | + Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. + example: 20000 + min: 1 + info: + config_key: Expected_Cell_Count +- name: Intronic Reads Settings + arguments: + - name: --exclude_intronic_reads + type: boolean + description: | + By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. + The value can be true or false. + example: false + info: + config_key: Exclude_Intronic_Reads +- name: Multiplex Settings + arguments: + - name: "--sample_tags_version" + type: string + description: | + Specify the version of the Sample Tags used in the run: + + * If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only + * If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. + choices: [human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only] + example: human + info: + config_key: Sample_Tags_Version + - name: "--tag_names" + type: string + description: | + Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv + Do not use the special characters. + multiple: true + example: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + info: + config_key: Tag_Names +- name: VDJ arguments + arguments: + - name: "--vdj_version" + type: string + description: | + If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR + choices: [human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR] + example: human + info: + config_key: VDJ_Version +- name: ATAC options + arguments: + - name: "--predefined_atac_peaks" + type: file + description: An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. + example: predefined_peaks.bed + info: + config_key: Predefined_ATAC_Peaks +- name: Additional options + arguments: + - name: "--run_name" + type: string + description: | + Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. + default: sample + info: + config_key: Run_Name + - name: "--generate_bam" + type: boolean + description: | + Specify whether to create the BAM file output + default: false + info: + config_key: Generate_Bam + - name: "--long_reads" + type: boolean + description: | + Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. + info: + config_key: Long_Reads +- name: Advanced options + description: | + NOTE: Only change these if you are really sure about what you are doing + arguments: + - name: "--custom_star_params" + type: string + description: | + Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. + For reference this is the default that is used: + + Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000` + Long Reads: Same as Short Reads + `--seedPerReadNmax 10000` + + This applies to fastqs provided in the Reads user input + Do NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc. + We use STAR version 2.7.10b + example: "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + info: + config_key: Custom_STAR_Params + - name: "--custom_bwa_mem2_params" + type: string + description: | + Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline + The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used + This applies to fastqs provided in the Reads_ATAC user input + Do NOT set any non-mapping related params like `-C`, `-t`, etc. + We use bwa-mem2 version 2.2.1 + example: "-k 16 -w 200 -r" + info: + config_key: Custom_bwa_mem2_Params +- name: CWL-runner arguments + arguments: + - name: "--parallel" + type: boolean + description: "Run jobs in parallel." + default: true + - name: "--timestamps" + type: boolean_true + description: "Add timestamps to the errors, warnings, and notifications." +- name: Undocumented arguments + arguments: + - name: --abseq_umi + type: integer + multiple: false + info: + config_key: AbSeq_UMI + - name: --target_analysis + type: boolean + multiple: false + info: + config_key: Target_analysis + - name: --vdj_jgene_evalue + type: double + description: | + e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_JGene_Evalue + - name: --vdj_vgene_evalue + type: double + description: | + e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_VGene_Evalue + - name: --write_filtered_reads + type: boolean + multiple: false + info: + config_key: Write_Filtered_Reads +dependencies: + - name: mapping/bd_rhapsody + alias: bd_rhapsody_component + - name: convert/from_bdrhap_to_h5mu +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/bdrhap_5kjrt + - path: /resources_test/reference_gencodev41_chr1/ +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/ingestion/bd_rhapsody/integration_test.sh b/src/workflows/ingestion/bd_rhapsody/integration_test.sh new file mode 100755 index 00000000..f69a5235 --- /dev/null +++ b/src/workflows/ingestion/bd_rhapsody/integration_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/bd_rhapsody/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + diff --git a/src/workflows/ingestion/bd_rhapsody/main.nf b/src/workflows/ingestion/bd_rhapsody/main.nf new file mode 100644 index 00000000..04934c1c --- /dev/null +++ b/src/workflows/ingestion/bd_rhapsody/main.nf @@ -0,0 +1,41 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // run bd rhapsody + | bd_rhapsody_component.run( + fromState: { id, state -> + // pass all arguments except: + // - remove output_h5mu + // - rename output_raw to output_dir + def data_ = state.clone() + data_.remove("output") + data_.remove("output_raw") + data_ + }, + toState: [ + "input": "output_mudata", + "output_raw": "output_dir" + ] + ) + + // convert to h5mu + | from_bdrhap_to_h5mu.run( + fromState: { id, state -> + [ + id: id, + input: state.input, + output: state.output, + output_compression: "gzip" + ] + }, + toState: ["output": "output"] + ) + + | setState(["output_raw", "output"]) + + emit: + output_ch +} diff --git a/src/workflows/ingestion/bd_rhapsody/nextflow.config b/src/workflows/ingestion/bd_rhapsody/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/ingestion/bd_rhapsody/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/ingestion/bd_rhapsody/test.nf b/src/workflows/ingestion/bd_rhapsody/test.nf new file mode 100644 index 00000000..57011368 --- /dev/null +++ b/src/workflows/ingestion/bd_rhapsody/test.nf @@ -0,0 +1,49 @@ +nextflow.enable.dsl=2 + +include { bd_rhapsody } from params.rootDir + "/target/nextflow/workflows/ingestion/bd_rhapsody/main.nf" +include { bd_rhapsody_test } from params.rootDir + "/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "foo", + reads: file("${params.resources_test}/bdrhap_5kjrt/raw/12*.fastq.gz"), + reference_archive: resources_test.resolve("reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz"), + abseq_reference: resources_test.resolve("bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"), + cell_calling_data: "mRNA", + exact_cell_count: 4900 + ] + ]) + | map{ state -> [state.id, state] } + | bd_rhapsody + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + + def id = output[0] + def data = output[1] + + assert id == "foo" : "Output ID should be same as input ID" + assert "output_raw" in data : "Output should contain output_raw" + assert "output" in data : "Output should contain output_h5mu" + assert data.output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + + | bd_rhapsody_test.run( + fromState: ["input": "output"] + ) + + | toList() + | view { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + } + + // | view { output -> output[1]} + // | check_format(args: {""}) // todo: check whether output h5mu has the right slots defined +} diff --git a/src/workflows/ingestion/cellranger_mapping/config.vsh.yaml b/src/workflows/ingestion/cellranger_mapping/config.vsh.yaml new file mode 100644 index 00000000..ed7c015e --- /dev/null +++ b/src/workflows/ingestion/cellranger_mapping/config.vsh.yaml @@ -0,0 +1,116 @@ +name: "cellranger_mapping" +namespace: "workflows/ingestion" +scope: "public" +description: "A pipeline for running Cell Ranger mapping." +info: + name: Cell Ranger mapping + test_dependencies: + - name: cellranger_mapping_test + namespace: test_workflows/ingestion +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - type: file + name: --input + required: true + multiple: true + multiple_sep: ";" + example: [ "sample_S1_L001_R1_001.fastq.gz", "sample_S1_L001_R2_001.fastq.gz" ] + description: The fastq.gz files to align. Can also be a single directory containing fastq.gz files. + - type: file + name: --reference + required: true + description: The path to Cell Ranger reference tar.gz file. + example: reference.tar.gz + - name: Outputs + arguments: + - name: "--output_raw" + type: file + direction: output + description: "Location where the output folder from Cell Ranger will be stored." + required: true + example: output_dir/ + - name: "--output_h5mu" + type: file + direction: output + description: "The output from Cell Ranger, converted to h5mu." + required: true + example: output.h5mu + - name: "--uns_metrics" + type: string + description: Name of the .uns slot under which to QC metrics (if any). + default: "metrics_summary" + - name: "--output_type" + type: string + description: "Which Cell Ranger output to use for converting to h5mu." + choices: [ raw, filtered ] + default: raw + - name: Cell Ranger arguments + arguments: + - type: integer + name: --expect_cells + example: 3000 + description: "Expected number of recovered cells, used as input to cell calling algorithm." + + - type: string + name: --chemistry + default: auto + description: | + Assay configuration. + - auto: autodetect mode + - threeprime: Single Cell 3' + - fiveprime: Single Cell 5' + - SC3Pv1: Single Cell 3' v1 + - SC3Pv2: Single Cell 3' v2 + - SC3Pv3: Single Cell 3' v3 + - SC3Pv3LT: Single Cell 3' v3 LT + - SC3Pv3HT: Single Cell 3' v3 HT + - SC5P-PE: Single Cell 5' paired-end + - SC5P-R2: Single Cell 5' R2-only + - SC-FB: Single Cell Antibody-only 3' v2 or 5' + See https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information. + choices: [ auto, threeprime, fiveprime, SC3Pv1, SC3Pv2, SC3Pv3, SC3Pv3LT, SC3Pv3HT, SC5P-PE, SC5P-R2, SC-FB ] + + - type: boolean + name: "--secondary_analysis" + default: false + description: Whether or not to run the secondary analysis e.g. clustering. + + - type: boolean + name: "--generate_bam" + default: true + description: Whether to generate a BAM file. + + - type: boolean + name: "--include_introns" + default: true + description: Include intronic reads in count (default=true unless --target-panel is specified in which case default=false) +dependencies: + - name: mapping/cellranger_count + - name: mapping/cellranger_count_split + - name: convert/from_10xh5_to_h5mu +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/cellranger_tiny_fastq +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/ingestion/cellranger_mapping/integration_test.sh b/src/workflows/ingestion/cellranger_mapping/integration_test.sh new file mode 100755 index 00000000..c2744e1c --- /dev/null +++ b/src/workflows/ingestion/cellranger_mapping/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/cellranger_mapping/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/cellranger_mapping/main.nf b/src/workflows/ingestion/cellranger_mapping/main.nf new file mode 100644 index 00000000..e8247a21 --- /dev/null +++ b/src/workflows/ingestion/cellranger_mapping/main.nf @@ -0,0 +1,57 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | cellranger_count.run( + fromState: [ + "input": "input", + "output": "output_raw", + "expect_cells": "expect_cells", + "chemistry": "chemistry", + "secondary_analysis": "secondary_analysis", + "generate_bam": "generate_bam", + "include_introns": "include_introns", + "reference": "reference" + ], + toState: [ + "input": "output", + "output_raw": "output" + ] + ) + // split output dir into map + | cellranger_count_split.run( + fromState: {id, state -> + def stateMapping = [ + "input": state.input, + ] + outputType = state.output_type == "raw" ? "raw_h5" : "filtered_h5" + stateMapping += [outputType: "\$id.\$key.${outputType}.h5"] + stateMapping += ["metrics_summary": "\$id.\$key.metrics_summary.csv"] + return stateMapping + }, + toState: {id, output, state -> + def outputFile = state.output_type == "raw" ? output.raw_h5 : output.filtered_h5 + def newState = state + [ "input": outputFile ] + return newState + } + ) + // convert to h5mu + | from_10xh5_to_h5mu.run( + fromState: {id, state -> + [ + "input": state.input, + "output_compression": "gzip", + "output": state.output_h5mu, + "uns_metrics": state.uns_metrics, + "input_metrics_summary": state.metrics_summary + ] + }, + toState: ["output_h5mu": "output"] + ) + | setState(["output_raw", "output_h5mu"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/ingestion/cellranger_mapping/nextflow.config b/src/workflows/ingestion/cellranger_mapping/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/ingestion/cellranger_mapping/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/ingestion/cellranger_mapping/test.nf b/src/workflows/ingestion/cellranger_mapping/test.nf new file mode 100644 index 00000000..452445ed --- /dev/null +++ b/src/workflows/ingestion/cellranger_mapping/test.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +include { cellranger_mapping } from params.rootDir + "/target/nextflow/workflows/ingestion/cellranger_mapping/main.nf" +include { cellranger_mapping_test } from params.rootDir + "/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "foo", + input: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_fastq"), + reference: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_ref"), + output_type: "filtered", + ] + ]) + | map{ state -> [state.id, state] } + | cellranger_mapping + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, out]" + assert output[1] instanceof Map : "Output should be a Map." + // todo: check whether output dir contains fastq files + "Output: $output" + } + + | cellranger_mapping_test.run( + fromState: ["input": "output_h5mu"] + ) + + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "foo" : "Output ID should be same as input ID" + } +} diff --git a/src/workflows/ingestion/cellranger_multi/config.vsh.yaml b/src/workflows/ingestion/cellranger_multi/config.vsh.yaml new file mode 100644 index 00000000..0be49650 --- /dev/null +++ b/src/workflows/ingestion/cellranger_multi/config.vsh.yaml @@ -0,0 +1,58 @@ +name: "cellranger_multi" +namespace: "workflows/ingestion" +scope: "public" +description: "A pipeline for running Cell Ranger multi." +info: + name: Cell Ranger multi + test_dependencies: + - name: cellranger_multi_test + namespace: test_workflows/ingestion +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +__merge__: /src/mapping/cellranger_multi/cellranger_multi.yaml +argument_groups: + - name: Outputs + arguments: + - name: "--output_raw" + type: file + direction: output + description: "The raw output folder." + required: true + example: output_dir/ + - name: "--output_h5mu" + type: file + description: | + Locations for the output files. Must contain a wildcard (*) character, + which will be replaced with the sample name. + example: "*.h5mu" + direction: output + required: true + - name: "--uns_metrics" + type: string + description: Name of the .uns slot under which to QC metrics (if any). + default: "metrics_cellranger" +dependencies: + - name: mapping/cellranger_multi + alias: cellranger_multi_component + - name: convert/from_cellranger_multi_to_h5mu +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/10x_5k_anticmv/raw/ + dest: 10x_5k_anticmv/raw/ + - path: /resources_test/10x_5k_fixed/raw/ + dest: 10x_5k_fixed/raw/ + - path: /resources_test/reference_gencodev41_chr1 +runners: + - type: nextflow diff --git a/src/workflows/ingestion/cellranger_multi/integration_test.sh b/src/workflows/ingestion/cellranger_multi/integration_test.sh new file mode 100755 index 00000000..b0b08402 --- /dev/null +++ b/src/workflows/ingestion/cellranger_multi/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/cellranger_multi/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/ingestion/cellranger_multi/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/cellranger_multi/main.nf b/src/workflows/ingestion/cellranger_multi/main.nf new file mode 100644 index 00000000..36313ee7 --- /dev/null +++ b/src/workflows/ingestion/cellranger_multi/main.nf @@ -0,0 +1,133 @@ +process cellrangerMultiStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("raw_dir"), path("*.h5mu"), path("samples.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir raw_dir + touch ${id}_sample_1.h5mu + touch ${id}_sample_2.h5mu + touch ${id}_sample_3.h5mu + echo -e "sample_name,file\n${id}_sample_1,${id}_sample_1.h5mu\n${id}_sample_2,${id}_sample_2.h5mu\n${id}_sample_3,${id}_sample_3.h5mu" > samples.csv + """ +} + +workflow run_wf { + take: + input_ch + + main: + cellranger_ch = input_ch + | cellranger_multi_component.run( + fromState: [ + "input": "input", + "gex_input": "gex_input", + "abc_input": "abc_input", + "cgc_input": "cgc_input", + "mux_input": "mux_input", + "vdj_input": "vdj_input", + "vdj_t_input": "vdj_t_input", + "vdj_t_gd_input": "vdj_t_gd_input", + "vdj_b_input": "vdj_b_input", + "agc_input": "agc_input", + "library_id": "library_id", + "library_type": "library_type", + "library_subsample": "library_subsample", + "library_lanes": "library_lanes", + "library_chemistry": "library_chemistry", + "sample_ids": "sample_ids", + "sample_description": "sample_description", + "sample_expect_cells": "sample_expect_cells", + "sample_force_cells": "sample_force_cells", + "feature_reference": "feature_reference", + "feature_r1_length": "feature_r1_length", + "feature_r2_length": "feature_r2_length", + "gex_reference": "gex_reference", + "gex_secondary_analysis": "gex_secondary_analysis", + "gex_generate_bam": "gex_generate_bam", + "gex_expect_cells": "gex_expect_cells", + "gex_force_cells": "gex_force_cells", + "gex_include_introns": "gex_include_introns", + "gex_r1_length": "gex_r1_length", + "gex_r2_length": "gex_r2_length", + "gex_chemistry": "gex_chemistry", + "vdj_reference": "vdj_reference", + "vdj_inner_enrichment_primers": "vdj_inner_enrichment_primers", + "vdj_r1_length": "vdj_r1_length", + "vdj_r2_length": "vdj_r2_length", + "cell_multiplex_oligo_ids": "cell_multiplex_oligo_ids", + "min_assignment_confidence": "min_assignment_confidence", + "cmo_set": "cmo_set", + "barcode_sample_assignment": "barcode_sample_assignment", + "probe_set": "probe_set", + "filter_probes": "filter_probes", + "probe_barcode_ids": "probe_barcode_ids", + "control_id": "control_id", + "mhc_allele": "mhc_allele", + "check_library_compatibility": "check_library_compatibility", + "output": "output_raw", + ], + toState: [ + "output_raw": "output", + "input": "output" + ] + ) + + h5mu_ch = cellranger_ch + | from_cellranger_multi_to_h5mu.run( + fromState: {id, state -> + [ + "input": state.input, + "uns_metrics": state.uns_metrics, + "output_compression": "gzip" + ] + }, + toState: {id, output, state -> + [ + "sample_csv": output.sample_csv, + "output_h5mu": output.output, + "output_raw": state.output_raw + ] + }, + filter: {!workflow.stubRun}, + ) + + h5mu_stub_ch = cellranger_ch + | filter{workflow.stubRun} + | map {id, state -> [id, state.input]} + | cellrangerMultiStub + | map {id, raw_dir, output, output_files -> + [id, ["output_raw": raw_dir, "output_h5mu": output, "sample_csv": output_files]] + } + + output_ch = h5mu_stub_ch.concat(h5mu_ch) + | flatMap {id, state -> + def h5mu_list = state.output_h5mu + def samples = readCsv(state.sample_csv.toUriString()) + println "Samples: $samples" + def result = h5mu_list.collect{ h5mu_file -> + println "H5mu: ${h5mu_file}, getName: ${h5mu_file.getName()}" + def corresponding_csv_entry = samples.find{h5mu_file.getName() == it.file} + print "CSV entry: $corresponding_csv_entry" + // The cellranger component used to only be able to output a single h5mu file + // In cases where cell multiplexing is not used (1 output sample), it uses 'run' for the sample ID as a dummy. + // This sample ID 'run' was never used for the ID of the channel events. + // So here we overwrite this 'run' id with the name of the input event. + def new_id = h5mu_list.size() == 1 ? id : corresponding_csv_entry.sample_name + return [ new_id, ["output_h5mu": h5mu_file, "output_raw": state.output_raw, "_meta": ["join_id": id]]] + } + return result + } + + emit: + output_ch +} diff --git a/src/workflows/ingestion/cellranger_multi/nextflow.config b/src/workflows/ingestion/cellranger_multi/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/ingestion/cellranger_multi/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/ingestion/cellranger_multi/test.nf b/src/workflows/ingestion/cellranger_multi/test.nf new file mode 100644 index 00000000..7639307c --- /dev/null +++ b/src/workflows/ingestion/cellranger_multi/test.nf @@ -0,0 +1,91 @@ +nextflow.enable.dsl=2 + +include { cellranger_multi } from params.rootDir + "/target/nextflow/workflows/ingestion/cellranger_multi/main.nf" +include { cellranger_multi_test } from params.rootDir + "/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "foo", + input:[ + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz"), + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz"), + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz"), + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz"), + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz"), + resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz") + ], + gex_reference: resources_test.resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"), + vdj_reference: resources_test.resolve("10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"), + feature_reference: resources_test.resolve("10x_5k_anticmv/raw/feature_reference.csv"), + library_id: [ + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "5k_human_antiCMV_T_TBNK_connect_AB_subset", + "5k_human_antiCMV_T_TBNK_connect_VDJ_subset" + ], + library_type: [ + "Gene Expression", + "Antibody Capture", + "VDJ" + ] + ] + ]) + | map{ state -> [state.id, state] } + | cellranger_multi + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, out]" + assert output[1] instanceof Map : "Output should be a Map." + // todo: check whether output dir contains fastq files + "Output: $output" + } + + | cellranger_multi_test.run( + fromState: ["input": "output_h5mu"] + ) + + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "foo" : "Output ID should be same as input ID" + } +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "foo", + input: resources_test.resolve("10x_5k_fixed/raw/"), + gex_reference: resources_test.resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"), + feature_reference: resources_test.resolve("10x_5k_anticmv/raw/feature_reference.csv"), + library_id: ["4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_subset"], + library_type: ["Gene Expression"], + library_lanes: "any", + probe_barcode_ids: ["BC001|BC002", "BC003", "BC004"], + gex_generate_bam: false, + sample_force_cells: [5000, -1, -1], + probe_set: resources_test.resolve("10x_5k_fixed/raw/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"), + sample_ids: ["Liver_BC1andOvarian_BC2", "Colorectal_BC3", "Pancreas_BC4"] + ] + ]) + | map{ state -> [state.id, state] } + | cellranger_multi + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, out]" + assert output[1] instanceof Map : "Output should be a Map." + // todo: check whether output dir contains fastq files + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 3 : "output channel should contain three events" + assert (output_list.collect{it[0]} as Set) == (["Liver_BC1andOvarian_BC2", "Pancreas_BC4", "Colorectal_BC3"] as Set) : "Output ID should be same as input ID" + } +} diff --git a/src/workflows/ingestion/cellranger_postprocessing/config.vsh.yaml b/src/workflows/ingestion/cellranger_postprocessing/config.vsh.yaml new file mode 100644 index 00000000..f6f7be86 --- /dev/null +++ b/src/workflows/ingestion/cellranger_postprocessing/config.vsh.yaml @@ -0,0 +1,76 @@ +name: "cellranger_postprocessing" +namespace: "workflows/ingestion" +scope: "public" +description: "Post-processing Cell Ranger datasets." +info: + name: Cell Ranger post-processing + test_dependencies: + - name: from_10xh5_to_h5mu + namespace: convert + - name: cellranger_postprocessing_test + namespace: test_workflows/ingestion +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - type: file + name: --input + required: true + example: "input.h5mu" + description: Input h5mu file created by running Cell Ranger and converting its output to h5mu. + - name: Outputs + arguments: + - name: "--output" + type: file + direction: output + description: "The converted h5mu file." + - name: Correction arguments + arguments: + - name: "--perform_correction" + type: boolean_true + description: Whether or not to run CellBender to perform count correction. + - name: "--cellbender_epochs" + type: integer + description: Number of epochs to run CellBender for. + default: 150 + - name: Filtering arguments + arguments: + - name: "--min_genes" + type: integer + example: 100 + description: Minimum number of counts required for a cell to pass filtering. + - name: "--min_counts" + type: integer + example: 1000 + description: Minimum number of genes expressed required for a cell to pass filtering. +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +dependencies: + - name: correction/cellbender_remove_background + - name: filter/filter_with_counts + - name: filter/subset_h5mu +# test_dependencies: +# - name: convert/from_10xh5_to_h5mu +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow diff --git a/src/workflows/ingestion/cellranger_postprocessing/integration_test.sh b/src/workflows/ingestion/cellranger_postprocessing/integration_test.sh new file mode 100755 index 00000000..3ca259c9 --- /dev/null +++ b/src/workflows/ingestion/cellranger_postprocessing/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/cellranger_postprocessing/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/ingestion/cellranger_postprocessing/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/cellranger_postprocessing/main.nf b/src/workflows/ingestion/cellranger_postprocessing/main.nf new file mode 100644 index 00000000..c21d9c39 --- /dev/null +++ b/src/workflows/ingestion/cellranger_postprocessing/main.nf @@ -0,0 +1,67 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map{id, state -> + def output_state + if (!workflow.stubRun) { + assert (state.perform_correction || state.min_genes != null || state.min_counts != null): + "Either perform_correct, min_genes or min_counts should be specified!" + output_state = state + } else { + output_state = state + ["perform_correction": true, "min_genes": 1, "min_counts": 1] + } + [id, output_state] + } + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // perform correction if so desired + | cellbender_remove_background.run( + runIf: {id, state -> state.perform_correction}, + fromState: { id, state -> + [ + input: state.input, + epochs: state.cellbender_epochs, + output_layer: "cellbender_corrected", + output_compression: "gzip", + output: state.workflow_output, + ] + }, + toState: { id, output, state -> + state + ["input": output.output, "layer": "cellbender_corrected"] + } + ) + | filter_with_counts.run( + runIf: {id, state -> + state.min_genes != null || state.min_counts != null + }, + fromState: { id, state -> + [ + input: state.input, + min_genes: state.min_genes, + min_counts: state.min_counts, + layer: state.layer, + output_compression: "gzip", + do_subset: true, + output: state.workflow_output, + ] + }, + toState: [input: "output"] + ) + // Make sure to use the correct ouput file names, + // irrespective of which component(s) (or combinations of then) + // were run. The above components + // should put their output into 'input' + | map {id, state -> + [id, ["output": state.input]] + } + + emit: + output_ch +} diff --git a/src/workflows/ingestion/cellranger_postprocessing/nextflow.config b/src/workflows/ingestion/cellranger_postprocessing/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/ingestion/cellranger_postprocessing/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/ingestion/cellranger_postprocessing/test.nf b/src/workflows/ingestion/cellranger_postprocessing/test.nf new file mode 100644 index 00000000..a6499dcc --- /dev/null +++ b/src/workflows/ingestion/cellranger_postprocessing/test.nf @@ -0,0 +1,105 @@ +nextflow.enable.dsl=2 + +include { cellranger_postprocessing } from params.rootDir + "/target/nextflow/workflows/ingestion/cellranger_postprocessing/main.nf" +include { from_10xh5_to_h5mu } from params.rootDir + "/target/nextflow/convert/from_10xh5_to_h5mu/main.nf" +include { cellranger_postprocessing_test } from params.rootDir + "/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "foo", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + input_og: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + perform_correction: true, + min_genes: 100, + min_counts: 1000, + cellbender_epochs: 5 + ] + ]) + | map{ state -> [state.id, state] } + + | cellranger_postprocessing.run( + toState: {id, output, state -> + output + [ + input_og: state.input_og, + perform_correction: state.perform_correction + ] + } + ) + + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, out]" + assert output[1] instanceof Map : "Output should be a Map." + // todo: check whether output dir contains fastq files + "Output: $output" + } + + | cellranger_postprocessing_test.run( + fromState: {id, state -> + [ + input: state.output, + input_og: state.input_og, + is_corrected: state.perform_correction + ] + } + ) + + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + } + //| check_format(args: {""}) // todo: check whether output h5mu has the right slots defined +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "zing", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + input_og: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + perform_correction: false, + min_genes: 100, + min_counts: 1000, + cellbender_epochs: 5 + ] + ]) + | map{ state -> [state.id, state] } + | cellranger_postprocessing.run( + toState: {id, output, state -> + output + [ + input_og: state.input_og, + perform_correction: state.perform_correction + ] + } + ) + + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, out]" + assert output[1] instanceof Map : "Output should be a Map." + // todo: check whether output dir contains fastq files + "Output: $output" + } + + | cellranger_postprocessing_test.run( + fromState: {id, state -> + [ + input: state.output, + input_og: state.input_og, + is_corrected: state.perform_correction + ] + } + ) + + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + } +} diff --git a/src/workflows/ingestion/conversion/config.vsh.yaml b/src/workflows/ingestion/conversion/config.vsh.yaml new file mode 100644 index 00000000..76e7150d --- /dev/null +++ b/src/workflows/ingestion/conversion/config.vsh.yaml @@ -0,0 +1,67 @@ +name: "conversion" +namespace: "workflows/ingestion" +scope: "public" +description: "A pipeline to convert different file formats to .h5mu." +info: + name: Convert to MuData + test_dependencies: + - name: conversion_test + namespace: test_workflows/ingestion +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [-i] + description: Path to the sample. + required: true + example: input.h5mu + type: file + - name: "--input_type" + type: string + required: true + alternatives: [-t] + description: Type of the input file + choices: ["10xmtx", "10xh5", "h5ad"] + - name: Outputs + arguments: + - name: "--output" + required: false + direction: output + type: file + multiple: false + description: Name or template for the output files. + example: output.h5mu + - name: Conversion from h5ad + arguments: + - name: "--modality" + required: false + type: string + multiple: true + description: "Name of the modality where the h5ad is stored in the h5mu object." +dependencies: + - name: convert/from_10xh5_to_h5mu + - name: convert/from_10xmtx_to_h5mu + - name: convert/from_h5ad_to_h5mu +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow diff --git a/src/workflows/ingestion/conversion/integration_test.sh b/src/workflows/ingestion/conversion/integration_test.sh new file mode 100755 index 00000000..d551d436 --- /dev/null +++ b/src/workflows/ingestion/conversion/integration_test.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/conversion/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/conversion/main.nf b/src/workflows/ingestion/conversion/main.nf new file mode 100644 index 00000000..52c84ad1 --- /dev/null +++ b/src/workflows/ingestion/conversion/main.nf @@ -0,0 +1,34 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | runEach( + components: [from_10xh5_to_h5mu, from_h5ad_to_h5mu, from_10xmtx_to_h5mu], + filter: { id, state, component -> + def componentNameMapper = [ + "10xh5": "from_10xh5_to_h5mu", + "10xmtx": "from_10xmtx_to_h5mu", + "h5ad": "from_h5ad_to_h5mu" + ] + componentNameMapper[state.input_type] == component.config.name + }, + fromState: { id, state, component -> + def passed_state = [ + input: state.input, + compression: "gzip", + output: state.output + ] + if (component.name == "from_h5ad_to_h5mu") { + passed_state.modality = state.modality + } + passed_state + }, + toState: ["output": "output"] + ) + | setState(["output": "output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/ingestion/conversion/nextflow.config b/src/workflows/ingestion/conversion/nextflow.config new file mode 100644 index 00000000..c6511810 --- /dev/null +++ b/src/workflows/ingestion/conversion/nextflow.config @@ -0,0 +1,11 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/ingestion/conversion/test.nf b/src/workflows/ingestion/conversion/test.nf new file mode 100644 index 00000000..7b542874 --- /dev/null +++ b/src/workflows/ingestion/conversion/test.nf @@ -0,0 +1,57 @@ +nextflow.enable.dsl=2 + +include { conversion } from params.rootDir + "/target/nextflow/workflows/ingestion/conversion/main.nf" +include { conversion_test } from params.rootDir + "/target/_test/nextflow/test_workflows/ingestion/conversion_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "10xh5_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5"), + input_type: "10xh5", + modality: null + ], + [ + id: "10xmtx_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"), + input_type: "10xmtx", + modality: null, + output: "\$id.h5mu" + ], + [ + id: "10xmtx", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"), + input_type: "10xmtx", + modality: "rna", + output: "\$key.h5mu" + ], + [ + id: "h5ad", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"), + input_type: "h5ad", + modality: "rna", + output: "\$key.h5mu" + ] + ]) + | map{ state -> [state.id, state] } + | conversion + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + + | conversion_test.run( + fromState: ["input": "output"] + ) + + | toSortedList() + | map { output_list -> + assert output_list.size() == 4 : "output channel should contain four events" + } +} diff --git a/src/workflows/ingestion/demux/config.vsh.yaml b/src/workflows/ingestion/demux/config.vsh.yaml new file mode 100644 index 00000000..7a9aa62b --- /dev/null +++ b/src/workflows/ingestion/demux/config.vsh.yaml @@ -0,0 +1,83 @@ +name: demux +namespace: workflows/ingestion +scope: "public" +description: | + Convert `.bcl` files to `.fastq` files using bcl2fastq, bcl-convert or Cell Ranger mkfastq. +info: + name: Demux + short_description: A generic pipeline for running bcl2fastq, bcl-convert or Cell Ranger mkfastq. + test_dependencies: +authors: + - __merge__: /src/authors/toni_verbeiren.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/marijke_van_moerbeke.yaml + roles: [ author ] + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/samuel_d_souza.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author ] +arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [ "-i" ] + type: file + required: true + description: Input run directory + example: bcl_dir + - name: "--sample_sheet" + alternatives: [ "-s" ] + type: file + required: true + description: Pointer to the sample sheet + example: bcl_dir + - name: "--demultiplexer" + type: string + description: The multiplexer to use, one of bclconvert or mkfastq + choices: [ bclconvert, bcl2fastq, mkfastq ] + default: bcl2fastq + - name: "--ignore_missing" + type: boolean + description: Should the demultiplexer ignore missing entities (filter, ...) + - name: "--output_fastq" + type: file + direction: output + required: true + description: Output directory containig fastq files + example: fastq_dir + - name: "--output_fastqc" + type: file + direction: output + required: false + description: Reports directory produced by FastQC + example: reports_dir + - name: "--output_multiqc" + type: file + direction: output + required: false + description: Reports directory produced by MultiQC + example: reports_dir +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/cellranger_tiny_bcl +dependencies: + - name: demux/cellranger_mkfastq + - name: demux/bcl_convert + - name: demux/bcl2fastq + - name: qc/fastqc + - name: qc/multiqc +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/ingestion/demux/integration_test.sh b/src/workflows/ingestion/demux/integration_test.sh new file mode 100755 index 00000000..ef155d14 --- /dev/null +++ b/src/workflows/ingestion/demux/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/demux/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/demux/main.nf b/src/workflows/ingestion/demux/main.nf new file mode 100644 index 00000000..34088ea4 --- /dev/null +++ b/src/workflows/ingestion/demux/main.nf @@ -0,0 +1,65 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // run the demultiplexers + | runEach( + components: [cellranger_mkfastq, bcl_convert, bcl2fastq], + filter: { id, state, component -> + def funcNameMapper = [ + "bclconvert": "bcl_convert", + "bcl2fastq": "bcl2fastq", + "mkfastq": "cellranger_mkfastq" + ] + funcNameMapper[state.demultiplexer] == component.config.name + }, + fromState: { id, state, component -> + def data = [ + input: state.input, + sample_sheet: state.sample_sheet, + reports: null // disable reports so they end up in the output dir + ] + if (component.config.name== "bcl2fastq") { + data.ignore_missing = state.ignore_missing + } + data + }, + toState: [ + "input": "output", + "output_fastq": "output" + ] + ) + + // run fastqc + | fastqc.run( + fromState: [ + "input": "input", + "output": "output_fastqc" + ], + args: [mode: "dir"], + toState: [ + "output_fastqc": "output", + "input": "output" + ] + ) + + // run multiqc + | multiqc.run( + fromState: { id, state -> + [ + "input": [state.input], + "output": state.output_multiqc + ] + }, + toState: ["output_multiqc": "output"] + ) + // subset state to the outputs + | setState(["output_fastq", "output_fastqc", "output_multiqc"]) + + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/ingestion/demux/nextflow.config b/src/workflows/ingestion/demux/nextflow.config new file mode 100644 index 00000000..059100c4 --- /dev/null +++ b/src/workflows/ingestion/demux/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/ingestion/demux/test.nf b/src/workflows/ingestion/demux/test.nf new file mode 100644 index 00000000..366fa9d9 --- /dev/null +++ b/src/workflows/ingestion/demux/test.nf @@ -0,0 +1,56 @@ +nextflow.enable.dsl=2 + + +include { demux } from params.rootDir + "/target/nextflow/workflows/ingestion/demux/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + // or when running from s3: + Channel.fromList([ + [ + id: "mkfastq_test", + input: resources_test.resolve("cellranger_tiny_bcl/bcl"), + sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"), + demultiplexer: "mkfastq" + ], + [ + id: "bclconvert_test", + input: resources_test.resolve("cellranger_tiny_bcl/bcl2/"), + sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl2/sample_sheet.csv"), + demultiplexer: "bclconvert" + ], + [ + id: "bcl2fastq_test", + input: resources_test.resolve("cellranger_tiny_bcl/bcl"), + sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"), + demultiplexer: "bcl2fastq", + ignore_missing: true + ] + ]) + | map{ state -> [state.id, state] } + | demux + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, state]" + + def id = output[0] + assert id.contains("_test") + + def state = output[1] + assert state.containsKey("output_fastq") : "State should contain output_fastq" + assert state.output_fastq.isDirectory() : "output_fastq should be a directory." + assert state.containsKey("output_fastqc") : "State should contain output_fastqc" + assert state.output_fastqc.isDirectory() : "output_fastqc should be a directory." + assert state.containsKey("output_multiqc") : "State should contain output_multiqc" + assert state.output_multiqc.isDirectory() : "output_multiqc should be a directory." + + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 3 : "There should be three outputs" + } +} diff --git a/src/workflows/ingestion/make_reference/config.vsh.yaml b/src/workflows/ingestion/make_reference/config.vsh.yaml new file mode 100644 index 00000000..fd923b36 --- /dev/null +++ b/src/workflows/ingestion/make_reference/config.vsh.yaml @@ -0,0 +1,169 @@ +name: make_reference +namespace: workflows/ingestion +scope: "public" +description: | + Build a transcriptomics reference into one of many formats. +info: + test_dependencies: +authors: + - __merge__: /src/authors/angela_pisco.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the reference. + example: foo + - type: file + name: --genome_fasta + required: true + description: Reference genome fasta. + example: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz + - type: file + name: --transcriptome_gtf + required: true + description: Reference transcriptome annotation. + example: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz + - type: file + name: --ercc + description: ERCC sequence and annotation file. + example: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip + - name: STAR Settings + arguments: + - type: integer + name: --star_genome_sa_index_nbases + description: | + Length (bases) of the SA pre-indexing string. Typically between 10 and 15. + Longer strings will use much more memory, but allow faster searches. For small + genomes, the parameter {genomeSAindexNbases must be scaled down to + min(14, log2(GenomeLength)/2 - 1). + required: false + default: 14 + - name: BD Rhapsody Settings + arguments: + - type: string + name: --bdrhap_mitochondrial_contigs + description: | + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are + identified as 'nuclear fragments' in the ATACseq analysis pipeline. + required: false + multiple: true + default: [chrM, chrMT, M, MT] + - type: boolean_true + name: --bdrhap_filtering_off + description: | + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features + having the following attribute values are kept: + + - protein_coding + - lncRNA + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + - type: boolean_true + name: --bdrhap_wta_only_index + description: Build a WTA only index, otherwise builds a WTA + ATAC index. + - type: string + name: --bdrhap_extra_star_params + description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + required: false + - name: "Cellranger ARC options" + arguments: + - name: "--motifs_file" + type: file + direction: input + description: Path to file containing transcription factor motifs in JASPAR format. + - name: "--non_nuclear_contigs" + multiple: true + required: false + type: string + description: | + Name(s) of contig(s) that do not have any chromatin structure, for example, + mitochondria or plastids. These contigs are excluded from peak calling since + the entire contig will be "open" due to a lack of chromatin structure. + Leave empty if there are no such contigs. + + - name: Outputs + arguments: + - type: string + name: --target + choices: [ cellranger, cellranger_arc, bd_rhapsody, star ] + description: Which reference indices to generate. + multiple: true + default: [ star ] + - type: file + name: --output_fasta + direction: output + description: Output genome sequence fasta. + example: genome_sequence.fa.gz + - type: file + name: --output_gtf + direction: output + description: Output transcriptome annotation gtf. + example: transcriptome_annotation.gtf.gz + - type: file + name: --output_cellranger + direction: output + description: Output index + example: cellranger_index.tar.gz + - type: file + name: --output_cellranger_arc + direction: output + description: Output index + example: cellranger_index_arc.tar.gz + - type: file + name: --output_bd_rhapsody + direction: output + description: Output index + example: bdrhap_index.tar.gz + - type: file + name: --output_star + direction: output + description: Output index + example: star_index.tar.gz + - name: Arguments + arguments: + - type: string + name: --subset_regex + description: Will subset the reference chromosomes using the given regex. + example: (ERCC-00002|chr1) +dependencies: + - name: reference/make_reference + alias: make_reference_component + - name: reference/build_bdrhap_reference + - name: reference/build_star_reference + - name: reference/build_cellranger_reference + - name: reference/build_cellranger_arc_reference +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/reference_gencodev41_chr1 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/ingestion/make_reference/integration_test.sh b/src/workflows/ingestion/make_reference/integration_test.sh new file mode 100755 index 00000000..d390fc78 --- /dev/null +++ b/src/workflows/ingestion/make_reference/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/ingestion/make_reference/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/ingestion/make_reference/main.nf b/src/workflows/ingestion/make_reference/main.nf new file mode 100644 index 00000000..fba3cb4b --- /dev/null +++ b/src/workflows/ingestion/make_reference/main.nf @@ -0,0 +1,95 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map{ id, state -> + // remove all fields starting with 'output_' + def newState = state.findAll{k, v -> !k.startsWith("output_")} + // make sure target is present + newState.target = newState.target ?: [] + [id, newState] + } + | make_reference_component.run( + fromState: [ + "input": "input", + "genome_fasta": "genome_fasta", + "transcriptome_gtf": "transcriptome_gtf", + "ercc": "ercc", + "output_fasta": "output_fasta", + "output_gtf": "output_gtf", + "subset_regex": "subset_regex" + ], + toState: [ + "output_fasta": "output_fasta", + "output_gtf": "output_gtf" + ] + ) + | build_cellranger_arc_reference.run( + runIf: { id, state -> + state.target.contains("cellranger_arc") + }, + fromState: [ + "genome_fasta": "output_fasta", + "annotation_gtf": "output_gtf", + "output": "output_cellranger_arc", + "motifs_file": "motifs_file", + "non_nuclear_contigs": "non_nuclear_contigs", + ], + toState: [ + "output_cellranger_arc": "output" + ], + ) + | build_cellranger_reference.run( + runIf: { id, state -> + state.target.contains("cellranger") + }, + fromState: [ + genome_fasta: "output_fasta", + transcriptome_gtf: "output_gtf" + ], + toState: [ + output_cellranger: "output" + ] + ) + | build_star_reference.run( + runIf: { id, state -> + state.target.contains("star") + }, + fromState: [ + genome_fasta: "output_fasta", + transcriptome_gtf: "output_gtf", + genomeSAindexNbases: "star_genome_sa_index_nbases" + ], + toState: [ + output_star: "output" + ] + ) + | build_bdrhap_reference.run( + runIf: { id, state -> + state.target.contains("bd_rhapsody") + }, + fromState: [ + genome_fasta: "output_fasta", + gtf: "output_gtf", + mitochondrial_contigs: "bdrhap_mitochondrial_contigs", + filtering_off: "bdrhap_filtering_off", + wta_only_index: "bdrhap_wta_only_index", + rna_only_index: "bdrhap_rna_only_index" + ], + toState: [ + output_bd_rhapsody: "reference_archive" + ] + ) + | setState([ + "output_fasta", + "output_gtf", + "output_cellranger", + "output_star", + "output_bd_rhapsody", + "output_cellranger_arc", + ]) + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/ingestion/make_reference/nextflow.config b/src/workflows/ingestion/make_reference/nextflow.config new file mode 100644 index 00000000..059100c4 --- /dev/null +++ b/src/workflows/ingestion/make_reference/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/ingestion/make_reference/test.nf b/src/workflows/ingestion/make_reference/test.nf new file mode 100644 index 00000000..80c0d314 --- /dev/null +++ b/src/workflows/ingestion/make_reference/test.nf @@ -0,0 +1,33 @@ +nextflow.enable.dsl=2 + +include { make_reference } from params.rootDir + "/target/nextflow/workflows/ingestion/make_reference/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "gencode_v41_ercc", + genome_fasta: resources_test.resolve("reference_gencodev41_chr1/reference.fa.gz"), + transcriptome_gtf: resources_test.resolve("reference_gencodev41_chr1/reference.gtf.gz"), + ercc: resources_test.resolve("reference_gencodev41_chr1/ERCC92.zip"), + subset_regex: "(ERCC-00002|chr1)", + target: ["cellranger", "bd_rhapsody", "star"] + ] + ]) + | map{ state -> [state.id, state] } + | make_reference + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].size() == 5 : "output data should contain 5 elements" + // todo: check output data tuple + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "There should be one output" + } +} diff --git a/src/workflows/integration/bbknn_leiden/config.vsh.yaml b/src/workflows/integration/bbknn_leiden/config.vsh.yaml new file mode 100644 index 00000000..c0625bb3 --- /dev/null +++ b/src/workflows/integration/bbknn_leiden/config.vsh.yaml @@ -0,0 +1,117 @@ +name: "bbknn_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run bbknn followed by leiden clustering and run umap on the result." +info: + test_dependencies: +authors: + - __merge__: /src/authors/mauro_saporita.yaml + roles: [ author ] + - __merge__: /src/authors/povilas_gibas.yaml + roles: [ author ] +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + default: "log_normalized" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: Bbknn + arguments: + - name: "--obsm_input" + description: The dimensionality reduction in `.obsm` to use for neighbour detection. Defaults to X_pca. + type: string + default: "X_pca" + - name: "--obs_batch" + type: string + description: .obs column name discriminating between your batches. + default: "sample_id" + - name: "--uns_output" + type: string + default: "bbknn_integration_neighbors" + description: Mandatory .uns slot to store various neighbor output objects. + - name: "--obsp_distances" + type: string + default: "bbknn_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_connectivities" + type: string + default: "bbknn_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: "--n_neighbors_within_batch" + type: integer + description: How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches. + default: 3 + - name: "--n_pcs" + type: integer + description: How many dimensions (in case of PCA, principal components) to use in the analysis. + default: 50 + - name: "--n_trim" + type: integer + description: Trim the neighbours of each cell to these many top connectivities. May help with population independence and improve the tidiness of clustering. The lower the value the more independent the individual populations, at the cost of more conserved batch effect. If `None` (default), sets the parameter value automatically to 10 times `neighbors_within_batch` times the number of batches. Set to 0 to skip. + - name: Clustering options + arguments: + - name: "--obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + default: "bbknn_integration_leiden" + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: UMAP options + arguments: + - name: "--obsm_umap" + type: string + default: "X_leiden_bbknn_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/pbmc_1k_protein_v3 +dependencies: + - name: cluster/leiden + - name: dimred/umap + - name: neighbors/bbknn + - name: metadata/move_obsm_to_obs +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/integration/bbknn_leiden/integration_test.sh b/src/workflows/integration/bbknn_leiden/integration_test.sh new file mode 100755 index 00000000..79c3106c --- /dev/null +++ b/src/workflows/integration/bbknn_leiden/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/bbknn_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/integration/bbknn_leiden/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/bbknn_leiden/main.nf b/src/workflows/integration/bbknn_leiden/main.nf new file mode 100644 index 00000000..474d12ff --- /dev/null +++ b/src/workflows/integration/bbknn_leiden/main.nf @@ -0,0 +1,79 @@ +workflow run_wf { + take: + input_ch + + main: + bbknn_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // compute bbknn graph + | bbknn.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_input", + "obs_batch": "obs_batch", + "uns_output": "uns_output", + "obsp_distances": "obsp_distances", + "obsp_connectivities": "obsp_connectivities", + "n_neighbors_within_batch": "n_neighbors_within_batch", + "n_pcs": "n_pcs", + "n_trim": "n_trim", + ], + toState: [ + "input": "output" + ] + ) + with_leiden_ch = bbknn_ch + | filter{id, state -> state.leiden_resolution} + // run leiden on the bbknn graph + | leiden.run( + fromState: [ + "input": "input", + "obsp_connectivities": "obsp_connectivities", + "obsm_name": "obs_cluster", + "resolution": "leiden_resolution", + "modality": "modality" + ], + toState: [ + "input": "output" + ] + ) + // move obsm leiden cluster dataframe to obs + | move_obsm_to_obs.run( + fromState: + [ + "input": "input", + "obsm_key": "obs_cluster", + "modality": "modality", + ], + toState: ["input": "output"] + ) + + without_leiden_ch = bbknn_ch + | filter{id, state -> !state.leiden_resolution} + + output_ch = with_leiden_ch.mix(without_leiden_ch) + // run umap on the bbknn graph + | umap.run( + fromState: { id, state -> + [ + "input": state.input, + "uns_neighbors": state.uns_output, + "obsm_output": state.obsm_umap, + "modality": state.modality, + "output": state.workflow_output, + "output_compression": "gzip" + ] + }, + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/integration/bbknn_leiden/nextflow.config b/src/workflows/integration/bbknn_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/bbknn_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/bbknn_leiden/test.nf b/src/workflows/integration/bbknn_leiden/test.nf new file mode 100644 index 00000000..5a1eba9a --- /dev/null +++ b/src/workflows/integration/bbknn_leiden/test.nf @@ -0,0 +1,82 @@ +nextflow.enable.dsl=2 + +include { bbknn_leiden } from params.rootDir + "/target/nextflow/workflows/integration/bbknn_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = + Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized" + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | bbknn_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = + Channel.fromList([ + [ + id: "test_output_arg", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + output: "test.h5mu", + ], + ]) + | map{ state -> [state.id, state] } + | bbknn_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith("test.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 event." + assert output_list.collect{it[0]} == ["test_output_arg"] + } +} diff --git a/src/workflows/integration/harmony_leiden/config.vsh.yaml b/src/workflows/integration/harmony_leiden/config.vsh.yaml new file mode 100644 index 00000000..7a6ecb4f --- /dev/null +++ b/src/workflows/integration/harmony_leiden/config.vsh.yaml @@ -0,0 +1,118 @@ +name: "harmony_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run harmony integration followed by neighbour calculations, leiden clustering and run umap on the result." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + default: "log_normalized" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: Neighbour calculation + arguments: + - name: "--uns_neighbors" + type: string + default: harmonypy_integration_neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--obsp_neighbor_distances" + type: string + default: "harmonypy_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_neighbor_connectivities" + type: string + default: "harmonypy_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Harmony integration options + arguments: + - name: "--embedding" + default: "X_pca" + type: string + description: "Embedding to use as input" + - name: "--obsm_integrated" + type: string + default: "X_pca_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--obs_covariates" + type: string + description: "The .obs field(s) that define the covariate(s) to regress out." + example: ["batch", "sample"] + multiple: true + required: true + - name: "--theta" + type: double + description: | + Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta + result in more diverse clusters." + default: 2 + multiple: true + - name: Clustering options + arguments: + - name: "--obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + default: "harmony_integration_leiden" + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: Umap options + arguments: + - name: "--obsm_umap" + type: string + default: "X_leiden_harmony_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +dependencies: + - name: integrate/harmonypy + - name: workflows/multiomics/neighbors_leiden_umap +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/integration/harmony_leiden/integration_test.sh b/src/workflows/integration/harmony_leiden/integration_test.sh new file mode 100755 index 00000000..1dfb63b1 --- /dev/null +++ b/src/workflows/integration/harmony_leiden/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/harmony_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/integration/harmony_leiden/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/harmony_leiden/main.nf b/src/workflows/integration/harmony_leiden/main.nf new file mode 100644 index 00000000..3e442bb3 --- /dev/null +++ b/src/workflows/integration/harmony_leiden/main.nf @@ -0,0 +1,44 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // run harmonypy + | harmonypy.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "embedding", + "obs_covariates": "obs_covariates", + "obsm_output": "obsm_integrated", + "theta": "theta" + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_integrated", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/integration/harmony_leiden/nextflow.config b/src/workflows/integration/harmony_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/harmony_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/harmony_leiden/test.nf b/src/workflows/integration/harmony_leiden/test.nf new file mode 100644 index 00000000..e5c0d025 --- /dev/null +++ b/src/workflows/integration/harmony_leiden/test.nf @@ -0,0 +1,94 @@ +nextflow.enable.dsl=2 + +include { harmony_leiden } from params.rootDir + "/target/nextflow/workflows/integration/harmony_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = + Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + obs_covariates: "sample_id", + embedding: "X_pca", + leiden_resolution: [1.0, 0.25], + output: "foo.final.h5mu" + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + obs_covariates: "sample_id", + embedding: "X_pca", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | harmony_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } +} + + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = + Channel.fromList([ + [ + id: "test_output_arg", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + obs_covariates: "sample_id", + embedding: "X_pca", + leiden_resolution: [1.0, 0.25], + output: "foo.final.h5mu", + ] + ]) + | map{ state -> [state.id, state] } + | harmony_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith("foo.final.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 events" + assert output_list.collect{it[0]} == ["test_output_arg"] + } +} + + diff --git a/src/workflows/integration/scanorama_leiden/config.vsh.yaml b/src/workflows/integration/scanorama_leiden/config.vsh.yaml new file mode 100644 index 00000000..955d0f09 --- /dev/null +++ b/src/workflows/integration/scanorama_leiden/config.vsh.yaml @@ -0,0 +1,131 @@ +name: "scanorama_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run scanorama integration followed by neighbour calculations, leiden clustering and run umap on the result." +authors: + - __merge__: /src/authors/mauro_saporita.yaml + roles: [ author ] + - __merge__: /src/authors/povilas_gibas.yaml + roles: [ author ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + default: "log_normalized" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: Neighbour calculation + arguments: + - name: "--uns_neighbors" + type: string + default: scanorama_integration_neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--obsp_neighbor_distances" + type: string + default: "scanorama_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_neighbor_connectivities" + type: string + default: "scanorama_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: "Scanorama integration options" + arguments: + - name: "--obs_batch" + type: string + description: Column name discriminating between your batches. + default: "sample_id" + required: false + - name: "--obsm_input" + type: string + description: .obsm slot that points to embedding to run scanorama on. + default: "X_pca" + - name: "--obsm_output" + type: string + description: The name of the field in adata.obsm where the integrated embeddings will be stored after running this function. Defaults to X_scanorama. + default: "X_scanorama" + - name: "--knn" + type: integer + description: "Number of nearest neighbors to use for matching." + default: 20 + - name: "--batch_size" + type: integer + description: "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory." + default: 5000 + - name: "--sigma" + type: double + description: "Correction smoothing parameter on Gaussian kernel." + default: 15 + - name: "--approx" + type: boolean + description: "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime." + default: True + - name: "--alpha" + type: double + description: "Alignment score minimum cutoff" + default: 0.1 + - name: "Clustering options" + arguments: + - name: "--obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the + resolutions specified in '--leiden_resolution'. + default: "scanorama_integration_leiden" + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + multiple: true + default: [1] + - name: "Umap options" + arguments: + - name: "--obsm_umap" + type: string + default: "X_leiden_scanorama_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +dependencies: + - name: integrate/scanorama + - name: workflows/multiomics/neighbors_leiden_umap +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow diff --git a/src/workflows/integration/scanorama_leiden/integration_test.sh b/src/workflows/integration/scanorama_leiden/integration_test.sh new file mode 100755 index 00000000..0c1c6db0 --- /dev/null +++ b/src/workflows/integration/scanorama_leiden/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/scanorama_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/integration/scanorama_leiden/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/scanorama_leiden/main.nf b/src/workflows/integration/scanorama_leiden/main.nf new file mode 100644 index 00000000..d3aabe8b --- /dev/null +++ b/src/workflows/integration/scanorama_leiden/main.nf @@ -0,0 +1,47 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | scanorama.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_input", + "obs_batch": "obs_batch", + "obsm_output": "obsm_output", + "modality": "modality", + "batch_size": "batch_size", + "sigma": "sigma", + "approx": "approx", + "alpha": "alpha", + "knn": "knn", + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_output", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/integration/scanorama_leiden/nextflow.config b/src/workflows/integration/scanorama_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/scanorama_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/scanorama_leiden/test.nf b/src/workflows/integration/scanorama_leiden/test.nf new file mode 100644 index 00000000..4803001c --- /dev/null +++ b/src/workflows/integration/scanorama_leiden/test.nf @@ -0,0 +1,82 @@ +nextflow.enable.dsl=2 + +include { scanorama_leiden } from params.rootDir + "/target/nextflow/workflows/integration/scanorama_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + leiden_resolution: [1.0, 0.25], + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + leiden_resolution: [], + ] + ]) + | map{ state -> [state.id, state] } + | scanorama_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "test_output_arg", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: "log_normalized", + leiden_resolution: [1.0, 0.25], + output: "test.h5mu" + ], + ]) + | map{ state -> [state.id, state] } + | scanorama_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith("test.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 event" + assert output_list.collect{it[0]} == ["test_output_arg"] + } +} diff --git a/src/workflows/integration/scgpt_leiden/config.vsh.yaml b/src/workflows/integration/scgpt_leiden/config.vsh.yaml new file mode 100644 index 00000000..256dca6a --- /dev/null +++ b/src/workflows/integration/scgpt_leiden/config.vsh.yaml @@ -0,0 +1,185 @@ +name: "scgpt_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result." +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ maintainer, author ] + - __merge__: /src/authors/elizabeth_mlynarski.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + type: file + required: true + description: Path to the input file. + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: False + description: | + The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. + - name: "--var_gene_names" + type: string + required: false + description: | + The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used. + - name: "--obs_batch_label" + type: string + description: | + The name of the adata obs column containing the batch labels. + - name: Model + arguments: + - name: "--model" + type: file + required: true + example: resources_test/scgpt/best_model.pt + description: | + Path to scGPT model file. + - name: "--model_vocab" + type: file + direction: input + required: true + example: resources_test/scgpt/vocab.json + description: | + Path to scGPT model vocabulary file. + - name: "--model_config" + type: file + direction: input + required: true + example: args.json + description: | + Path to scGPT model config file. + - name: "--finetuned_checkpoints_key" + type: string + required: false + example: model_state_dict + description: | + Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models. + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Output file path + example: output.h5mu + - name: "--obsm_integrated" + type: string + default: "X_scgpt" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + + - name: "Padding arguments" + arguments: + - name: "--pad_token" + type: string + default: "" + required: false + description: | + Token used for padding. + - name: "--pad_value" + type: integer + default: -2 + required: false + description: | + The value of the padding token. + + - name: "HVG subset arguments" + arguments: + - name: "--n_hvg" + type: integer + default: 1200 + description: | + Number of highly variable genes to subset for. + - name: "--hvg_flavor" + type: string + choices: ["cell_ranger", "seurat"] + default: "cell_ranger" + description: | + Method to be used for identifying highly variable genes. + Note that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`). + + - name: "Tokenization arguments" + arguments: + - name: "--max_seq_len" + type: integer + required: false + description: | + The maximum sequence length of the tokenized data. Defaults to the number of features if not provided. + - name: "Embedding arguments" + arguments: + - name: --dsbn + type: boolean + default: true + description: | + Apply domain-specific batch normalization + - name: "--batch_size" + type: integer + default: 64 + description: | + The batch size to be used for embedding inference. + + - name: "Binning arguments" + arguments: + - name: "--n_input_bins" + type: integer + default: 51 + required: False + min: 1 + description: | + The number of bins to discretize the data into; When no value is provided, data won't be binned. + - name: "--seed" + type: integer + required: false + description: | + Seed for random number generation used for binning. If not set, no seed is used. + + - name: "Clustering arguments" + arguments: + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +dependencies: + - name: scgpt/cross_check_genes + - name: scgpt/binning + - name: feature_annotation/highly_variable_features_scanpy + - name: scgpt/pad_tokenize + - name: scgpt/embedding + - name: workflows/multiomics/neighbors_leiden_umap + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/scgpt + +runners: + - type: nextflow diff --git a/src/workflows/integration/scgpt_leiden/integration_test.sh b/src/workflows/integration/scgpt_leiden/integration_test.sh new file mode 100755 index 00000000..001299c4 --- /dev/null +++ b/src/workflows/integration/scgpt_leiden/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/scgpt_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/integration/scgpt_leiden/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/scgpt_leiden/main.nf b/src/workflows/integration/scgpt_leiden/main.nf new file mode 100644 index 00000000..ac5df109 --- /dev/null +++ b/src/workflows/integration/scgpt_leiden/main.nf @@ -0,0 +1,138 @@ +workflow run_wf { + + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Annotates the mudata object with highly variable genes. + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "layer": "input_layer", + "modality": "modality", + "n_top_features": "n_hvg", + "flavor": "hvg_flavor" + ], + args: ["var_name_filter": "scgpt_filter_with_hvg"], + toState: ["input": "output"] + ) + // Check whether the genes are part of the provided vocabulary. + | cross_check_genes.run( + fromState: [ + "input": "input", + "modality": "modality", + "vocab_file": "model_vocab", + "input_var_gene_names": "var_gene_names", + "output": "output", + "pad_token": "pad_token" + ], + args: [ + "var_input": "scgpt_filter_with_hvg", + "output_var_filter": "scgpt_cross_checked_genes" + ], + toState: [ + "input": "output" + ] + ) + // Bins the data into a fixed number of bins. + | binning.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "n_input_bins": "n_input_bins", + "output": "output" + ], + args: [ + "output_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes" + ], + toState: [ + "input": "output" + ] + ) + // Padding and tokenization of gene count values. + | pad_tokenize.run( + fromState: [ + "input": "input", + "modality": "modality", + "model_vocab": "model_vocab", + "var_gene_names": "var_gene_names", + "pad_token": "pad_token", + "pad_value": "pad_value", + "max_seq_len": "max_seq_len", + "output": "output" + ], + args: [ + "input_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes", + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask" + ], + toState: [ + "input": "output" + ] + ) + // Generation of cell embedings from the tokenized gene counts values. + | embedding.run( + fromState: [ + "input": "input", + "modality": "modality", + "model": "model", + "model_vocab": "model_vocab", + "model_config": "model_config", + "var_gene_names": "var_gene_names", + "obs_batch_label": "obs_batch_label", + "pad_token": "pad_token", + "pad_value": "pad_value", + "dsbn": "dsbn", + "batch_size": "batch_size", + "obsm_embeddings": "obsm_integrated", + "finetuned_checkpoints_key": "finetuned_checkpoints_key", + "output": "output" + ], + args: [ + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask" + ], + toState: [ + "input": "output" + ] + ) + // Calculation of neighbors, leiden clustering and UMAP. + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_integrated", + "modality": "modality", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "output": "workflow_output", + "leiden_resolution": "leiden_resolution", + "obsm_umap": "obsm_integrated", + ], + toState: [ + "output": "output" + ], + args: [ + "uns_neighbors": "scGPT_integration_neighbors", + "obsp_neighbor_distances": "scGPT_integration_distances", + "obsp_neighbor_connectivities": "scGPT_integration_connectivities", + "obs_cluster": "scGPT_integration_leiden", + "obsm_umap": "X_scGPT_umap" + ] + ) + | setState(["output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/nextflow.config b/src/workflows/integration/scgpt_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/scgpt_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/scgpt_leiden/test.nf b/src/workflows/integration/scgpt_leiden/test.nf new file mode 100644 index 00000000..064cf4a5 --- /dev/null +++ b/src/workflows/integration/scgpt_leiden/test.nf @@ -0,0 +1,100 @@ +nextflow.enable.dsl=2 + +include { scgpt_leiden } from params.rootDir + "/target/nextflow/workflows/integration/scgpt_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), + model: resources_test.resolve("scgpt/source/best_model.pt"), + model_config: resources_test.resolve("scgpt/source/args.json"), + model_vocab: resources_test.resolve("scgpt/source/vocab.json"), + input_layer: "log_normalized", + obs_batch_label: "sample", + n_hvg: 400, + seed: 1, + leiden_resolution: [1.0, 0.25] + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), + model: resources_test.resolve("scgpt/source/best_model.pt"), + model_config: resources_test.resolve("scgpt/source/args.json"), + model_vocab: resources_test.resolve("scgpt/source/vocab.json"), + obs_batch_label: "sample", + n_hvg: 400, + seed: 1, + input_layer: "log_normalized", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | scgpt_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList{a, b -> a[0] <=> b[0]} + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } +} + + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "test_output_arg", + input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"), + model: resources_test.resolve("scgpt/source/best_model.pt"), + model_config: resources_test.resolve("scgpt/source/args.json"), + model_vocab: resources_test.resolve("scgpt/source/vocab.json"), + input_layer: "log_normalized", + obs_batch_label: "sample", + n_hvg: 400, + leiden_resolution: [1.0, 0.25], + output: "test.h5mu" + ], + ]) + | map{ state -> [state.id, state] } + | scgpt_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith("test.h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 1 event" + assert output_list.collect{it[0]} == ["test_output_arg"] + } + } diff --git a/src/workflows/integration/scvi_leiden/config.vsh.yaml b/src/workflows/integration/scvi_leiden/config.vsh.yaml new file mode 100644 index 00000000..553bf0f3 --- /dev/null +++ b/src/workflows/integration/scvi_leiden/config.vsh.yaml @@ -0,0 +1,149 @@ +name: "scvi_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run scvi integration followed by neighbour calculations, leiden clustering and run umap on the result." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "--output_model" + type: file + description: Folder where the state of the trained model will be saved to. + required: true + direction: output + example: output_dir/ + - name: Neighbour calculation + arguments: + - name: "--uns_neighbors" + type: string + default: scvi_integration_neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--obsp_neighbor_distances" + type: string + default: "scvi_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_neighbor_connectivities" + type: string + default: "scvi_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Scvi integration options + arguments: + - name: "--obs_batch" + type: string + description: Column name discriminating between your batches. + required: true + - name: "--obsm_output" + type: string + default: "X_scvi_integrated" + required: false + description: "In which .obsm slot to store the resulting integrated embedding." + - name: "--var_input" + type: string + required: false + description: ".var column containing highly variable genes. By default, do not subset genes." + - name: "--early_stopping" + required: false + type: boolean + description: "Whether to perform early stopping with respect to the validation set." + - name: "--early_stopping_monitor" + choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"] + default: "elbo_validation" + type: string + description: "Metric logged during validation set epoch." + - name: "--early_stopping_patience" + type: integer + min: 1 + default: 45 + description: "Number of validation epochs with no improvement after which training will be stopped." + - name: "--early_stopping_min_delta" + min: 0 + type: double + default: 0.0 + description: "Minimum change in the monitored quantity to qualify as an improvement, + i.e. an absolute change of less than min_delta, will count as no improvement." + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest." + required: false + - name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus." + type: boolean + default: True + - name: "--lr_factor" + description: "Factor to reduce learning rate." + type: double + default: 0.6 + min: 0 + - name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will be reduced." + type: double + default: 30 + min: 0 + - name: Clustering options + arguments: + - name: "--obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + default: "scvi_integration_leiden" + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: Umap options + arguments: + - name: "--obsm_umap" + type: string + default: "X_scvi_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +dependencies: + - name: integrate/scvi + - name: workflows/multiomics/neighbors_leiden_umap +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/integration/scvi_leiden/integration_test.sh b/src/workflows/integration/scvi_leiden/integration_test.sh new file mode 100755 index 00000000..d9577b5d --- /dev/null +++ b/src/workflows/integration/scvi_leiden/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/scvi_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/scvi_leiden/main.nf b/src/workflows/integration/scvi_leiden/main.nf new file mode 100644 index 00000000..463e9d97 --- /dev/null +++ b/src/workflows/integration/scvi_leiden/main.nf @@ -0,0 +1,53 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | scvi.run( + fromState: [ + "input": "input", + "obs_batch": "obs_batch", + "obsm_output": "obsm_output", + "var_input": "var_input", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + "output_model": "output_model", + "modality": "modality", + "input_layer": "layer", + ], + toState: [ + "input": "output", + "output_model": "output_model" + ], + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_output", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output", "output_model"]) + + emit: + output_ch +} diff --git a/src/workflows/integration/scvi_leiden/nextflow.config b/src/workflows/integration/scvi_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/scvi_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/scvi_leiden/test.nf b/src/workflows/integration/scvi_leiden/test.nf new file mode 100644 index 00000000..27706f29 --- /dev/null +++ b/src/workflows/integration/scvi_leiden/test.nf @@ -0,0 +1,59 @@ +nextflow.enable.dsl=2 + +include { scvi_leiden } from params.rootDir + "/target/nextflow/workflows/integration/scvi_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: null, + obs_batch: "sample_id", + max_epochs: 1, + output_model: "simple_execution_test_model/" + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + layer: null, + obs_batch: "sample_id", + output_model: "no_leiden_resolutions_test_model/", + max_epochs: 1, + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | scvi_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + // check model_output + assert state.containsKey("output_model") : "Output should contain key 'output_model'." + assert state.output_model.isDirectory() : "'output_model' should be a directory." + assert state.output_model.toString().endsWith("_model") : "Model output directory should end with '_model'. Found: ${state.output_model}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } +} + diff --git a/src/workflows/integration/totalvi_leiden/config.vsh.yaml b/src/workflows/integration/totalvi_leiden/config.vsh.yaml new file mode 100644 index 00000000..4dd69f2d --- /dev/null +++ b/src/workflows/integration/totalvi_leiden/config.vsh.yaml @@ -0,0 +1,193 @@ +name: "totalvi_leiden" +namespace: "workflows/integration" +scope: "public" +description: "Run totalVI integration followed by neighbour calculations, leiden clustering and run umap on the result." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--prot_modality" + description: Which modality to process. + type: string + default: "prot" + required: false + - name: "--reference" + alternatives: ["-r"] + type: file + description: Input h5mu file with reference data to train the TOTALVI model. + direction: input + required: true + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "--reference_model_path" + type: file + description: Directory with the reference model. If not exists, trained model will be saved there + required: false + default: "totalvi_model_reference/" + direction: output + - name: "--query_model_path" + type: file + description: Directory, where the query model will be saved + required: false + default: "totalvi_model_query/" + direction: output + - name: General TotalVI Options + arguments: + - name: "--obs_batch" + type: string + description: .Obs column name discriminating between your batches. + required: false + default: "sample_id" + - name: "--max_epochs" + type: integer + description: "Number of passes through the dataset" + required: false + default: 400 + - name: "--max_query_epochs" + type: integer + description: "Number of passes through the dataset, when fine-tuning model for query" + required: false + default: 200 + - name: "--weight_decay" + type: double + description: "Weight decay, when fine-tuning model for query" + required: false + default: 0.0 + - name: "--force_retrain" + type: boolean_true + description: If true, retrain the model and save it to reference_model_path + - name: "--var_input" + type: string + required: false + description: "Boolean .var column to subset data with (e.g. containing highly variable genes). By default, do not subset genes." + - name: TotalVI integration options RNA + arguments: + - name: "--rna_reference_modality" + type: string + default: "rna" + required: false + - name: "--rna_obsm_output" + type: string + default: "X_totalvi" + required: false + description: "In which .obsm slot to store the normalized RNA from TOTALVI." + - name: TotalVI integration options ADT + arguments: + - name: "--prot_reference_modality" + type: string + description: Name of the modality containing proteins in the reference + default: "prot" + required: false + - name: "--prot_obsm_output" + type: string + default: "X_totalvi" + required: false + description: "In which .obsm slot to store the normalized protein data from TOTALVI." + - name: Neighbour calculation RNA + arguments: + - name: "--rna_uns_neighbors" + type: string + default: totalvi_integration_neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--rna_obsp_neighbor_distances" + type: string + default: "totalvi_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--rna_obsp_neighbor_connectivities" + type: string + default: "totalvi_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Neighbour calculation ADT + arguments: + - name: "--prot_uns_neighbors" + type: string + default: totalvi_integration_neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--prot_obsp_neighbor_distances" + type: string + default: "totalvi_integration_distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--prot_obsp_neighbor_connectivities" + type: string + default: "totalvi_integration_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Clustering options RNA + arguments: + - name: "--rna_obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + default: "totalvi_integration_leiden" + - name: "--rna_leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: Clustering options ADT + arguments: + - name: "--prot_obs_cluster" + type: string + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + default: "totalvi_integration_leiden" + - name: "--prot_leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: Umap options + arguments: + - name: "--obsm_umap" + type: string + default: "X_totalvi_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +dependencies: + - name: integrate/totalvi + - name: workflows/multiomics/neighbors_leiden_umap +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu +runners: + - type: nextflow diff --git a/src/workflows/integration/totalvi_leiden/integration_test.sh b/src/workflows/integration/totalvi_leiden/integration_test.sh new file mode 100755 index 00000000..0e6ed54f --- /dev/null +++ b/src/workflows/integration/totalvi_leiden/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/integration/totalvi_leiden/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/integration/totalvi_leiden/main.nf b/src/workflows/integration/totalvi_leiden/main.nf new file mode 100644 index 00000000..dec1da23 --- /dev/null +++ b/src/workflows/integration/totalvi_leiden/main.nf @@ -0,0 +1,74 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Avoid conflict with other output arguments + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | totalvi.run( + fromState: [ + "input": "input", + "input_layer": "layer", + "obs_batch": "obs_batch", + "query_modality": "modality", + "query_proteins_modality": "prot_modality", + "query_model_path": "query_model_path", + "obsm_normalized_rna_output": "rna_obsm_output", + "obsm_normalized_protein_output": "prot_obsm_output", + "reference_model_path": "reference_model_path", + "reference_modality": "rna_reference_modality", + "reference_proteins_modality": "prot_reference_modality", + "var_input": "var_input", + "force_retrain": "force_retrain", + "weight_decay": "weight_decay", + "max_epochs": "max_epochs", + "max_query_epochs": "max_query_epochs", + "reference": "reference" + ], + toState: [ + "input": "output", + "query_model_path": "query_model_path", + "reference_model_path": "reference_model_path", + ] + ) + | neighbors_leiden_umap.run( // For gene expression + key: "rna_neighbors_leiden_umap", + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "rna_obsm_output", + "uns_neighbors": "rna_uns_neighbors", + "obsp_neighbor_distances": "rna_obsp_neighbor_distances", + "obsp_neighbor_connectivities": "rna_obsp_neighbor_connectivities", + "obs_cluster": "rna_obs_cluster", + "leiden_resolution": "rna_leiden_resolution", + "uns_neighbors": "rna_uns_neighbors", + "obsm_umap": "obsm_umap", + ], + toState: ["input": "output"], + ) + | neighbors_leiden_umap.run( // For ADT + key: "adt_neighbors_leiden_umap", + fromState: [ + "input": "input", + "modality": "prot_modality", + "obsm_input": "prot_obsm_output", + "uns_neighbors": "prot_uns_neighbors", + "obsp_neighbor_distances": "prot_obsp_neighbor_distances", + "obsp_neighbor_connectivities": "prot_obsp_neighbor_connectivities", + "obs_cluster": "prot_obs_cluster", + "leiden_resolution": "prot_leiden_resolution", + "uns_neighbors": "prot_uns_neighbors", + "obsm_umap": "obsm_umap", + "output": "workflow_output", + ], + toState: ["output": "output"], + ) + | setState(["output", "reference_model_path", "query_model_path"]) + emit: + output_ch +} diff --git a/src/workflows/integration/totalvi_leiden/nextflow.config b/src/workflows/integration/totalvi_leiden/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/integration/totalvi_leiden/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/integration/totalvi_leiden/test.nf b/src/workflows/integration/totalvi_leiden/test.nf new file mode 100644 index 00000000..a7aa7d46 --- /dev/null +++ b/src/workflows/integration/totalvi_leiden/test.nf @@ -0,0 +1,85 @@ +nextflow.enable.dsl=2 + +include { totalvi_leiden } from params.rootDir + "/target/nextflow/workflows/integration/totalvi_leiden/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + prot_modality: "prot", + prot_reference_modality: "prot", + var_input: "filter_with_hvg", + reference_model_path: "totalvi_reference_model", + query_model_path: "totalvi_query_model", + max_epochs: 1, + max_query_epochs: 1, + ], + [ + id: "no_prot_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + prot_modality: "prot", + prot_reference_modality: "prot", + var_input: "filter_with_hvg", + reference_model_path: "totalvi_reference_model", + query_model_path: "totalvi_query_model", + max_epochs: 1, + max_query_epochs: 1, + prot_leiden_resolution: [] + ], + [ + id: "no_rna_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + prot_modality: "prot", + prot_reference_modality: "prot", + var_input: "filter_with_hvg", + reference_model_path: "totalvi_reference_model", + query_model_path: "totalvi_query_model", + max_epochs: 1, + max_query_epochs: 1, + rna_leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | totalvi_leiden + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + // check reference_model + assert state.containsKey("reference_model_path") : "Output should contain key 'reference_model_path'." + assert state.reference_model_path.isDirectory() : "'reference_model_path' should be a directory." + assert state.reference_model_path.toString().endsWith("_reference_model") : "Model output directory should end with '_reference_model'. Found: ${state.reference_model_path}" + + // check query_model + assert state.containsKey("query_model_path") : "Output should contain key 'query_model_path'." + assert state.query_model_path.isDirectory() : "'query_model_path' should be a directory." + assert state.query_model_path.toString().endsWith("_query_model") : "Model output directory should end with '_query_model'. Found: ${state.query_model_path}" + + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 3 : "output channel should contain 3 events" + assert output_list.collect{it[0]} == ["no_prot_leiden_resolutions_test", "no_rna_leiden_resolutions_test", "simple_execution_test"] + } +} diff --git a/src/workflows/integration_test.R b/src/workflows/integration_test.R new file mode 100644 index 00000000..bc3aefb4 --- /dev/null +++ b/src/workflows/integration_test.R @@ -0,0 +1,76 @@ +library(tidyverse) + +workflows <- yaml::yaml.load( + system("viash ns list -q '^workflows/(?!test_workflows)'", intern = TRUE) +) + +outs <- map_df(workflows, function(wf) { + cat("Running ", wf$namespace, "/", wf$name, "\n", sep = "") + + dir <- dirname(wf$build_info$config) + tests <- wf$test_resources %>% keep(~ .$type == "nextflow_script") + + if (length(tests) == 0) { + tibble( + namespace = wf$namespace, + functionality = wf$name, + runner = "native", + test_name = "tests", + exit_code = -1L, + duration = 0L, + result = "MISSING" + ) + } else { + map_df( + tests, + function(test) { + if (file.exists(paste0(dir, "/graph.dot"))) { + file.remove(paste0(dir, "/graph.dot")) + } + args <- c( + "run", ".", + "-main-script", paste0(dir, "/", test$path), + "-entry", test$entrypoint, + "-profile", "docker,mount_temp,no_publish", + "-with-dag", paste0(dir, "/graph.dot"), + "-c", "src/workflows/utils/integration_tests.config", + "-c", "src/workflows/utils/labels_ci.config", + "-resume" + ) + + start_time <- Sys.time() + out <- processx::run( + "nextflow", + args = args, + error_on_status = FALSE, + env = c("current", NXF_VER = "24.04.4") + ) + stop_time <- Sys.time() + duration <- ceiling( + as.numeric(difftime(stop_time, start_time, unit = "sec")) + ) + result <- if (out$status > 0) "FAILED" else "SUCCESS" + if (out$status > 0) { + cat( + "========================== ERROR LOG ==========================\n", + out$stdout, + "===============================================================\n", + sep = "" + ) + } + + tibble( + namespace = wf$namespace, + functionality = wf$name, + runner = "native", + test_name = paste0(basename(test$path), "$", test$entrypoint), + exit_code = out$status, + duration = duration, + result = result + ) + } + ) + } +}) + +write_tsv(outs, ".viash_log_integration.tsv") diff --git a/src/workflows/integration_test.sh b/src/workflows/integration_test.sh new file mode 100755 index 00000000..d23ea67e --- /dev/null +++ b/src/workflows/integration_test.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +viash ns build --parallel --runner nextflow + +Rscript src/workflows/integration_test.R | tee .viash_log_integration.txt \ No newline at end of file diff --git a/src/workflows/multiomics/dimensionality_reduction/config.vsh.yaml b/src/workflows/multiomics/dimensionality_reduction/config.vsh.yaml new file mode 100644 index 00000000..b71b1644 --- /dev/null +++ b/src/workflows/multiomics/dimensionality_reduction/config.vsh.yaml @@ -0,0 +1,101 @@ +name: "dimensionality_reduction" +namespace: "workflows/multiomics" +scope: "public" +description: "Run calculations that output information required for most integration methods: PCA, nearest neighbour and UMAP." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +info: + test_dependencies: + - name: dimensionality_reduction_test + namespace: test_workflows/multiomics +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + default: "log_normalized" + type: string + description: use specified layer for expression values instead of the .X object from the modality. + required: false + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: PCA options + arguments: + - name: "--obsm_pca" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting PCA embedding." + - name: "--var_pca_feature_selection" + type: string + required: false + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + - name: "--pca_loadings_varm_output" + type: string + description: | + Name of the .varm key where the PCA loadings are stored. + - name: "--pca_variance_uns_output" + type: string + description: | + Name of the .uns key where the variance and variance ratio will be stored as a map. + The map will contain two keys: variance and variance_ratio respectively. + - name: "--pca_overwrite" + type: boolean_true + description: "Allow overwriting slots for PCA output." + - name: Neighbour calculation + arguments: + - name: "--uns_neighbors" + type: string + default: neighbors + description: In which .uns slot to store various neighbor output objects. + - name: "--obsp_neighbor_distances" + type: string + default: "distances" + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_neighbor_connectivities" + type: string + default: "connectivities" + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Umap options + arguments: + - name: "--obsm_umap" + type: string + default: "X_umap" + required: false + description: "In which .obsm slot to store the resulting UMAP embedding." +dependencies: + - name: dimred/pca + - name: workflows/multiomics/neighbors_leiden_umap +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/concat_test_data +runners: + - type: nextflow diff --git a/src/workflows/multiomics/dimensionality_reduction/integration_test.sh b/src/workflows/multiomics/dimensionality_reduction/integration_test.sh new file mode 100755 index 00000000..49dbba78 --- /dev/null +++ b/src/workflows/multiomics/dimensionality_reduction/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/dimensionality_reduction/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/multiomics/dimensionality_reduction/main.nf b/src/workflows/multiomics/dimensionality_reduction/main.nf new file mode 100644 index 00000000..94d1eb78 --- /dev/null +++ b/src/workflows/multiomics/dimensionality_reduction/main.nf @@ -0,0 +1,44 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | pca.run( + fromState: [ + "input": "input", + "obsm_output": "obsm_pca", + "var_input": "var_pca_feature_selection", + "modality": "modality", + "overwrite": "pca_overwrite", + "layer": "layer", + "varm_output": "pca_loadings_varm_output", + "uns_output": "pca_variance_uns_output", + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_pca", + "modality": "modality", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "output": "workflow_output", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"], + args: [ + "leiden_resolution": [] // disable leiden + ] + ) + | setState(["output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/multiomics/dimensionality_reduction/nextflow.config b/src/workflows/multiomics/dimensionality_reduction/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/multiomics/dimensionality_reduction/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/multiomics/dimensionality_reduction/test.nf b/src/workflows/multiomics/dimensionality_reduction/test.nf new file mode 100644 index 00000000..705a23ba --- /dev/null +++ b/src/workflows/multiomics/dimensionality_reduction/test.nf @@ -0,0 +1,60 @@ +nextflow.enable.dsl=2 + +include { dimensionality_reduction } from params.rootDir + "/target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf" +include { dimensionality_reduction_test } from params.rootDir + "/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + input_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + layer: "", + output: "foo.final.h5mu" + ], + [ + id: "pca_obsm_output_test", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + layer: "", + output: "foo.final.h5mu" + ], + ]) + | map{ state -> [state.id, state] } + | dimensionality_reduction + + assert_ch = input_ch + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + + + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["pca_obsm_output_test", "simple_execution_test"] + output_list + } + + test_ch = input_ch + | map { id , output -> [id, ["input": output.output]]} + | dimensionality_reduction_test + + +} diff --git a/src/workflows/multiomics/neighbors_leiden_umap/config.vsh.yaml b/src/workflows/multiomics/neighbors_leiden_umap/config.vsh.yaml new file mode 100644 index 00000000..a0debaa0 --- /dev/null +++ b/src/workflows/multiomics/neighbors_leiden_umap/config.vsh.yaml @@ -0,0 +1,83 @@ +name: "neighbors_leiden_umap" +namespace: "workflows/multiomics" +scope: "public" +description: "Performs neighborhood search, leiden clustering and run umap on an integrated embedding." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +info: + test_dependencies: +argument_groups: + - name: "Inputs" + arguments: + - name: "--input" + required: true + type: file + description: Path to the sample. + example: dataset.h5mu + - name: "--obsm_input" + type: string + required: true + description: The key of the embedding to use as input. + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: Neighbour calculation + arguments: + - name: "--uns_neighbors" + type: string + required: true + description: In which .uns slot to store various neighbor output objects. + - name: "--obsp_neighbor_distances" + type: string + required: true + description: "In which .obsp slot to store the distance matrix between the resulting neighbors." + - name: "--obsp_neighbor_connectivities" + type: string + required: true + description: "In which .obsp slot to store the connectivities matrix between the resulting neighbors." + - name: Clustering options + arguments: + - name: "--obs_cluster" + type: string + required: false + description: | + Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will + be created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions + resolutions specified in '--leiden_resolution'. + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + default: [1] + multiple: true + - name: Umap options + arguments: + - name: "--obsm_umap" + type: string + required: false + description: | + In which .obsm slot to store the resulting UMAP embedding. + When not specified, UMAP will not be executed. +dependencies: + - name: cluster/leiden + - name: dimred/umap + - name: neighbors/find_neighbors + - name: metadata/move_obsm_to_obs +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/multiomics/neighbors_leiden_umap/main.nf b/src/workflows/multiomics/neighbors_leiden_umap/main.nf new file mode 100644 index 00000000..8fcb912e --- /dev/null +++ b/src/workflows/multiomics/neighbors_leiden_umap/main.nf @@ -0,0 +1,68 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + if (!workflow.stubRun) { + assert (state.leiden_resolution.isEmpty() || state.obs_cluster?.trim()): + "When leiden_resolution is set, obs_cluster must also be defined." + } + [id, state] + } + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | find_neighbors.run( + fromState: [ + "input": "input", + "uns_output": "uns_neighbors", + "obsp_distances": "obsp_neighbor_distances", + "obsp_connectivities": "obsp_neighbor_connectivities", + "obsm_input": "obsm_input", + "output": "workflow_output", + "modality": "modality" + ], + toState: ["input": "output"] + ) + | leiden.run( + runIf: {id, state -> state.leiden_resolution}, + fromState: [ + "input": "input", + "obsp_connectivities": "obsp_neighbor_connectivities", + "obsm_name": "obs_cluster", + "resolution": "leiden_resolution", + "modality": "modality", + ], + toState: ["input": "output"] + ) + | move_obsm_to_obs.run( + runIf: {id, state -> state.leiden_resolution}, + fromState: [ + "input": "input", + "output": "workflow_output", + "obsm_key": "obs_cluster", + "modality": "modality", + ], + args: ["output_compression": "gzip"], + toState: ["input": "output"] + ) + | umap.run( + runIf: {id, state -> !state.obsm_umap?.trim()?.isEmpty()}, + fromState: [ + "input": "input", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsm_output": "obsm_umap", + "modality": "modality", + ], + args: ["output_compression": "gzip"], + toState: ["input": "output"] + ) + | setState(["output": "input"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/multiomics/process_batches/config.vsh.yaml b/src/workflows/multiomics/process_batches/config.vsh.yaml new file mode 100644 index 00000000..25fc07b3 --- /dev/null +++ b/src/workflows/multiomics/process_batches/config.vsh.yaml @@ -0,0 +1,174 @@ +name: "process_batches" +namespace: "workflows/multiomics" +scope: "public" +description: | + This workflow serves as an entrypoint into the 'full_pipeline' in order to + re-run the multisample processing and the integration setup. An input .h5mu file will + first be split in order to run the multisample processing per modality. Next, the modalities + are merged again and the integration setup pipeline is executed. Please note that this workflow + assumes that samples from multiple pipelines are already concatenated. +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +info: + test_dependencies: + - name: workflow_test + namespace: test_workflows/multiomics/process_batches + - name: workflow_test2 + namespace: test_workflows/multiomics/process_batches +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [-i] + description: Path to the sample. + required: true + multiple: true + example: input.h5mu + type: file + - name: "--rna_layer" + type: string + description: "Input layer for the gene expression modality. If not specified, .X is used." + required: false + - name: "--prot_layer" + type: string + description: "Input layer for the antibody capture modality. If not specified, .X is used." + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Highly variable features detection" + arguments: + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] + required: false + type: string + default: "filter_with_hvg" + description: In which .var slot to store a boolean array corresponding to the highly variable genes. + - name: "--highly_variable_features_obs_batch_key" + alternatives: [--filter_with_hvg_obs_batch_key] + type: string + default: "sample_id" + required: false + description: | + If specified, highly-variable genes are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific genes and acts as a lightweight batch correction method. + - name: "QC metrics calculation options" + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. + type: string + multiple: True + multiple_sep: ',' + required: false + default: ["filter_with_hvg"] + example: "ercc,highly_variable" + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + multiple_sep: ',' + required: false + default: [50, 100, 200, 500] + - name: "PCA options" + arguments: + - name: "--pca_overwrite" + type: boolean_true + description: "Allow overwriting slots for PCA output." + - name: "CLR options" + arguments: + - name: "--clr_axis" + type: integer + description: "Axis to perform the CLR transformation on." + default: 0 + required: false + - name: "RNA Scaling options" + description: | + Options for enabling scaling of the log-normalized data to unit variance and zero mean. + The scaled data will be output a different layer and representation with reduced dimensions + will be created and stored in addition to the non-scaled data. + arguments: + - name: "--rna_enable_scaling" + description: "Enable scaling for the RNA modality." + type: boolean_true + - name: "--rna_scaling_output_layer" + type: string + default: "scaled" + description: "Output layer where the scaled log-normalized data will be stored." + - name: "--rna_scaling_pca_obsm_output" + type: string + description: | + Name of the .obsm key where the PCA representation of the log-normalized + and scaled data is stored. + default: "scaled_pca" + - name: "--rna_scaling_pca_loadings_varm_output" + type: string + description: | + Name of the .varm key where the PCA loadings of the log-normalized and scaled + data is stored. + default: "scaled_pca_loadings" + - name: "--rna_scaling_pca_variance_uns_output" + type: string + description: | + Name of the .uns key where the variance and variance ratio will be stored as a map. + The map will contain two keys: variance and variance_ratio respectively. + default: "scaled_pca_variance" + - name: "--rna_scaling_umap_obsm_output" + type: string + description: + Name of the .obsm key where the UMAP representation of the log-normalized + and scaled data is stored. + default: "scaled_umap" + - name: "--rna_scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified, do not clip." + required: false + type: double + - name: "--rna_scaling_zero_center" + type: boolean_false + description: If set, omit zero-centering variables, which allows to handle sparse input efficiently." + +dependencies: + - name: dataflow/merge + - name: workflows/multiomics/split_modalities + alias: split_modalities_workflow + - name: workflows/prot/prot_multisample + - name: workflows/rna/rna_multisample + - name: workflows/multiomics/dimensionality_reduction + alias: dimensionality_reduction_rna + - name: workflows/multiomics/dimensionality_reduction + alias: dimensionality_reduction_prot + - name: workflows/multiomics/dimensionality_reduction + alias: dimensionality_reduction_scaling_rna +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/concat_test_data + - path: /resources_test/10x_5k_anticmv +runners: + - type: nextflow diff --git a/src/workflows/multiomics/process_batches/integration_test.sh b/src/workflows/multiomics/process_batches/integration_test.sh new file mode 100755 index 00000000..67905057 --- /dev/null +++ b/src/workflows/multiomics/process_batches/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_batches/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_batches/test.nf \ + -entry test_wf2 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/multiomics/process_batches/main.nf b/src/workflows/multiomics/process_batches/main.nf new file mode 100644 index 00000000..9da48a3e --- /dev/null +++ b/src/workflows/multiomics/process_batches/main.nf @@ -0,0 +1,227 @@ +workflow run_wf { + take: + input_ch + + main: + multisample_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // The input for this workflow can either be a list of unimodal files + // or a single multimodal file. To destingish between the two, the files will be split either way. + // For multiple unimodal files, the result before or after splitting is identical. + // In both cases, this workflow requires split files. + + // Split must be called on each item of the input list, so split it into multiple events with unique ids + // Unique ids are required to run a component + | flatMap {id, state -> + if (workflow.stubRun) { + return [[id, state]] + } + def newEvents = state.input.withIndex().collect{input_file, index -> + def newState = state + ["input": input_file, "original_id": id] + ["${id}_${index}", newState] + } + newEvents + } + | split_modalities_workflow.run( + fromState: {id, state -> + [ + "input": state.input, + "id": id + ] + }, + toState: [ + "output": "output", + "output_types": "output_types" + ] + ) + // gather the output from split_modalities_workflow + // by reading the output csv (the csv contains 1 line per output file) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + def new_id = state.original_id + "_${dat.name}" // Make a unique ID by appending the modality name. + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "original_id"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + + multisample_ch + | toSortedList() + | map{all_input -> + def ids = all_input.collect({it[0]}) + assert ids.clone().unique().size() == ids.size(): "Found duplicate modalities in the input." + } + + // + // Multisample processing + // + def multisample_arguments = [ + "rna": [ + "highly_variable_features_var_output": "highly_variable_features_var_output", + "highly_variable_features_obs_batch_key": "highly_variable_features_obs_batch_key", + "var_qc_metrics": "var_qc_metrics", + "top_n_vars": "top_n_vars", + "layer": "rna_layer", + "enable_scaling": "rna_enable_scaling", + "scaling_output_layer": "rna_scaling_output_layer", + "scaling_max_value": "rna_scaling_max_value", + "scaling_zero_center": "rna_scaling_zero_center", + ], + "prot": [ + "layer": "prot_layer", + "clr_axis": "clr_axis", + ] + ].asImmutable() + + multimodal_ch_known = multisample_ch + | runEach( + components: [rna_multisample, prot_multisample], + filter: { id, state, component -> + state.modality + "_multisample" == component.config.name + }, + fromState: { id, state, component -> + def newState = multisample_arguments.get(state.modality).collectEntries{key_, value_ -> + [key_, state[value_]] + } + newState + ["id": id, "input": state.input] + }, + toState: {id, output, state, component -> + def newState = state + ["input": output.output] + return newState + } + ) + + multimodal_ch_unknown = multisample_ch + | filter { id, state -> state.modality !in multisample_arguments.keySet() } + + multimodal_ch = multimodal_ch_unknown.mix(multimodal_ch_known) + // Remove arguments for multisample processing from state. + | map {id, state -> + def keysToRemove = multisample_arguments.inject([]){currentKeys, modality, stateMapping -> + def newKeys = currentKeys + stateMapping.values() + return newKeys + } + keysToRemove -= ["rna_enable_scaling", "rna_scaling_output_layer"] + def newState = state.findAll{it.key !in keysToRemove } + [id, newState] + } + | view {"After multisample processing: $it"} + + // + // Merging: joining the observations from all modalities together. Everything in 1 file. + // + + // Set the original IDs back into place to use them in groupTuple + | map {id, state -> + def newEvent = [state.id, state] + newEvent + } + // Group the modalities back together per input sample + | groupTuple(by: 0, sort: "hash") + | view {"After toSortedList: $it"} + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | view {"Input merge channel: $it"} + | merge.run( + fromState: ["input": "input"], + toState: ["input": "output"], + ) + | view {"After merging processing: $it"} + + // Processing of multi-modal multisample MuData files. + // Performs calculations on samples that have *not* been integrated, + // and can be considered a "no-integration" workflow. + output_ch = [dimensionality_reduction_rna, dimensionality_reduction_scaling_rna, dimensionality_reduction_prot].inject(multimodal_ch){ channel_in, component -> + channel_out_integrated = channel_in + | component.run( + runIf: {id, state -> + def reg = state.rna_enable_scaling ? ~/^dimensionality_reduction_(scaling_)?/ : ~/^dimensionality_reduction_/ + def modality_to_check = component.name - reg + state.modalities.contains(modality_to_check) + }, + fromState: { id, state -> + def stateMappings = [ + "dimensionality_reduction_rna": + [ + "id": id, + "input": state.input, + "layer": "log_normalized", + "modality": "rna", + "var_pca_feature_selection": state.highly_variable_features_var_output, // run PCA on highly variable genes only + "pca_overwrite": state.pca_overwrite, + "output": state.workflow_output, + ], + "dimensionality_reduction_scaling_rna": + [ + "id": id, + "input": state.input, + "layer": state.rna_scaling_output_layer, + "modality": "rna", + "var_pca_feature_selection": state.highly_variable_features_var_output, // run PCA on highly variable genes only + "pca_overwrite": state.pca_overwrite, + // extra scaling args + "obsm_pca": state.rna_scaling_pca_obsm_output, + "pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output, + "pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output, + "pca_overwrite": state.pca_overwrite, + "obsm_umap": state.rna_scaling_umap_obsm_output, + "uns_neighbors": "neighbors_scaled", + "obsp_neighbor_connectivities": "connectivities_scaled", + "obsp_neighbor_distances": "distances_scaled", + "output": state.workflow_output, + ], + "dimensionality_reduction_prot": + [ + "id": id, + "input": state.input, + "layer": "clr", + "modality": "prot", + "pca_overwrite": state.pca_overwrite, + "output": state.workflow_output, + ] + ] + return stateMappings[component.name] + }, + toState: ["input": "output"] + ) + } + // At the end of the reduce statement, + // the `toState` closure put the output back into + // into the 'input' slot + | map {id, state -> + [id, ["output": state.input]] + } + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/multiomics/process_batches/nextflow.config b/src/workflows/multiomics/process_batches/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/multiomics/process_batches/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/multiomics/process_batches/test.nf b/src/workflows/multiomics/process_batches/test.nf new file mode 100644 index 00000000..86918af2 --- /dev/null +++ b/src/workflows/multiomics/process_batches/test.nf @@ -0,0 +1,97 @@ +nextflow.enable.dsl=2 +targetDir = params.rootDir + "/target" + +include { process_batches } from targetDir + "/nextflow/workflows/multiomics/process_batches/main.nf" +include { workflow_test } from targetDir + "/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/main.nf" +include { workflow_test2 } from targetDir + "/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + input_ch = Channel.fromList([ + [ + id: "test", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + publish_dir: "foo/", + clr_axis: 0 + ], + [ + id: "test2", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + publish_dir: "foo/", + clr_axis: 1 + ] + ]) + | map{ state -> [state.id, state] } + | view { "Input: $it" } + | process_batches.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + + assert_ch = input_ch + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + print "output_list: $output_list" + assert output_list.size() == 2 : "output channel should contain two events" + assert output_list.collect({it[0]}).sort() == ["test", "test2"] : "First output ID should be 'test'" + } + + test_ch = input_ch + | workflow_test.run( + fromState: [ + "input": "output", + "orig_input": "orig_input" + ], + ) +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + input_ch = Channel.fromList([ + [ + input: resources_test.resolve("10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_mms.h5mu"), + pca_overwrite: true, + id: "test", + publish_dir: "foo/", + output: "test.h5mu", + ] + ]) + | map{ state -> [state.id, state] } + | view { "Input: $it" } + | process_batches.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + + assert_ch = input_ch + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file], was $output" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + print "output_list: $output_list" + assert output_list.size() == 1 : "output channel should contain two events" + assert output_list.collect({it[0]}).sort() == ["test"] : "First output ID should be 'test'" + } + + test_ch = input_ch + | workflow_test2.run( + fromState: [ + "input": "output", + "orig_input": "orig_input" + ], + ) + + +} diff --git a/src/workflows/multiomics/process_samples/config.vsh.yaml b/src/workflows/multiomics/process_samples/config.vsh.yaml new file mode 100644 index 00000000..54f7042c --- /dev/null +++ b/src/workflows/multiomics/process_samples/config.vsh.yaml @@ -0,0 +1,347 @@ +name: "process_samples" +namespace: "workflows/multiomics" +scope: "public" +description: "A pipeline to analyse multiple multiomics samples." +info: + test_dependencies: + - name: move_layer + namespace: transform + - name: remove_modality + namespace: filter +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [-i] + description: Path to the sample. + required: true + example: input.h5mu + type: file + - name: "--rna_layer" + type: string + description: "Input layer for the gene expression modality. If not specified, .X is used." + required: false + - name: "--prot_layer" + type: string + description: "Input layer for the antibody capture modality. If not specified, .X is used." + required: false + - name: "--gdo_layer" + type: string + description: "Input layer for the guide-derived oligonucleotide (GDO) data. If not specified, .X is used." + required: false + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Sample ID options" + description: | + Options for adding the id to .obs on the MuData object. Having a sample + id present in a requirement of several components for this pipeline. + arguments: + - name: "--add_id_to_obs" + description: "Add the value passed with --id to .obs." + type: boolean + default: true + - name: --add_id_obs_output + description: | + .Obs column to add the sample IDs to. Required and only used when + --add_id_to_obs is set to 'true' + type: string + default: "sample_id" + - name: "--add_id_make_observation_keys_unique" + type: boolean + description: | + Join the id to the .obs index (.obs_names). + Only used when --add_id_to_obs is set to 'true'. + default: true + - name: "RNA filtering options" + arguments: + - name: "--rna_min_counts" + example: 200 + type: integer + description: Minimum number of counts captured per cell. + - name: "--rna_max_counts" + example: 5000000 + type: integer + description: Maximum number of counts captured per cell. + + - name: "--rna_min_genes_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--rna_max_genes_per_cell" + example: 1500000 + type: integer + description: Maximum of non-zero values per cell. + + - name: "--rna_min_cells_per_gene" + example: 3 + type: integer + description: Minimum of non-zero values per gene. + + - name: "--rna_min_fraction_mito" + example: 0 + type: double + description: Minimum fraction of UMIs that are mitochondrial. + - name: "--rna_max_fraction_mito" + type: double + example: 0.2 + description: Maximum fraction of UMIs that are mitochondrial. + + - name: "--rna_min_fraction_ribo" + example: 0 + type: double + description: Minimum fraction of UMIs that are mitochondrial. + - name: "--rna_max_fraction_ribo" + type: double + example: 0.2 + description: Maximum fraction of UMIs that are mitochondrial. + + - name: "--skip_scrublet_doublet_detection" + type: boolean_true + description: Skip the scrublet doublet detection step. + + - name: "CITE-seq filtering options" + arguments: + - name: "--prot_min_counts" + description: Minimum number of counts per cell. + type: integer + example: 3 + - name: "--prot_max_counts" + description: Minimum number of counts per cell. + type: integer + example: 5000000 + + - name: "--prot_min_proteins_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--prot_max_proteins_per_cell" + description: Maximum of non-zero values per cell. + type: integer + example: 100000000 + + - name: "--prot_min_cells_per_protein" + example: 3 + type: integer + description: Minimum of non-zero values per protein. + + - name: "GDO filtering options" + arguments: + - name: "--gdo_min_counts" + description: Minimum number of counts per cell. + type: integer + example: 3 + - name: "--gdo_max_counts" + description: Minimum number of counts per cell. + type: integer + example: 5000000 + - name: "--gdo_min_guides_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--gdo_max_guides_per_cell" + description: Maximum of non-zero values per cell. + type: integer + example: 100000000 + - name: "--gdo_min_cells_per_guide" + example: 3 + type: integer + description: Minimum of non-zero values per guide. + + - name: "Highly variable features detection" + arguments: + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] + required: false + type: string + default: "filter_with_hvg" + description: In which .var slot to store a boolean array corresponding to the highly variable genes. + - name: "--highly_variable_features_obs_batch_key" + alternatives: ["--filter_with_hvg_obs_batch_key"] + type: string + default: "sample_id" + required: false + description: | + If specified, highly-variable genes are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific genes and acts as a lightweight batch correction method. + - name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - name: "--var_gene_names" + required: false + example: "gene_symbol" + type: string + description: | + .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set). + Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be + identified as mitochondrial or ribosomal genes, respectively. + - name: "--var_name_mitochondrial_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the mitochondrial genes. + - name: "--obs_name_mitochondrial_fraction" + type: string + required: false + description: | + When specified, write the fraction of counts originating from mitochondrial genes + (based on --mitochondrial_gene_regex) to an .obs column with the specified name. + Requires --var_name_mitochondrial_genes. + - name: --mitochondrial_gene_regex + type: string + description: | + Regex string that identifies mitochondrial genes from --var_gene_names. + By default will detect human and mouse mitochondrial genes from a gene symbol. + required: false + default: "^[mM][tT]-" + - name: "--var_name_ribosomal_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the ribosomal genes. + - name: "--obs_name_ribosomal_fraction" + type: string + required: false + description: | + When specified, write the fraction of counts originating from ribosomal genes + (based on --ribosomal_gene_regex) to an .obs column with the specified name. + Requires --var_name_ribosomal_genes. + - name: --ribosomal_gene_regex + type: string + description: | + Regex string that identifies ribosomal genes from --var_gene_names. + By default will detect human and mouse ribosomal genes from a gene symbol. + required: false + default: "^[Mm]?[Rr][Pp][LlSs]" + - name: "QC metrics calculation options" + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. Defaults to the combined values specified for + --var_name_mitochondrial_genes and --highly_variable_features_var_output. + type: string + multiple: True + multiple_sep: ',' + required: false + example: "ercc,highly_variable" + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + multiple_sep: ',' + required: false + default: [50, 100, 200, 500] + - name: "PCA options" + arguments: + - name: "--pca_overwrite" + type: boolean_true + description: "Allow overwriting slots for PCA output." + - name: "CLR options" + arguments: + - name: "--clr_axis" + type: integer + description: "Axis to perform the CLR transformation on." + default: 0 + required: false + - name: "RNA Scaling options" + description: | + Options for enabling scaling of the log-normalized data to unit variance and zero mean. + The scaled data will be output a different layer and representation with reduced dimensions + will be created and stored in addition to the non-scaled data. + arguments: + - name: "--rna_enable_scaling" + description: "Enable scaling for the RNA modality." + type: boolean_true + - name: "--rna_scaling_output_layer" + type: string + default: "scaled" + description: "Output layer where the scaled log-normalized data will be stored." + - name: "--rna_scaling_pca_obsm_output" + type: string + description: | + Name of the .obsm key where the PCA representation of the log-normalized + and scaled data is stored. + default: "scaled_pca" + - name: "--rna_scaling_pca_loadings_varm_output" + type: string + description: | + Name of the .varm key where the PCA loadings of the log-normalized and scaled + data is stored. + default: "scaled_pca_loadings" + - name: "--rna_scaling_pca_variance_uns_output" + type: string + description: | + Name of the .uns key where the variance and variance ratio will be stored as a map. + The map will contain two keys: variance and variance_ratio respectively. + default: "scaled_pca_variance" + - name: "--rna_scaling_umap_obsm_output" + type: string + description: + Name of the .obsm key where the UMAP representation of the log-normalized + and scaled data is stored. + default: "scaled_umap" + - name: "--rna_scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified, do not clip." + required: false + type: double + - name: "--rna_scaling_zero_center" + type: boolean_false + description: If set, omit zero-centering variables, which allows to handle sparse input efficiently." + +dependencies: + - name: metadata/add_id + - name: workflows/multiomics/split_modalities + alias: split_modalities_workflow + - name: dataflow/merge + - name: dataflow/concatenate_h5mu + - name: workflows/rna/rna_singlesample + - name: workflows/prot/prot_singlesample + - name: workflows/gdo/gdo_singlesample + - name: workflows/multiomics/process_batches +# test_dependencies: +# - name: filter/remove_modality +# - name: transform/move_layer +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - type: nextflow_script + path: test.nf + entrypoint: test_wf3 + - type: nextflow_script + path: test.nf + entrypoint: test_wf4 + - type: nextflow_script + path: test.nf + entrypoint: test_wf5 + - path: /resources_test/concat_test_data + - path: /resources_test/pbmc_1k_protein_v3 + - path: /resources_test/10x_5k_lung_crispr +runners: + - type: nextflow diff --git a/src/workflows/multiomics/process_samples/integration_test.sh b/src/workflows/multiomics/process_samples/integration_test.sh new file mode 100755 index 00000000..fb4f4512 --- /dev/null +++ b/src/workflows/multiomics/process_samples/integration_test.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +set -eo pipefail + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +# Same as above but with remote yaml file. +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ + --param_list s3://openpipelines-data/remote_param_list/test_param_list.yaml + +# Same as above but with remote json file. +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ + --param_list s3://openpipelines-data/remote_param_list/test_param_list.json + +# Same as above but with remote csv file. +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ + --param_list s3://openpipelines-data/remote_param_list/test_param_list.csv + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -profile docker,no_publish \ + -entry test_wf2 \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf3 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf4 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf5 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf6 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf7 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -main-script src/workflows/multiomics/process_samples/test.nf \ + -entry test_wf8 \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ No newline at end of file diff --git a/src/workflows/multiomics/process_samples/main.nf b/src/workflows/multiomics/process_samples/main.nf new file mode 100644 index 00000000..7bacc1c2 --- /dev/null +++ b/src/workflows/multiomics/process_samples/main.nf @@ -0,0 +1,254 @@ +workflow run_wf { + take: + input_ch + + main: + modalities_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // If requested to be detected, make sure the mitochondrial and ribosomal genes + // are added to the input of the qc metrics calculation + | map {id, state -> + def var_qc_default = [state.highly_variable_features_var_output] + if (state.var_name_mitochondrial_genes) { + var_qc_default.add(state.var_name_mitochondrial_genes) + } + if (state.var_name_ribosomal_genes) { + var_qc_default.add(state.var_name_ribosomal_genes) + } + def newState = state + ["var_qc_metrics": var_qc_default.join(",")] + [id, newState] + } + // If requested, add the id of the events (samples) a column in .obs. + // Also allows to make .obs_names (the .obs index) unique, by prefixing the values with an unique id per .h5mu file. + // The latter is usefull to avoid duplicate observations during concatenation. + | add_id.run( + filter: {id, state -> state.add_id_to_obs }, + fromState: {id, state -> + def newState = [ + "input": state.input, + "input_id": id, + "make_observation_keys_unique": state.add_id_make_observation_keys_unique, + "obs_output": state.add_id_obs_output, + "add_id_to_obs": state.add_id_to_obs + ] + newState + }, + toState: {id, output, state -> + def keysToRemove = ["add_id_to_obs", "add_id_obs_output", "add_id_make_observation_keys_unique"] + def newState = state.findAll{it.key !in keysToRemove} + newState + ["input": output.output] + } + ) + | split_modalities_workflow.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + + // + // Singlesample processing + // + def singlesample_arguments = [ + "rna": [ + "min_counts": "rna_min_counts", + "max_counts": "rna_max_counts", + "min_genes_per_cell": "rna_min_genes_per_cell", + "max_genes_per_cell": "rna_max_genes_per_cell", + "min_cells_per_gene": "rna_min_cells_per_gene", + "min_fraction_mito": "rna_min_fraction_mito", + "max_fraction_mito": "rna_max_fraction_mito", + "var_name_mitochondrial_genes": "var_name_mitochondrial_genes", + "obs_name_mitochondrial_fraction": "obs_name_mitochondrial_fraction", + "var_gene_names": "var_gene_names", + "mitochondrial_gene_regex": "mitochondrial_gene_regex", + "min_fraction_ribo": "rna_min_fraction_ribo", + "max_fraction_ribo": "rna_max_fraction_ribo", + "var_name_ribosomal_genes": "var_name_ribosomal_genes", + "obs_name_ribosomal_fraction": "obs_name_ribosomal_fraction", + "ribosomal_gene_regex": "ribosomal_gene_regex", + "layer": "rna_layer", + "skip_scrublet_doublet_detection": "skip_scrublet_doublet_detection" + ], + "prot": [ + "min_counts": "prot_min_counts", + "max_counts": "prot_max_counts", + "min_proteins_per_cell": "prot_min_proteins_per_cell", + "max_proteins_per_cell": "prot_max_proteins_per_cell", + "min_cells_per_protein": "prot_min_cells_per_protein", + "layer": "prot_layer", + ], + "gdo": [ + "min_counts": "gdo_min_counts", + "max_counts": "gdo_max_counts", + "min_guides_per_cell": "gdo_min_guides_per_cell", + "max_guides_per_cell": "gdo_max_guides_per_cell", + "min_cells_per_guide": "gdo_min_cells_per_guide", + "layer": "gdo_layer", + ], + ].asImmutable() + + multisample_ch_known = modalities_ch + // run the singlesample processing + | runEach( + components: [rna_singlesample, prot_singlesample, gdo_singlesample], + filter: { id, state, component -> + state.modality + "_singlesample" == component.config.name + }, + fromState: { id, state, component -> + def newState = singlesample_arguments.get(state.modality).collectEntries{key_, value_ -> + [key_, state[value_]] + } + return newState + ["id": id, "input": state.input] + }, + toState: ["input": "output"], + ) + + multisample_ch_unknown = modalities_ch + | filter{id, state -> state.modality !in singlesample_arguments.keySet()} + + output_ch = multisample_ch_unknown.mix(multisample_ch_known) + // Remove arguments for singlesample processing from state. + | map {id, state -> + def keysToRemove = singlesample_arguments.inject([]){currentKeys, modality, stateMapping -> + currentKeys += stateMapping.values() + } + def allwayskeep = ["gdo_layer", "rna_layer", "prot_layer", "workflow_output"] + def newState = state.findAll{(it.key !in keysToRemove + ["id"]) || (it.key in allwayskeep)} + [id, newState] + } + | view {"After singlesample processing: $it"} + + // + // Concatenation: join observations across samples together per modality. + // + // Concatenate multiple single-sample unimodal MuData objects back into several multi-sample files. + // One multi-sample MuData file is created per modality. + // + | map { id, state -> // Put modality name in first element so that we can group on it + [state.modality, id, state] + } + | groupTuple(by: 0, sort: "hash") + | view {"After groupTuple: $it"} + | map { modality, old_ids, states -> + def new_id = "combined_$modality" + // keys in the new state that should not have a unique value across samples + def new_state_non_unique_values = [ + "input": states.collect{it.input}, + "input_id": old_ids, + "_meta": ["join_id": old_ids[0]] + ] + // Gather the keys from the different states, + // one state might contain more keys compared to another (so create a set) + def all_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input_id", "input", "_meta"]) + // Create the new state from the keys, values should be the same across samples + def new_state = all_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across samples. Argument name: $argument_name, \ + argument value: $argument_values" + // take the unique value from the set (there is only one) + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + def final_state = new_state_non_unique_values + new_state + [new_id, final_state] + } + | concatenate_h5mu.run( + fromState: [ + "input": "input", + "input_id": "input_id" + ], + toState: {id, output, state -> + def keysToRemove = ["input_id"] + def newState = state.findAll{it.key !in keysToRemove} + newState + ["input": output.output] + }, + ) + + | view {"After concatenation: $it"} + | toSortedList() + | map {modalities_states -> + def states = modalities_states.collect{it[1]} + def new_input = states.collect{it.input} + def join_id = states[0]._meta.join_id + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "_meta"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + ["merged", new_state + ["input": new_input, "_meta": ["join_id": join_id]]] + } + | process_batches.run( + fromState: {id, state -> + [ + "id": id, + "input": state.input, + "output": state.workflow_output, + "highly_variable_features_var_output": state.highly_variable_features_var_output, + "highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key, + "var_qc_metrics": state.var_qc_metrics, + "top_n_vars": state.top_n_vars, + "pca_overwrite": state.pca_overwrite, + "rna_layer": state.rna_layer, + "prot_layer": state.prot_layer, + "clr_axis": state.clr_axis, + "rna_enable_scaling": state.rna_enable_scaling, + "rna_scaling_output_layer": state.rna_scaling_output_layer, + "rna_scaling_pca_obsm_output": state.rna_scaling_pca_obsm_output, + "rna_scaling_pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output, + "rna_scaling_pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output, + "rna_scaling_umap_obsm_output": state.rna_scaling_umap_obsm_output, + "rna_scaling_max_value": state.rna_scaling_max_value, + "rna_scaling_zero_center": state.rna_scaling_zero_center, + ] + }, + toState: {id, output, state -> + [ + "output": output.output, + "_meta": state._meta, + ] + } + ) + | view {"After process_batches: $it"} + + emit: + output_ch +} diff --git a/src/workflows/multiomics/process_samples/nextflow.config b/src/workflows/multiomics/process_samples/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/multiomics/process_samples/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/multiomics/process_samples/test.nf b/src/workflows/multiomics/process_samples/test.nf new file mode 100644 index 00000000..2a9d019d --- /dev/null +++ b/src/workflows/multiomics/process_samples/test.nf @@ -0,0 +1,398 @@ +nextflow.enable.dsl=2 +targetDir = params.rootDir + "/target/nextflow" + +include { remove_modality } from targetDir + '/filter/remove_modality/main.nf' +include { move_layer } from targetDir + '/transform/move_layer/main.nf' +include { process_samples } from targetDir + "/workflows/multiomics/process_samples/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "mouse", + input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + output: "test.h5mu", + ], + [ + id: "human", + input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + output: "test.h5mu", + + ] + ]) + | map{ state -> [state.id, state] } + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith("test.h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } + +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "pbmc", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + var_name_mitochondrial_genes: 'mitochondrial', + var_name_ribosomal_genes: 'ribosomal', + rna_min_counts: 2, + prot_min_counts: 3, + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id" + ], + [ + id: "pbmc_with_more_args", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + rna_min_counts: 2, + rna_max_counts: 1000000, + rna_min_genes_per_cell: 1, + rna_max_genes_per_cell: 1000000, + rna_min_cells_per_gene: 1, + rna_min_fraction_mito: 0.0, + rna_max_fraction_mito: 1.0, + prot_min_counts: 3, + prot_max_counts: 1000000, + prot_min_proteins_per_cell: 1, + prot_max_proteins_per_cell: 1000000, + prot_min_cells_per_protein: 1, + var_name_mitochondrial_genes: 'mitochondrial', + obs_name_mitochondrial_fraction: 'fraction_mitochondrial', + var_name_ribosomal_genes: 'ribosomal', + obs_name_ribosomal_fraction: 'fraction_ribosomal', + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id" + ], + ]) + | map{ state -> [state.id, state] } + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + // The result of this pipeline is always 1 merged sample, regardless of the number of input samples. + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } +} + +workflow test_wf3 { + + resources_test = file(params.resources_test) + + input_ch = Channel.fromList([ + [ + id: "mouse", + input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + var_qc_metrics: "highly_variable", + highly_variable_features_var_output: "highly_variable", + ], + [ + id: "human", + input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + var_qc_metrics: "highly_variable", + highly_variable_features_var_output: "highly_variable", + ] + ]) + | map{ state -> [state.id, state] } + + human_ch = input_ch + | filter{it[0] == "human"} + | remove_modality.run( + fromState: { id, state -> + [ + "input": state.input, + "modality": "atac" + ] + }, + toState: ["input": "output"] + ) + + mouse_ch = input_ch + | filter{it[0] == "mouse"} + | remove_modality.run( + fromState: { id, state -> + [ + "input": state.input, + "modality": "rna" + ] + }, + toState: ["input": "output"] + ) + + output_ch_test_2 = human_ch.concat(mouse_ch) + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } +} + +workflow test_wf4 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "10x_5k_lung_crispr", + input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"), + rna_min_counts: 2, + gdo_min_counts: 10, + gdo_max_counts: 10000, + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id" + ] + ]) + | map{ state -> [state.id, state] } + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + // The result of this pipeline is always 1 merged sample, regardless of the number of input samples. + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } +} + +workflow test_wf5 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "5k_human_antiCMV_T_TBNK_select_layer", + input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"), + rna_min_counts: 2, + rna_max_counts: 1000000, + rna_min_genes_per_cell: 1, + rna_max_genes_per_cell: 1000000, + rna_min_cells_per_gene: 1, + rna_min_fraction_mito: 0.0, + rna_max_fraction_mito: 1.0, + rna_min_fraction_ribo: 0.0, + rna_max_fraction_ribo: 1.0, + var_name_mitochondrial_genes: 'mitochondrial', + obs_name_mitochondrial_fraction: 'fraction_mitochondrial', + var_name_ribosomal_genes: 'ribosomal', + obs_name_ribosomal_fraction: 'fraction_ribosomal', + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id", + rna_layer: "test_layer" + ], + ]) + | map{ state -> [state.id, state] } + | move_layer.run( + fromState: { id, state -> + [ + "input": state.input, + "output_layer": "test_layer", + "modality": "rna" + ] + }, + toState: ["input": "output"] + ) + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + // The result of this pipeline is always 1 merged sample, regardless of the number of input samples. + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } +} + +workflow test_wf6 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "pbmc_clr_axis_0", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + clr_axis: 1 + ], + [ + id: "pbmc_clr_axis_1", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + clr_axis: 1 + ] + ]) + | map{ state -> [state.id, state] } + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } + +} + + +// The following test is supposed to fail. It is used to test the error handling of the pipeline. +// However, there is not way to catch the error originating from the workflow + +// workflow test_wf4 { + +// helpMessage(config) + +// // allow changing the resources_test dir +// params.resources_test = params.rootDir + "/resources_test" + + +// testParams = [ +// param_list: [ +// [ +// id: "pbmc", +// input: params.resources_test + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", +// output: "foo.h5mu" +// ], +// [ +// id: "pbmc_with_more_args", +// input: params.resources_test + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", +// output: "foo2.h5mu" +// ], +// ], + +// ] + +// output_ch = channelFromParams(testParams, config) +// | run_wf +// } + +workflow test_wf7 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "5k_human_antiCMV_T_TBNK_select_layer", + input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"), + rna_min_counts: 2, + rna_max_counts: 1000000, + rna_min_genes_per_cell: 1, + rna_max_genes_per_cell: 1000000, + rna_min_cells_per_gene: 1, + rna_min_fraction_mito: 0.0, + rna_max_fraction_mito: 1.0, + rna_min_fraction_ribo: 0.0, + rna_max_fraction_ribo: 1.0, + rna_enable_scaling: true, + var_name_mitochondrial_genes: 'mitochondrial', + obs_name_mitochondrial_fraction: 'fraction_mitochondrial', + var_name_ribosomal_genes: 'ribosomal', + obs_name_ribosomal_fraction: 'fraction_ribosomal', + add_id_to_obs: true, + add_id_make_observation_keys_unique: true, + add_id_obs_output: "sample_id", + rna_layer: "test_layer", + + ], + ]) + | map{ state -> [state.id, state] } + | move_layer.run( + fromState: { id, state -> + [ + "input": state.input, + "output_layer": "test_layer", + "modality": "rna" + ] + }, + toState: ["input": "output"] + ) + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + // The result of this pipeline is always 1 merged sample, regardless of the number of input samples. + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } +} + +workflow test_wf8 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "mouse", + input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + output: "test.h5mu", + skip_scrublet_doublet_detection: true + ], + [ + id: "human", + input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + publish_dir: "foo/", + rna_min_counts: 2, + output: "test.h5mu", + skip_scrublet_doublet_detection: true + ] + ]) + | map{ state -> [state.id, state] } + | process_samples + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith("test.h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "merged" : "Output ID should be 'merged'" + } + +} \ No newline at end of file diff --git a/src/workflows/multiomics/split_h5mu/config.vsh.yaml b/src/workflows/multiomics/split_h5mu/config.vsh.yaml new file mode 100644 index 00000000..d5b0ef1f --- /dev/null +++ b/src/workflows/multiomics/split_h5mu/config.vsh.yaml @@ -0,0 +1,72 @@ +name: "split_h5mu" +namespace: "workflows/multiomics" +scope: "private" +description: "Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. " +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +info: + test_dependencies: + - name: split_h5mu_test + namespace: test_workflows/multiomics +argument_groups: +- name: Input & specifications + arguments: + - name: "--input" + type: file + description: Path to a single .h5mu file. + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--obs_feature" + type: string + required: true + description: The .obs column to split the mudata on. + example: "celltype" + - name: "--drop_obs_nan" + type: boolean_true + description: Whether to drop all .obs columns that contain only nan values after splitting. + - name: "--ensure_unique_filenames" + type: boolean_true + description: Append number suffixes to ensure unique filenames after sanitizing obs feature values. + +- name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + example: "/path/to/output" + description: Output directory containing multiple h5mu files. + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + - name: "--output_files" + type: file + required: true + direction: output + example: sample_files.csv + description: A csv containing the base filename and obs feature by which it was split. +dependencies: + - name: dataflow/split_h5mu + alias: split_h5mu_component +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu +runners: + - type: nextflow diff --git a/src/workflows/multiomics/split_h5mu/integration_test.sh b/src/workflows/multiomics/split_h5mu/integration_test.sh new file mode 100755 index 00000000..b8f7ccd0 --- /dev/null +++ b/src/workflows/multiomics/split_h5mu/integration_test.sh @@ -0,0 +1,13 @@ +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/split_h5mu/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/multiomics/split_h5mu/main.nf b/src/workflows/multiomics/split_h5mu/main.nf new file mode 100644 index 00000000..520f9c2d --- /dev/null +++ b/src/workflows/multiomics/split_h5mu/main.nf @@ -0,0 +1,62 @@ +process splitStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("stub_h5mus"), path("samples.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir stub_h5mus + touch stub_h5mus/${id}_sample_1.h5mu + touch stub_h5mus/${id}_sample_2.h5mu + touch stub_h5mus/${id}_sample_3.h5mu + echo -e "name,filename\nsample_1,stub_h5mus/${id}_sample_1.h5mu\nsample_2,stub_h5mus/${id}_sample_2.h5mu\nsample_3,stub_h5mus/${id}_sample_3.h5mu" > samples.csv + """ +} + +workflow run_wf { + take: + input_ch + + main: + split_ch = input_ch + | split_h5mu_component.run( + filter: {!workflow.stubRun}, + fromState: [ + "input": "input", + "output_compression": "output_compression", + "output_files": "output_files", + "modality": "modality", + "obs_feature": "obs_feature", + "drop_obs_nan": "drop_obs_nan", + "ensure_unique_filenames": "ensure_unique_filenames" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ] + ) + + split_stub_ch = input_ch + | filter{workflow.stubRun} + // This is not a build viash component, so we cannot use + // fromState or toState functionality + | map {id, state -> [id, state.input]} + | splitStub + | map {id, output, output_files -> + [id, ["output": output, "output_files": output_files]] + } + + output_ch = split_ch.concat(split_stub_ch) + | setState(["output", "output_files"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/multiomics/split_h5mu/nextflow.config b/src/workflows/multiomics/split_h5mu/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/multiomics/split_h5mu/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/multiomics/split_h5mu/test.nf b/src/workflows/multiomics/split_h5mu/test.nf new file mode 100644 index 00000000..98e36fae --- /dev/null +++ b/src/workflows/multiomics/split_h5mu/test.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +include { split_h5mu } from params.rootDir + "/target/_private/nextflow/workflows/multiomics/split_h5mu/main.nf" +include { split_h5mu_test } from params.rootDir + "/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "split_test", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + publish_dir: "foo/", + obs_feature: "sample_id", + output: "samples", + output_files: "samples.csv" + ] + ]) + | map { state -> [state.id, state]} + | split_h5mu.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + "Output: $output" + } + + | split_h5mu_test.run( + fromState: [ + "input": "output_files", + "samples_dir": "output", + "orig_input": "orig_input", + ] + ) +} diff --git a/src/workflows/multiomics/split_modalities/config.vsh.yaml b/src/workflows/multiomics/split_modalities/config.vsh.yaml new file mode 100644 index 00000000..196dacf5 --- /dev/null +++ b/src/workflows/multiomics/split_modalities/config.vsh.yaml @@ -0,0 +1,58 @@ +name: "split_modalities" +namespace: "workflows/multiomics" +scope: "private" +description: "A pipeline to split a multimodal mudata files into several unimodal mudata files." +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +info: + test_dependencies: + - name: split_modalities_test + namespace: test_workflows/multiomics +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [-i] + description: Path to the sample. + required: true + example: input.h5mu + type: file + - name: "Outputs" + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + required: true + direction: output + example: "/path/to/output" + description: Output directory containing multiple h5mu files. + - name: "--output_types" + type: file + required: true + direction: output + example: types.csv + description: A csv containing the base filename and modality type per output file. +dependencies: + - name: dataflow/split_modalities + alias: split_modalities_component +# test_dependencies: +# - name: filter/remove_modality +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu +runners: + - type: nextflow diff --git a/src/workflows/multiomics/split_modalities/integration_test.sh b/src/workflows/multiomics/split_modalities/integration_test.sh new file mode 100755 index 00000000..eaecdca1 --- /dev/null +++ b/src/workflows/multiomics/split_modalities/integration_test.sh @@ -0,0 +1,13 @@ +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/split_modalities/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/multiomics/split_modalities/main.nf b/src/workflows/multiomics/split_modalities/main.nf new file mode 100644 index 00000000..9a174e98 --- /dev/null +++ b/src/workflows/multiomics/split_modalities/main.nf @@ -0,0 +1,55 @@ +process splitStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("stub_h5mus"), path("modalities.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir stub_h5mus + touch stub_h5mus/${id}_vdj.h5mu + touch stub_h5mus/${id}_rna.h5mu + touch stub_h5mus/${id}_prot.h5mu + echo -e "name,filename\nrna,stub_h5mus/${id}_rna.h5mu\nprot,stub_h5mus/${id}_prot.h5mu\nvdj,stub_h5mus/${id}_vdj.h5mu" > modalities.csv + """ +} + +workflow run_wf { + // Split multimodal MuData files into several unimodal MuData files. + take: + input_ch + + main: + split_ch = input_ch + | split_modalities_component.run( + filter: {!workflow.stubRun}, + fromState: ["input": "input"], + toState: [ + "output": "output", + "output_types": "output_types" + ] + ) + + split_stub_ch = input_ch + | filter{workflow.stubRun} + // This is not a build viash component, so we cannot use + // fromState or toState functionality + | map {id, state -> [id, state.input]} + | splitStub + | map {id, output, output_types -> + [id, ["output": output, "output_types": output_types]] + } + + output_ch = split_ch.concat(split_stub_ch) + | setState(["output", "output_types"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/multiomics/split_modalities/nextflow.config b/src/workflows/multiomics/split_modalities/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/multiomics/split_modalities/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/multiomics/split_modalities/test.nf b/src/workflows/multiomics/split_modalities/test.nf new file mode 100644 index 00000000..480cc610 --- /dev/null +++ b/src/workflows/multiomics/split_modalities/test.nf @@ -0,0 +1,37 @@ +nextflow.enable.dsl=2 + +include { split_modalities } from params.rootDir + "/target/_private/nextflow/workflows/multiomics/split_modalities/main.nf" +include { split_modalities_test } from params.rootDir + "/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "mouse", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + publish_dir: "foo/", + output: "modalities", + output_types: "types.csv" + ] + ]) + | map { state -> [state.id, state]} + | split_modalities.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + "Output: $output" + } + + | split_modalities_test.run( + fromState: [ + "input": "output_types", + "mod_dir": "output", + "orig_input": "orig_input", + ] + ) +} diff --git a/src/workflows/prot/prot_multisample/config.vsh.yaml b/src/workflows/prot/prot_multisample/config.vsh.yaml new file mode 100644 index 00000000..3770dfcb --- /dev/null +++ b/src/workflows/prot/prot_multisample/config.vsh.yaml @@ -0,0 +1,124 @@ +name: "prot_multisample" +namespace: "workflows/prot" +scope: "public" +description: "Processing unimodal multi-sample ADT data." +info: + image: /images/concepts/fig_workflow_multiomics_adt_multisample.svg + test_dependencies: +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the concatenated file + example: concatenated + - name: "--input" + required: true + type: file + description: Path to the samples. + example: dataset.h5mu + - name: "--layer" + type: string + description: "Input layer to use. If not specified, .X is used." + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "QC metrics calculation options" + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. Defaults to the value from + --var_name_mitochondrial_genes. + type: string + multiple: True + multiple_sep: ',' + required: false + example: "ercc,highly_variable" + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + multiple_sep: ',' + required: false + default: [50, 100, 200, 500] + - name: "--output_obs_num_nonzero_vars" + description: | + Name of column in .obs describing, for each observation, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each row the number of columns that contain data. + type: string + required: false + default: "num_nonzero_vars" + - name: "--output_obs_total_counts_vars" + description: | + Name of the column for .obs describing, for each observation (row), + the sum of the stored values in the columns. + type: string + required: false + default: total_counts + - name: "--output_var_num_nonzero_obs" + description: | + Name of column describing, for each feature, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each column the number of rows that contain data. + type: string + required: false + default: "num_nonzero_obs" + - name: "--output_var_total_counts_obs" + description: | + Name of the column in .var describing, for each feature (column), + the sum of the stored values in the rows. + type: string + required: false + default: total_counts + - name: "--output_var_obs_mean" + type: string + description: | + Name of the column in .obs providing the mean of the values in each row. + default: "obs_mean" + required: false + - name: "--output_var_pct_dropout" + type: string + default: "pct_dropout" + description: | + Name of the column in .obs providing for each feature the percentage of + observations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs` + but percentage based. + - name: "CLR arguments" + arguments: + - name: "--clr_axis" + type: integer + description: Axis across which CLR is performed. + default: 0 + required: false +dependencies: + - name: transform/clr + - name: workflows/qc/qc + alias: prot_qc +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/prot/prot_multisample/integration_test.sh b/src/workflows/prot/prot_multisample/integration_test.sh new file mode 100755 index 00000000..46633a8f --- /dev/null +++ b/src/workflows/prot/prot_multisample/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/prot/prot_multisample/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/prot/prot_multisample/main.nf b/src/workflows/prot/prot_multisample/main.nf new file mode 100644 index 00000000..ac037313 --- /dev/null +++ b/src/workflows/prot/prot_multisample/main.nf @@ -0,0 +1,52 @@ +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | clr.run( + fromState: [ + "input": "input", + "input_layer": "layer", + "clr_axis": "clr_axis", + ], + toState: ["input": "output"], + args: [ + output_layer: "clr", + modality: "prot" + ] + ) + | prot_qc.run( + // TODO: remove when viash 0.8.3 is released + key: "prot_qc", + fromState: { id, state -> + def newState = [ + "id": id, + "output": state.workflow_output, + "input": state.input, + "top_n_vars": state.top_n_vars, + "var_qc_metrics": null, + "input_layer": state.layer, // Use the non-transformed layer + "modality": "prot", + "var_name_mitochondrial_genes": null, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "num_nonzeoutput_var_num_nonzero_obsro_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.pct_dropout + ] + newState + } + ) + | setState(["output"]) + + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/prot/prot_multisample/nextflow.config b/src/workflows/prot/prot_multisample/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/prot/prot_multisample/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/prot/prot_multisample/test.nf b/src/workflows/prot/prot_multisample/test.nf new file mode 100644 index 00000000..9a35e0bd --- /dev/null +++ b/src/workflows/prot/prot_multisample/test.nf @@ -0,0 +1,39 @@ +nextflow.enable.dsl=2 + +include { prot_multisample } from params.rootDir + "/target/nextflow/workflows/prot/prot_multisample/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "adt_samples_axis_0", + sample_id: "pbmc", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + clr_axis: 0 + ], + [ + id: "adt_samples_axis_1", + sample_id: "pbmc", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + clr_axis: 1 + ] + ]) + | map{ state -> [state.id, state] } + | prot_multisample + | view { output -> + assert output.size() == 2 : "outputs should contain three elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + print "output_list: $output_list" + assert output_list.size() == 2 : "output channel should contain two events" + assert output_list.collect({it[0]}).sort() == ["adt_samples_axis_0", "adt_samples_axis_1"] : "Output IDs should be [adt_samples_axis_0, adt_samples_axis_1]" + } + +} diff --git a/src/workflows/prot/prot_singlesample/config.vsh.yaml b/src/workflows/prot/prot_singlesample/config.vsh.yaml new file mode 100644 index 00000000..5b31e9c8 --- /dev/null +++ b/src/workflows/prot/prot_singlesample/config.vsh.yaml @@ -0,0 +1,79 @@ +name: "prot_singlesample" +namespace: "workflows/prot" +scope: "public" +description: "Processing unimodal single-sample CITE-seq data." +info: + image: /images/concepts/fig_workflow_multiomics_adt_singlesample.svg + test_dependencies: +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "Input" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + type: file + required: true + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + type: string + required: false + description: "Input layer to start from. By default, .X will be used." + - name: "Output" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Filtering options" + arguments: + - name: "--min_counts" + example: 200 + type: integer + description: Minimum number of counts captured per cell. + - name: "--max_counts" + example: 5000000 + type: integer + description: Maximum number of counts captured per cell. + + - name: "--min_proteins_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--max_proteins_per_cell" + example: 1500000 + type: integer + description: Maximum of non-zero values per cell. + + - name: "--min_cells_per_protein" + example: 3 + type: integer + description: Minimum of non-zero values per gene. +dependencies: + - name: filter/filter_with_counts + - name: filter/do_filter +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/prot/prot_singlesample/integration_test.sh b/src/workflows/prot/prot_singlesample/integration_test.sh new file mode 100755 index 00000000..22f68c0d --- /dev/null +++ b/src/workflows/prot/prot_singlesample/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/multiomics/prot_singlesample/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/prot/prot_singlesample/main.nf b/src/workflows/prot/prot_singlesample/main.nf new file mode 100644 index 00000000..e26718d4 --- /dev/null +++ b/src/workflows/prot/prot_singlesample/main.nf @@ -0,0 +1,52 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // filtering + | filter_with_counts.run( + key: "prot_filter_with_counts", + fromState: { id, state -> + def newState = [ + "input": state.input, + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_proteins_per_cell, + "max_genes_per_cell": state.max_proteins_per_cell, + "min_cells_per_gene": state.min_cells_per_protein, + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "modality": "prot", + "layer": state.layer, + ] + newState + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "prot_do_filter", + fromState : { id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def newState = [ + "input": state.input, + "obs_filter": "filter_with_counts", + "modality": "prot", + "var_filter": "filter_with_counts", + "output_compression": "gzip", + "output": state.workflow_output + ] + return newState + }, + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/prot/prot_singlesample/nextflow.config b/src/workflows/prot/prot_singlesample/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/prot/prot_singlesample/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/prot/prot_singlesample/test.nf b/src/workflows/prot/prot_singlesample/test.nf new file mode 100644 index 00000000..e2d45696 --- /dev/null +++ b/src/workflows/prot/prot_singlesample/test.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +include { prot_singlesample } from params.rootDir + "/target/nextflow/workflows/prot/prot_singlesample/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "foo", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + min_counts: 3, + max_counts: 100000, + min_genes_per_cell: 2, + max_genes_per_cell: 10000, + min_cells_per_gene: 10, + min_fraction_mito: 0.2, + max_fraction_mito: 0.8, + output: "foo.final.h5mu", + ] + ]) + | map{ state -> [state.id, state] } + | prot_singlesample + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}" + "Output: $output" + } + | toSortedList() + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + assert output_list[0][0] == "foo" : "Output ID should be same as input ID" + assert (output_list.collect({it[1].output.getFileName().toString()}) as Set).equals(["foo.final.h5mu"] as Set) + } +} diff --git a/src/workflows/qc/qc/config.vsh.yaml b/src/workflows/qc/qc/config.vsh.yaml new file mode 100644 index 00000000..4dbe6c1c --- /dev/null +++ b/src/workflows/qc/qc/config.vsh.yaml @@ -0,0 +1,172 @@ +name: "qc" +namespace: "workflows/qc" +scope: "public" +description: "A pipeline to add basic qc statistics to a MuData " +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +info: + test_dependencies: + - name: qc_test + namespace: test_workflows/qc +argument_groups: + - name: Inputs + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + alternatives: [-i] + description: Path to the sample. + required: true + example: input.h5mu + type: file + - name: "--modality" + description: Which modality to process. + type: string + default: "rna" + required: false + - name: "--layer" + description: "Layer to calculate qc metrics for." + type: string + example: "raw_counts" + required: false + - name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - name: "--var_gene_names" + required: false + example: "gene_symbol" + type: string + description: | + .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set). + Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be + identified as mitochondrial or ribosomal genes, respectively. + - name: "--var_name_mitochondrial_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the mitochondrial genes. + - name: "--obs_name_mitochondrial_fraction" + type: string + required: false + description: | + .Obs slot to store the fraction of reads found to be mitochondrial. Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes + - name: --mitochondrial_gene_regex + type: string + description: | + Regex string that identifies mitochondrial genes from --var_gene_names. + By default will detect human and mouse mitochondrial genes from a gene symbol. + required: false + default: "^[mM][tT]-" + - name: "--var_name_ribosomal_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the ribosomal genes. + - name: "--obs_name_ribosomal_fraction" + type: string + required: false + description: | + When specified, write the fraction of counts originating from ribosomal genes + (based on --ribosomal_gene_regex) to an .obs column with the specified name. + Requires --var_name_ribosomal_genes. + - name: --ribosomal_gene_regex + type: string + description: | + Regex string that identifies ribosomal genes from --var_gene_names. + By default will detect human and mouse ribosomal genes from a gene symbol. + required: false + default: "^[Mm]?[Rr][Pp][LlSs]" + - name: "QC metrics calculation options" + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. Defaults to the value from + --var_name_mitochondrial_genes. + type: string + multiple: True + multiple_sep: ',' + required: false + example: "ercc,highly_variable" + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + multiple_sep: ',' + required: false + default: [50, 100, 200, 500] + - name: "--output_obs_num_nonzero_vars" + description: | + Name of column in .obs describing, for each observation, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each row the number of columns that contain data. + type: string + required: false + default: "num_nonzero_vars" + - name: "--output_obs_total_counts_vars" + description: | + Name of the column for .obs describing, for each observation (row), + the sum of the stored values in the columns. + type: string + required: false + default: total_counts + - name: "--output_var_num_nonzero_obs" + description: | + Name of column describing, for each feature, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each column the number of rows that contain data. + type: string + required: false + default: "num_nonzero_obs" + - name: "--output_var_total_counts_obs" + description: | + Name of the column in .var describing, for each feature (column), + the sum of the stored values in the rows. + type: string + required: false + default: total_counts + - name: "--output_var_obs_mean" + type: string + description: | + Name of the column in .obs providing the mean of the values in each row. + default: "obs_mean" + required: false + - name: "--output_var_pct_dropout" + type: string + default: "pct_dropout" + description: | + Name of the column in .obs providing for each feature the percentage of + observations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs` + but percentage based. + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu +dependencies: + - name: metadata/grep_annotation_column + - name: qc/calculate_qc_metrics +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/concat_test_data + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow diff --git a/src/workflows/qc/qc/integration_test.sh b/src/workflows/qc/qc/integration_test.sh new file mode 100755 index 00000000..408ec36e --- /dev/null +++ b/src/workflows/qc/qc/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/qc/qc/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/qc/qc/main.nf b/src/workflows/qc/qc/main.nf new file mode 100644 index 00000000..28ff2dab --- /dev/null +++ b/src/workflows/qc/qc/main.nf @@ -0,0 +1,103 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Avoid conflict between output from component and output for this workflow + | map {id, state -> + assert state.output, "Output must be defined" + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Add default for var_qc_metrics component + | map {id, state -> + def var_qc_default = [] + // Remove the var_qc_metric argument from the state if its value is null (not specified) + def new_state = state.findAll { it.key != "var_qc_metrics" || it.value == null } + if (state.var_name_mitochondrial_genes) { + var_qc_default.add(state.var_name_mitochondrial_genes) + } + if (state.var_name_ribosomal_genes) { + var_qc_default.add(state.var_name_ribosomal_genes) + } + // Get the new state, but make sure to overwrite var_qc_metrics if the user has set it. + new_state = ["var_qc_metrics": var_qc_default.join(";")] + new_state + [id, new_state] + } + + | grep_annotation_column.run( + key: "grep_mitochondrial_genes", + runIf: { id, state -> + state.var_name_mitochondrial_genes + }, + fromState: { id, state -> + def stateMapping = [ + "input": state.input, + "modality": state.modality, + "input_column": state.var_gene_names, + "matrix": "var", + "output_match_column": state.var_name_mitochondrial_genes, + "regex_pattern": state.mitochondrial_gene_regex, + "input_layer": state.layer, + ] + stateMapping.output_fraction_column = state.obs_name_mitochondrial_fraction ? state.obs_name_mitochondrial_fraction: "fraction_$state.var_name_mitochondrial_genes" + return stateMapping + }, + toState: ["input": "output"] + ) + + | grep_annotation_column.run( + key: "grep_ribosomal_genes", + runIf: { id, state -> + state.var_name_ribosomal_genes + }, + fromState: { id, state -> + def stateMapping = [ + "input": state.input, + "modality": state.modality, + "input_column": state.var_gene_names, + "matrix": "var", + "output_match_column": state.var_name_ribosomal_genes, + "regex_pattern": state.ribosomal_gene_regex, + "input_layer": state.layer, + ] + stateMapping.output_fraction_column = state.obs_name_ribosomal_fraction ? state.obs_name_ribosomal_fraction: "fraction_$state.var_name_ribosomal_genes" + return stateMapping + }, + toState: ["input": "output"] + ) + + | calculate_qc_metrics.run( + fromState: { id, state -> + def newState = [ + "input": state.input, + "modality": state.modality, + "layer": state.layer, + // TODO: remove this workaround when Viash issue is resolved: + // 'top_n_vars': list(map(int, r''.split(';'))), + // ValueError: invalid literal for int() with base 10: '' + // See https://github.com/viash-io/viash/issues/619 + "top_n_vars": state.top_n_vars ? state.top_n_vars : null, + "var_qc_metrics_fill_na_value": state.var_qc_metrics_fill_na_value, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "output_var_num_nonzero_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.output_var_pct_dropout, + "output": state.workflow_output, + "compression": "gzip" + ] + if (state.var_qc_metrics) { + newState += ["var_qc_metrics": state.var_qc_metrics] + } + return newState + }, + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} diff --git a/src/workflows/qc/qc/nextflow.config b/src/workflows/qc/qc/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/qc/qc/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/qc/qc/test.nf b/src/workflows/qc/qc/test.nf new file mode 100644 index 00000000..a2bb34fa --- /dev/null +++ b/src/workflows/qc/qc/test.nf @@ -0,0 +1,62 @@ +nextflow.enable.dsl=2 + +include { qc } from params.rootDir + "/target/nextflow/workflows/qc/qc/main.nf" +include { qc_test } from params.rootDir + "/target/_test/nextflow/test_workflows/qc/qc_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = + Channel.fromList([ + [ + id: "mouse_test", + input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + var_name_mitochondrial_genes: "mitochondrial", + var_name_ribosomal_genes: "ribosomal", + ], + [ + id: "human_test", + input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"), + var_name_mitochondrial_genes: "mitochondrial", + var_name_ribosomal_genes: "ribosomal", + ] + ]) + | map { state -> [state.id, state] } + | qc.run( + toState: { id, output, state -> output + [og_input: state.input] } + ) + + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | qc_test.run( + fromState: {id, state -> + [ + input: state.output, + og_input: state.og_input + ] + } + ) + + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["human_test", "mouse_test"] + } +} diff --git a/src/workflows/rna/rna_multisample/config.vsh.yaml b/src/workflows/rna/rna_multisample/config.vsh.yaml new file mode 100644 index 00000000..cc946dc8 --- /dev/null +++ b/src/workflows/rna/rna_multisample/config.vsh.yaml @@ -0,0 +1,183 @@ +name: "rna_multisample" +namespace: "workflows/rna" +scope: "public" +description: "Processing unimodal multi-sample RNA transcriptomics data." +info: + image: /images/concepts/fig_workflow_multiomics_rna_multisample.svg + test_dependencies: +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "Inputs" + arguments: + - name: "--id" + required: true + type: string + description: ID of the concatenated file + example: concatenated + - name: "--input" + required: true + type: file + description: Path to the samples. + example: dataset.h5mu + - name: "--modality" + type: string + description: Modality to process. + default: "rna" + - name: "--layer" + type: string + description: "Input layer to use. If not specified, .X is used." + - name: "Output" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Filtering highly variable features" + arguments: + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] + required: false + type: string + default: "filter_with_hvg" + description: In which .var slot to store a boolean array corresponding to the highly variable features. + - name: "--highly_variable_features_obs_batch_key" + alternatives: ["--filter_with_hvg_obs_batch_key"] + type: string + default: "sample_id" + required: false + description: | + If specified, highly-variable features are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific features and acts as a lightweight batch correction method. + For all flavors, featues are first sorted by how many batches they are highly variable. For dispersion-based flavors + ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across + batches) rank based on within-batch normalized variance. + - name: "--highly_variable_features_flavor" + alternatives: ["--filter_with_hvg_flavor"] + type: string + default: "seurat" + choices: ["seurat", "cell_ranger", "seurat_v3"] + description: | + Choose the flavor for identifying highly variable features. For the dispersion based methods + in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features. + - name: "--highly_variable_features_n_top_features" + alternatives: ["--filter_with_hvg_n_top_genes"] + required: false + type: integer + description: Number of highly-variable features to keep. Mandatory if filter_with_hvg_flavor is set to 'seurat_v3'. + - name: "QC metrics calculation options" + arguments: + - name: "--var_qc_metrics" + description: | + Keys to select a boolean (containing only True or False) column from .var. + For each cell, calculate the proportion of total values for genes which are labeled 'True', + compared to the total sum of the values for all genes. + type: string + multiple: True + multiple_sep: ',' + required: false + default: ["filter_with_hvg"] + example: "ercc,highly_variable" + - name: "--top_n_vars" + type: integer + description: | + Number of top vars to be used to calculate cumulative proportions. + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds + cumulative proportion to the 20th and 50th most expressed vars. + multiple: true + multiple_sep: ',' + required: false + default: [50, 100, 200, 500] + - name: "--output_obs_num_nonzero_vars" + description: | + Name of column in .obs describing, for each observation, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each row the number of columns that contain data. + type: string + required: false + default: "num_nonzero_vars" + - name: "--output_obs_total_counts_vars" + description: | + Name of the column for .obs describing, for each observation (row), + the sum of the stored values in the columns. + type: string + required: false + default: total_counts + - name: "--output_var_num_nonzero_obs" + description: | + Name of column describing, for each feature, the number of stored values + (including explicit zeroes). In other words, the name of the column that counts + for each column the number of rows that contain data. + type: string + required: false + default: "num_nonzero_obs" + - name: "--output_var_total_counts_obs" + description: | + Name of the column in .var describing, for each feature (column), + the sum of the stored values in the rows. + type: string + required: false + default: total_counts + - name: "--output_var_obs_mean" + type: string + description: | + Name of the column in .obs providing the mean of the values in each row. + default: "obs_mean" + required: false + - name: "--output_var_pct_dropout" + type: string + default: "pct_dropout" + description: | + Name of the column in .obs providing for each feature the percentage of + observations the feature does not appear on (i.e. is missing). Same as `--num_nonzero_obs` + but percentage based. + - name: "RNA Scaling options" + description: | + Options for enabling scaling of the log-normalized data to unit variance and zero mean. + The scaled data will be output a different layer and representation with reduced dimensions + will be created and stored in addition to the non-scaled data. + arguments: + - name: "--enable_scaling" + description: "Enable scaling for the RNA modality." + type: boolean_true + - name: "--scaling_output_layer" + type: string + default: "scaled" + description: "Output layer where the scaled log-normalized data will be stored." + - name: "--scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified, do not clip." + required: false + type: double + - name: "--scaling_zero_center" + type: boolean_false + description: If set, omit zero-centering variables, which allows to handle sparse input efficiently." + +dependencies: + - name: transform/normalize_total + - name: transform/log1p + - name: feature_annotation/highly_variable_features_scanpy + - name: workflows/qc/qc + alias: rna_qc + - name: transform/delete_layer + - name: metadata/add_id + - name: transform/scale +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/concat_test_data +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/rna/rna_multisample/integration_test.sh b/src/workflows/rna/rna_multisample/integration_test.sh new file mode 100755 index 00000000..c11d688b --- /dev/null +++ b/src/workflows/rna/rna_multisample/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/rna/rna_multisample/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/rna/rna_multisample/main.nf b/src/workflows/rna/rna_multisample/main.nf new file mode 100644 index 00000000..9af4b633 --- /dev/null +++ b/src/workflows/rna/rna_multisample/main.nf @@ -0,0 +1,97 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | normalize_total.run( + fromState: { id, state -> + [ + "input": state.input, + "input_layer": state.layer, + "output_layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"], + ) + | log1p.run( + fromState: { id, state -> + [ + "input": state.input, + "output_layer": "log_normalized", + "input_layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"] + ) + | delete_layer.run( + fromState: {id, state -> + [ + "input": state.input, + "layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"] + ) + | scale.run( + runIf: {id, state -> state.enable_scaling}, + fromState: {id, state -> + [ + "input": state.input, + "modality": state.modality, + "input_layer": "log_normalized", + "output_layer": state.scaling_output_layer, + "max_value": state.scaling_max_value, + "zero_center": state.scaling_zero_center, + ] + }, + toState: ["input": "output"], + ) + | highly_variable_features_scanpy.run( + fromState: {id, state -> + [ + "input": state.input, + "layer": "log_normalized", + "modality": state.modality, + "var_name_filter": state.highly_variable_features_var_output, + "n_top_features": state.highly_variable_features_n_top_features, + "flavor": state.highly_variable_features_flavor, + "obs_batch_key": state.highly_variable_features_obs_batch_key + ] + }, + toState: ["input": "output"], + ) + | rna_qc.run( + // TODO: remove when viash 0.8.3 is released + key: "rna_qc", + fromState: {id, state -> + [ + "id": id, + "input": state.input, + "output": state.workflow_output, + "layer": state.layer, // Use the non-transformed layer + "output_compression": "gzip", + "modality": state.modality, + "var_qc_metrics": state.var_qc_metrics, + "top_n_vars": state.top_n_vars, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "output_var_num_nonzero_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.output_var_pct_dropout + ] + }, + ) + | setState(["output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/rna/rna_multisample/nextflow.config b/src/workflows/rna/rna_multisample/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/rna/rna_multisample/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/rna/rna_multisample/test.nf b/src/workflows/rna/rna_multisample/test.nf new file mode 100644 index 00000000..eb65dc4d --- /dev/null +++ b/src/workflows/rna/rna_multisample/test.nf @@ -0,0 +1,41 @@ +nextflow.enable.dsl=2 + +include { rna_multisample } from params.rootDir + "/target/nextflow/workflows/rna/rna_multisample/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "simple_execution_test", + input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"), + output: "concatenated_file.final.h5mu" + ] + ]) + | map{ state -> [state.id, state] } + | rna_multisample + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["simple_execution_test"] + } +} diff --git a/src/workflows/rna/rna_singlesample/config.vsh.yaml b/src/workflows/rna/rna_singlesample/config.vsh.yaml new file mode 100644 index 00000000..b0ea9574 --- /dev/null +++ b/src/workflows/rna/rna_singlesample/config.vsh.yaml @@ -0,0 +1,170 @@ +name: "rna_singlesample" +namespace: "workflows/rna" +scope: "public" +description: "Processing unimodal single-sample RNA transcriptomics data." +info: + image: /images/concepts/fig_workflow_multiomics_rna_singlesample.svg + test_dependencies: +authors: + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ author ] + - __merge__: /src/authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author ] +argument_groups: + - name: "Input" + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + type: file + required: true + description: Path to the sample. + example: dataset.h5mu + - name: "--layer" + type: string + required: false + description: "Input layer to start from. By default, .X will be used." + - name: "Output" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: Destination path to the output. + example: output.h5mu + - name: "Filtering options" + arguments: + - name: "--min_counts" + example: 200 + type: integer + description: Minimum number of counts captured per cell. + - name: "--max_counts" + example: 5000000 + type: integer + description: Maximum number of counts captured per cell. + + - name: "--min_genes_per_cell" + type: integer + example: 200 + description: Minimum of non-zero values per cell. + - name: "--max_genes_per_cell" + example: 1500000 + type: integer + description: Maximum of non-zero values per cell. + + - name: "--min_cells_per_gene" + example: 3 + type: integer + description: Minimum of non-zero values per gene. + + - name: "--min_fraction_mito" + example: 0 + type: double + required: false + min: 0 + max: 1 + description: Minimum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction. + - name: "--max_fraction_mito" + type: double + required: false + min: 0 + max: 1 + example: 0.2 + description: | + Maximum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction. + + - name: "--min_fraction_ribo" + type: double + required: false + min: 0 + max: 1 + example: 0 + description: Minimum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction. + - name: "--max_fraction_ribo" + type: double + required: false + min: 0 + max: 1 + example: 0.2 + description: | + Maximum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction. + + - name: "--skip_scrublet_doublet_detection" + type: boolean_true + description: Skip the scrublet doublet detection step. + + - name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - name: "--var_gene_names" + required: false + example: "gene_symbol" + type: string + description: | + .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set). + Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be + identified as mitochondrial or ribosomal genes, respectively. + - name: "--var_name_mitochondrial_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the mitochondrial genes. + - name: "--obs_name_mitochondrial_fraction" + type: string + required: false + description: | + When specified, write the fraction of counts originating from mitochondrial genes + (based on --mitochondrial_gene_regex) to an .obs column with the specified name. + Requires --var_name_mitochondrial_genes. + - name: --mitochondrial_gene_regex + type: string + description: | + Regex string that identifies mitochondrial genes from --var_gene_names. + By default will detect human and mouse mitochondrial genes from a gene symbol. + required: false + default: "^[mM][tT]-" + - name: "--var_name_ribosomal_genes" + type: string + required: false + description: | + In which .var slot to store a boolean array corresponding the ribosomal genes. + - name: "--obs_name_ribosomal_fraction" + type: string + required: false + description: | + When specified, write the fraction of counts originating from ribosomal genes + (based on --ribosomal_gene_regex) to an .obs column with the specified name. + Requires --var_name_ribosomal_genes. + - name: --ribosomal_gene_regex + type: string + description: | + Regex string that identifies ribosomal genes from --var_gene_names. + By default will detect human and mouse ribosomal genes from a gene symbol. + required: false + default: "^[Mm]?[Rr][Pp][LlSs]" +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: /src/workflows/utils/ +dependencies: + - name: filter/filter_with_counts + - name: filter/filter_with_scrublet + - name: filter/do_filter + - name: filter/delimit_fraction + - name: workflows/qc/qc +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + - path: /resources_test/pbmc_1k_protein_v3 +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/rna/rna_singlesample/integration_test.sh b/src/workflows/rna/rna_singlesample/integration_test.sh new file mode 100755 index 00000000..43b6299c --- /dev/null +++ b/src/workflows/rna/rna_singlesample/integration_test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/rna/rna_singlesample/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config + +nextflow \ + run . \ + -entry test_wf2 \ + -main-script src/workflows/rna/rna_singlesample/test.nf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config diff --git a/src/workflows/rna/rna_singlesample/main.nf b/src/workflows/rna/rna_singlesample/main.nf new file mode 100644 index 00000000..7cfdfcd2 --- /dev/null +++ b/src/workflows/rna/rna_singlesample/main.nf @@ -0,0 +1,186 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Check for correctness of mitochondrial and ribosomal gene detection arguments + | map { id, state -> + def new_state = [:] + if (state.obs_name_mitochondrial_fraction && !state.var_name_mitochondrial_genes) { + throw new RuntimeException("Using --obs_name_mitochondrial_fraction requires --var_name_mitochondrial_genes.") + } + if (!state.obs_name_mitochondrial_fraction && state.var_name_mitochondrial_genes) { + new_state.obs_name_mitochondrial_fraction = "fraction_${state.var_name_mitochondrial_genes}" + } + if ((state.min_fraction_mito != null || state.max_fraction_mito != null) && !state.var_name_mitochondrial_genes) { + throw new RuntimeException("Enabling --min_fraction_mito or --max_fraction_mito requires --var_name_mitochondrial_genes.") + } + if (state.var_gene_names && !state.var_name_mitochondrial_genes && !state.var_name_ribosomal_genes) { + System.err.println("Warning: --var_gene_names is only required for mitochondrial gene detection and does nothing while \ + not also setting --var_name_mitochondrial_genes or --var_name_ribosomal_genes.") + } + if (state.mitochondrial_gene_regex && !state.var_name_mitochondrial_genes && !state.var_name_ribosomal_genes) { + System.err.println("Warning: --mitochondrial_gene_regex is only required for mitochondrial gene detection and does \ + nothing while not also setting --var_name_mitochondrial_genes or --var_name_ribosomal_genes.") + } + if (state.obs_name_ribosomal_fraction && !state.var_name_ribosomal_genes) { + throw new RuntimeException("Using --obs_name_ribosomal_fraction requires --var_name_ribosomal_genes.") + } + if (!state.obs_name_ribosomal_fraction && state.var_name_ribosomal_genes) { + new_state.obs_name_ribosomal_fraction = "fraction_${state.var_name_ribosomal_genes}" + } + if ((state.min_fraction_ribo != null || state.max_fraction_ribo != null) && !state.var_name_ribosomal_genes) { + throw new RuntimeException("Enabling --min_fraction_ribo or --max_fraction_ribo requires --var_name_ribosomal_genes.") + } + [id, state + new_state] + } + | qc.run( + fromState: { id, state -> + // The rna singlesample processing allows detecting mitochondrial genes and filtering based + // on the fraction of mitochondrial genes per cell + // This behaviour is optional based on the presence of var_name_mitochondrial_genes + // The behavior of other components must be tuned to this argument as well + def args = [ + "id": id, + "input": state.input, + // disable other qc metric calculations + // only mitochondrial gene detection is required at this point + "top_n_vars": [], + "output_obs_num_nonzero_vars": null, + "output_obs_total_counts_vars": null, + "output_var_num_nonzero_obs": null, + "output_var_total_counts_obs": null, + "output_var_obs_mean": null, + "output_var_pct_dropout": null, + "output": state.output, + "modality": "rna", + "layer": state.layer, + ] + + if (state.var_name_mitochondrial_genes) { + // Check if user has defined var columns to calculate metrics + def new_var_qc_metrics = state.var_qc_metrics != null ? state.var_qc_metrics : [] + assert new_var_qc_metrics instanceof List + // Add the mitochondrial genes var column to the columns to calculate statistics for if set. + new_var_qc_metrics = ((new_var_qc_metrics as Set) + [state.var_name_mitochondrial_genes]) as List + + args += [ + "var_qc_metrics": new_var_qc_metrics, + "obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction, + "var_gene_names": state.var_gene_names, + "var_name_mitochondrial_genes": state.var_name_mitochondrial_genes, + "mitochondrial_gene_regex": state.mitochondrial_gene_regex + ] + } + + if (state.var_name_ribosomal_genes) { + // Check if user has defined var columns to calculate metrics + def new_var_qc_metrics = state.var_qc_metrics != null ? state.var_qc_metrics : [] + assert new_var_qc_metrics instanceof List + // Add the ribosomal genes var column to the columns to calculate statistics for if set. + new_var_qc_metrics = ((new_var_qc_metrics as Set) + [state.var_name_ribosomal_genes]) as List + + args += [ + "var_qc_metrics": new_var_qc_metrics, + "obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction, + "var_gene_names": state.var_gene_names, + "var_name_ribosomal_genes": state.var_name_ribosomal_genes, + "ribosomal_gene_regex": state.ribosomal_gene_regex + ] + } + + return args + }, + toState: ["input": "output"] + ) + | delimit_fraction.run( + runIf: {id, state -> state.var_name_mitochondrial_genes}, + fromState: {id, state -> + [ + "input": state.input, + "obs_name_filter": "filter_mitochondrial", + "min_fraction": state.min_fraction_mito, + "max_fraction": state.max_fraction_mito, + "obs_fraction_column": state.obs_name_mitochondrial_fraction, + ] + }, + toState: ["input": "output"] + ) + | delimit_fraction.run( + runIf: {id, state -> state.var_name_ribosomal_genes}, + fromState: {id, state -> + [ + "input": state.input, + "obs_name_filter": "filter_ribosomal", + "min_fraction": state.min_fraction_ribo, + "max_fraction": state.max_fraction_ribo, + "obs_fraction_column": state.obs_name_ribosomal_fraction, + ] + }, + toState: ["input": "output"] + ) + // cell filtering + | filter_with_counts.run( + key: "rna_filter_with_counts", + fromState: { id, state -> + [ + "input": state.input, + "layer": state.layer, + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_genes_per_cell, + "max_genes_per_cell": state.max_genes_per_cell, + "min_cells_per_gene": state.min_cells_per_gene, + ] + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "rna_do_filter", + fromState: {id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def stateMapping = [ + input: state.input, + var_filter: ["filter_with_counts"], + // If scrublet is skipped, the output should be set to the workflow output + output: state.workflow_output + ] + def obs_filter = ["filter_with_counts"] + if (state.var_name_mitochondrial_genes) { + obs_filter += ["filter_mitochondrial"] + } + if (state.var_name_ribosomal_genes) { + obs_filter += ["filter_ribosomal"] + } + stateMapping += ["obs_filter": obs_filter] + return stateMapping + }, + toState: ["input": "output"] + ) + // doublet calling + | filter_with_scrublet.run( + runIf: { id, state -> + !state.skip_scrublet_doublet_detection + }, + fromState: [ + "input": "input", + "layer": "layer", + "output": "workflow_output" + ], + args: [output_compression: "gzip"], + toState: ["input": "output"] + ) + | setState(["output": "input"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/rna/rna_singlesample/nextflow.config b/src/workflows/rna/rna_singlesample/nextflow.config new file mode 100644 index 00000000..8108bc25 --- /dev/null +++ b/src/workflows/rna/rna_singlesample/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file diff --git a/src/workflows/rna/rna_singlesample/test.nf b/src/workflows/rna/rna_singlesample/test.nf new file mode 100644 index 00000000..2e2962d6 --- /dev/null +++ b/src/workflows/rna/rna_singlesample/test.nf @@ -0,0 +1,110 @@ +nextflow.enable.dsl=2 + +include { rna_singlesample } from params.rootDir + "/target/nextflow/workflows/rna/rna_singlesample/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "mitochondrial_ribosomal_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + min_counts: 3, + max_counts: 10000000, + min_genes_per_cell: 2, + max_genes_per_cell: 10000000, + min_cells_per_gene: 2, + var_gene_names: "gene_symbol", + min_fraction_mito: 0.05, + max_fraction_mito: 0.2, + var_name_mitochondrial_genes: "mitochondrial", + min_fraction_ribo: 0.05, + max_fraction_ribo: 0.2, + var_name_ribosomal_genes: "ribosomal", + // see if obs_name is automatically filled in + // obs_name_mitochondrial_fraction: "fraction_mitochondrial", + output: "mitochondrial_ribosomal_test.final.h5mu" + ], + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + min_counts: 3, + max_counts: 10000000, + min_genes_per_cell: 2, + max_genes_per_cell: 10000000, + min_cells_per_gene: 2, + output: "simple_execution_test.final.h5mu" + ], + [ + id: "skip_scrublet_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + min_counts: 3, + max_counts: 10000000, + min_genes_per_cell: 2, + max_genes_per_cell: 10000000, + min_cells_per_gene: 2, + skip_scrublet_doublet_detection: true, + output: "skip_scrublet_test.final.h5mu" + ] + ]) + | map{ state -> [state.id, state] } + | rna_singlesample + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + | toSortedList{a, b -> a[0] <=> b[0]} + | map { output_list -> + assert output_list.size() == 3 : "output channel should contain three events" + println "output_list: $output_list" + assert output_list.collect{it[0]} == ["mitochondrial_ribosomal_test", "simple_execution_test", "skip_scrublet_test"] : "Output ID should be same as input ID" + assert (output_list.collect({it[1].output.getFileName().toString()}) as Set).equals(["mitochondrial_ribosomal_test.final.h5mu", "simple_execution_test.final.h5mu", "skip_scrublet_test.final.h5mu"] as Set) + + } +} + +workflow test_wf2 { + + resources_test = file(params.resources_test) + + output_ch = Channel.fromList([ + [ + id: "test_different_fraction_column", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"), + min_counts: 3, + max_counts: 10000000, + min_genes_per_cell: 2, + max_genes_per_cell: 10000000, + min_cells_per_gene: 2, + var_gene_names: "gene_symbol", + min_fraction_mito: 0.05, + max_fraction_mito: 0.2, + var_name_mitochondrial_genes: "mitochondrial", + obs_name_mitochondrial_fraction: "foobar_mito", + min_fraction_ribo: 0.05, + max_fraction_ribo: 0.2, + var_name_ribosomal_genes: "ribosomal", + obs_name_ribosomal_fraction: "foobar_ribo", + output: "mitochondrial_ribosomal_test.final.h5mu" + ], + ]) + | map{ state -> [state.id, state] } + | rna_singlesample + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output.toString().endsWith(".h5mu") : "Output file should be a h5mu file. Found: ${output[1]}" + "Output: $output" + } + | toSortedList{a, b -> a[0] <=> b[0]} + | map { output_list -> + assert output_list.size() == 1 : "output channel should contain one event" + println "output_list: $output_list" + assert output_list.collect{it[0]} == ["test_different_fraction_column"] : "Output ID should be same as input ID" + assert (output_list.collect({it[1].output.getFileName().toString()}) as Set).equals(["mitochondrial_ribosomal_test.final.h5mu"] as Set) + + } +} diff --git a/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml b/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml new file mode 100644 index 00000000..9f84f9e0 --- /dev/null +++ b/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "harmony_knn_test" +namespace: "test_workflows/annotation" +scope: "test" +description: "This component tests the output of the annotation of the harmony_knn of workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/annotation/harmony_knn/script.py b/src/workflows/test_workflows/annotation/harmony_knn/script.py new file mode 100644 index 00000000..2e1a71b2 --- /dev/null +++ b/src/workflows/test_workflows/annotation/harmony_knn/script.py @@ -0,0 +1,48 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "harmony_knn/output.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_harmony", "X_leiden_harmony_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = [ + "harmonypy_integration_distances", + "harmonypy_integration_connectivities", + ] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml b/src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml new file mode 100644 index 00000000..91564449 --- /dev/null +++ b/src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml @@ -0,0 +1,26 @@ +name: "scanvi_scarches_test" +namespace: "test_workflows/annotation" +scope: "test" +description: "This component tests the output of the annotation of the scanvi annotation workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/annotation/scanvi_scarches/script.py b/src/workflows/test_workflows/annotation/scanvi_scarches/script.py new file mode 100644 index 00000000..8be27fc6 --- /dev/null +++ b/src/workflows/test_workflows/annotation/scanvi_scarches/script.py @@ -0,0 +1,44 @@ +from mudata import read_h5mu +import sys +import pytest +import pandas + +##VIASH START +par = {"input": "harmony_knn/output.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scanvi", "X_scanvi_umap"] + expected_obs = ["scanvi_pred", "scanvi_probabilities"] + expected_obsp = [ + "scanvi_integration_distances", + "scanvi_integration_connectivities", + ] + expected_uns = ["scanvi_integration_neighbors"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].uns) for key in expected_uns), ( + f"Input mod['rna'] uns columns should be: {expected_uns}, found: {input_mudata.mod['rna'].uns.keys()}." + ) + + assert input_mudata.mod["rna"].obs["scanvi_pred"].dtype == "category" + assert pandas.api.types.is_float_dtype( + input_mudata.mod["rna"].obs["scanvi_probabilities"].dtype + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml b/src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml new file mode 100644 index 00000000..1e1b911a --- /dev/null +++ b/src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml @@ -0,0 +1,30 @@ +name: "scgpt_annotation_test" +namespace: "test_workflows/annotation" +scope: "test" +description: "This component test the output of the the scgpt_annotation workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + - name: "--n_hvg" + type: integer + required: true + description: Number of highly variable genes the input file was subset for. + example: 400 +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/annotation/scgpt/script.py b/src/workflows/test_workflows/annotation/scgpt/script.py new file mode 100644 index 00000000..aa32c489 --- /dev/null +++ b/src/workflows/test_workflows/annotation/scgpt/script.py @@ -0,0 +1,45 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = [ + "gene_id_tokens", + "values_tokenized", + "padding_mask", + "bin_edges", + "binned_counts", + ] + expected_var = ["scgpt_filter_with_hvg", "scgpt_cross_checked_genes"] + expected_obs = ["scgpt_pred", "scgpt_probability"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obs columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].var) for key in expected_var), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + # hvg subsetting is not exact - add 10% to allowed data shape + assert ( + input_mudata.mod["rna"].obsm["binned_counts"].shape[1] + <= par["n_hvg"] + 0.1 * par["n_hvg"] + ), ( + f"Input shape should be lower or equal than --n_hvg {par['n_hvg']}, found: {input_mudata.shape[1]}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml b/src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml new file mode 100644 index 00000000..a34192df --- /dev/null +++ b/src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "scvi_knn_test" +namespace: "test_workflows/annotation" +scope: "test" +description: "This component tests the output of the annotation of the scvi_knn of workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow diff --git a/src/workflows/test_workflows/annotation/scvi_knn/script.py b/src/workflows/test_workflows/annotation/scvi_knn/script.py new file mode 100644 index 00000000..487348a8 --- /dev/null +++ b/src/workflows/test_workflows/annotation/scvi_knn/script.py @@ -0,0 +1,45 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "scvi_knn/output.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scvi", "X_leiden_scvi_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = ["scvi_integration_connectivities", "scvi_integration_distances"] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml b/src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml new file mode 100644 index 00000000..ffe66f9e --- /dev/null +++ b/src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "bd_rhapsody_test" +namespace: "test_workflows/ingestion" +scope: "test" +description: "This component test the output of the integration test of the bd_rhapsody workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py b/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py new file mode 100644 index 00000000..e51b790e --- /dev/null +++ b/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py @@ -0,0 +1,50 @@ +from mudata import read_h5mu +import numpy as np +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_var = ["gene_name", "feature_type", "reference_file", "gene_ids"] + expected_obs = ["run_id", "library_id", "cell_id"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert "prot" in list(input_mudata.mod.keys()), "Input should contain rna modality." + # assert list(input_mudata.var.columns) == expected_var, f"Input var columns should be: {expected_var}." + assert all( + key in list(input_mudata.mod["rna"].var.columns) for key in expected_var + ), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["rna"].obs.columns) for key in expected_obs + ), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].var.columns) for key in expected_var + ), ( + f"Input mod['prot'] var columns should be: {expected_var}, found: {input_mudata.mod['prot'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].obs.columns) for key in expected_obs + ), ( + f"Input mod ['prot'] obs columns should be: {expected_obs}, found: {input_mudata.mod['prot'].obs.keys()}." + ) + assert np.array_equal( + input_mudata.mod["rna"].var["feature_type"].unique(), ["Gene Expression"] + ), "Output X should only contain Gene Expression vars." + assert np.array_equal( + input_mudata.mod["prot"].var["feature_type"].unique(), ["Antibody Capture"] + ), "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml b/src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml new file mode 100644 index 00000000..2a64c1ac --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "cellranger_mapping_test" +namespace: "test_workflows/ingestion" +scope: "test" +description: "This component test the output of the integration test of the cellranger mapping workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py b/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py new file mode 100644 index 00000000..e3feaa2c --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py @@ -0,0 +1,28 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_colnames = ["gene_symbol", "feature_types", "genome"] + + assert list(input_mudata.mod.keys()) == ["rna"], ( + "Input should contain rna modality." + ) + assert list(input_mudata.var.columns) == expected_colnames, ( + f"Input var columns should be: {expected_colnames}." + ) + assert list(input_mudata.mod["rna"].var.columns) == expected_colnames, ( + f"Input mod['rna'] var columns should be: {expected_colnames}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml b/src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml new file mode 100644 index 00000000..17f5d80e --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml @@ -0,0 +1,26 @@ +name: "cellranger_multi_test" +namespace: "test_workflows/ingestion" +scope: "test" +description: "This component test the output of the integration test of the cellranger multi workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + multiple: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/ingestion/cellranger_multi/script.py b/src/workflows/test_workflows/ingestion/cellranger_multi/script.py new file mode 100644 index 00000000..93b74c08 --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_multi/script.py @@ -0,0 +1,32 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + for input_path in par["input"]: + input_mudata = read_h5mu(input_path) + + assert list(input_mudata.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(input_mudata.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + input_mudata.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml new file mode 100644 index 00000000..5c6299a5 --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml @@ -0,0 +1,34 @@ +name: "cellranger_postprocessing_test" +namespace: "test_workflows/ingestion" +scope: "test" +description: "This component test the output of the integration test of the cellranger postprocessing workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + - name: "--input_og" + type: file + required: true + description: Path to the original h5mu file. + example: foo.h5mu + - name: "--is_corrected" + type: boolean + required: true + description: Whether the input file has been corrected. +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py new file mode 100644 index 00000000..605ff533 --- /dev/null +++ b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py @@ -0,0 +1,38 @@ +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu", "input_og": "input_og.h5mu", "is_corrected": True} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input_og"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Input and output should have the same modalities." + ) + + for modality, input_adata, output_adata in zip( + input_mudata.mod.keys(), input_mudata.mod.values(), output_mudata.mod.values() + ): + assert input_adata.n_obs >= output_adata.n_obs, ( + "Output should have less or equal number of observations than input." + ) + assert input_adata.n_vars == output_adata.n_vars, ( + "Output should have the same number of variables as input." + ) + if modality != "rna": + assert_annotation_objects_equal(input_adata, output_adata) + + if par["is_corrected"]: + assert "cellbender_corrected" in output_mudata.mod["rna"].layers + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml b/src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml new file mode 100644 index 00000000..b22415b7 --- /dev/null +++ b/src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "conversion_test" +namespace: "test_workflows/ingestion" +scope: "test" +description: "This component test the output of the integration test of the conversion workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/ingestion/conversion/script.py b/src/workflows/test_workflows/ingestion/conversion/script.py new file mode 100644 index 00000000..3c4210c1 --- /dev/null +++ b/src/workflows/test_workflows/ingestion/conversion/script.py @@ -0,0 +1,22 @@ +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +par = {"input": "input.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + assert "rna" in input_mudata.mod.keys() + assert input_mudata.n_obs == 713 + assert input_mudata.mod["rna"].var["feature_types"].unique() == [ + "Gene Expression" + ], "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml b/src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml new file mode 100644 index 00000000..b3e294db --- /dev/null +++ b/src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml @@ -0,0 +1,29 @@ +name: "dimensionality_reduction_test" +namespace: "test_workflows/multiomics" +scope: "test" +description: "This component test the output of the integration test of dimensionality_reduction." +authors: + - __merge__: /src/authors/kai_waldrant.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py b/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py new file mode 100644 index 00000000..be619ceb --- /dev/null +++ b/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py @@ -0,0 +1,23 @@ +import mudata as mu + +##VIASH START +par = { + "input": "foo.final.h5mu", +} + +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"} + +##VIASH END + +print("Loading data", flush=True) +data = mu.read_h5mu(par["input"]) + +assert "X_umap" in data.mod["rna"].obsm, "X_umap not found in .obsm" +assert data.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {data.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in data.mod["rna"].uns +assert "pca_loadings" in data.mod["rna"].varm + + +print("Test successful!", flush=True) diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml new file mode 100644 index 00000000..bd38f4b5 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml @@ -0,0 +1,31 @@ +name: "workflow_test" +namespace: "test_workflows/multiomics/process_batches" +scope: "test" +description: "This component tests the output of the integration test of process_batches test_wf." +authors: + - __merge__: /src/authors/kai_waldrant.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to the modality summary csv. + example: test.h5mu + - name: "--orig_input" + type: file + required: true + description: Path to the original input file. + example: input.h5mu +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow + directives: + label: ["midmem"] \ No newline at end of file diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py new file mode 100644 index 00000000..ea7fc148 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py @@ -0,0 +1,32 @@ +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +##VIASH START +par = {"input": "output.h5mu", "orig_input": "input.5mu"} + +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check atac modality +assert_annotation_objects_equal( + input.mod["atac"], output.mod["atac"], promote_precision=True +) + +# Check rna modality +assert "X_umap" in output.mod["rna"].obsm, "X_umap not found in .obsm" +assert output.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {output.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in output.mod["rna"].uns +assert "pca_loadings" in output.mod["rna"].varm + + +print("Test successful!", flush=True) diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml new file mode 100644 index 00000000..18ecb1f1 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml @@ -0,0 +1,31 @@ +name: "workflow_test2" +namespace: "test_workflows/multiomics/process_batches" +scope: "test" +description: "This component tests the output of the integration test of process_batches test_wf2." +authors: + - __merge__: /src/authors/kai_waldrant.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to the modality summary csv. + example: test.h5mu + - name: "--orig_input" + type: file + required: true + description: Path to the original input file. + example: input.h5mu +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow + directives: + label: ["midmem"] \ No newline at end of file diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py new file mode 100644 index 00000000..a23acb88 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py @@ -0,0 +1,60 @@ +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + +##VIASH START +par = { + "input": "resources_test/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_mms.h5mu", + "orig_input": "test.h5mu", +} + +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check vdj_t modality +# Allow X_umap to be overwritten +input_vdj = input.mod["vdj_t"] +# del input_vdj.obsm['X_umap'] +output_vdj = output.mod["vdj_t"] +# del output_vdj.obsm['X_umap'] +assert_annotation_objects_equal(input_vdj, output_vdj, promote_precision=True) + +# Check prot modality +# Ignore the PCA layer and its derivatives, as its allowed to be overwritten for this test. +input_prot = input.mod["prot"] +del input_prot.varm["pca_loadings"] +del input_prot.obsm["X_pca"] +del input_prot.obsm["X_umap"] +output_prot = output.mod["prot"] +del output_prot.varm["pca_loadings"] +del output_prot.obsm["X_pca"] +del output_prot.obsm["X_umap"] +assert_annotation_objects_equal(input_prot, output_prot, promote_precision=True) + + +# Check rna modality +# Allow the highly variable genes and PCA + derivatives to be overwritten +input_rna = input.mod["rna"] +input_rna = remove_annotation_column(input_rna, "filter_with_hvg", "var") +del input_rna.varm["pca_loadings"] +del input_rna.obsm["X_pca"] +del input_rna.obsm["X_umap"] +del input_rna.layers["log_normalized"] +output_rna = output.mod["rna"] +output_rna = remove_annotation_column(output_rna, "filter_with_hvg", "var") +del output_rna.obsm["X_pca"] +del output_rna.varm["pca_loadings"] +del output_rna.obsm["X_umap"] +del output_rna.layers["log_normalized"] +assert_annotation_objects_equal(input_rna, output_rna, promote_precision=True) + + +print("Test successful!", flush=True) diff --git a/src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml b/src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml new file mode 100644 index 00000000..22356fff --- /dev/null +++ b/src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml @@ -0,0 +1,34 @@ +name: "split_h5mu_test" +namespace: "test_workflows/multiomics" +scope: "test" +description: "This component test the output of the integration test of split_h5mu." +authors: + - __merge__: /src/authors/kai_waldrant.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to sample summary csv. + example: output_samples.csv + - name: "--samples_dir" + type: file + required: true + description: Path to the directory containing the sample h5mu files. + example: /path/to/h5mu_files + - name: "--orig_input" + type: file + required: true + description: Path to the original input file. + example: /path/to/original_input.h5mu +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/multiomics/split_h5mu/script.py b/src/workflows/test_workflows/multiomics/split_h5mu/script.py new file mode 100644 index 00000000..f0d54c51 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/split_h5mu/script.py @@ -0,0 +1,56 @@ +import csv +import os +import mudata as mu +# from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +par = { + "input": "/home/di/code/openpipelines-multisample/work/82/485f6cde9bab8a53f15726b59de4c1/_viash_par/input_1/samples.csv", + "samples_dir": "/home/di/code/openpipelines-multisample/work/82/485f6cde9bab8a53f15726b59de4c1/_viash_par/samples_dir_1/samples", + "orig_input": "/home/di/code/openpipelines-multisample/work/82/485f6cde9bab8a53f15726b59de4c1/_viash_par/orig_input_1/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu", +} + +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +input_samples = input_mu.mod["rna"].obs["sample_id"].unique() +num_samples = input_samples.shape[0] +num_files = len(os.listdir(par["samples_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_samples == num_files, f"Expected {num_samples} files, but found {num_files}." +assert input_mu.n_mod == num_samples, ( + f"Expected {num_samples} samples in {par['orig_input']} got {num_files} samples." +) + + +# Check if the files exist and if the sample name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the sample name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + samples_fp = os.path.join(par["samples_dir"], row[1]) + assert os.path.exists(samples_fp), f"Expected {row[1]} to exist." + # Check sample is correct in the h5mu file + mod_mu = mu.read_h5mu(samples_fp) + found_samples = mod_mu["rna"].obs["sample_id"].unique() + assert found_samples.shape[0] == 1, f"Expected 1 sample ID in {row[1]}." + assert found_samples[0] == row[0], ( + f"Expected {row[0]} to be the sample ID for {row[1]}" + ) + assert row[0] in input_samples, ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +print("Test successful!", flush=True) diff --git a/src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml b/src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml new file mode 100644 index 00000000..734398c8 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml @@ -0,0 +1,34 @@ +name: "split_modalities_test" +namespace: "test_workflows/multiomics" +scope: "test" +description: "This component test the output of the integration test of split_modalities." +authors: + - __merge__: /src/authors/kai_waldrant.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to modality summary csv. + example: output_types.csv + - name: "--mod_dir" + type: file + required: true + description: Path to the directory containing the modality h5mu files. + example: /path/to/h5mu_files + - name: "--orig_input" + type: file + required: true + description: Path to the original input file. + example: /path/to/original_input.h5mu +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/multiomics/split_modalities/script.py b/src/workflows/test_workflows/multiomics/split_modalities/script.py new file mode 100644 index 00000000..40cf45d6 --- /dev/null +++ b/src/workflows/test_workflows/multiomics/split_modalities/script.py @@ -0,0 +1,58 @@ +import csv +import os +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +par = { + "input": "output_test/split_modalities/foo_types.csv", + "mod_dir": "output_test/split_modalities/h5mu", + "orig_input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3.h5mu", +} + +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +num_mod = len(data) - 1 +num_files = len(os.listdir(par["mod_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_mod == num_files, f"Expected {num_mod} files, but found {num_files}." +assert input_mu.n_mod == num_mod, ( + f"Expected {num_mod} modalities in {par['orig_input']} got {input_mu.n_mod} modalities." +) + +rna_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[1][1])) +prot_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[2][1])) + +# Check if the files exist and if the modality name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the modality name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + mod_fp = os.path.join(par["mod_dir"], row[1]) + assert os.path.exists(mod_fp), f"Expected {row[1]} to exist." + # Check modality is correct in the h5mu file + mod_mu = mu.read_h5mu(mod_fp) + assert mod_mu.n_mod == 1, f"Expected 1 modality in {row[1]}." + assert row[0] in mod_mu.mod.keys(), f"Expected {row[0]} to be the mod in {row[1]}." + assert row[0] in input_mu.mod.keys(), ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +# Check if extracted modalities are equal to the original modalities +assert_annotation_objects_equal(rna_mod.mod["rna"], input_mu.mod["rna"]) +assert_annotation_objects_equal(prot_mod.mod["prot"], input_mu.mod["prot"]) + +print("Test successful!", flush=True) diff --git a/src/workflows/test_workflows/qc/config.vsh.yaml b/src/workflows/test_workflows/qc/config.vsh.yaml new file mode 100644 index 00000000..09d767f4 --- /dev/null +++ b/src/workflows/test_workflows/qc/config.vsh.yaml @@ -0,0 +1,32 @@ +name: "qc_test" +namespace: "test_workflows/qc" +scope: "test" +description: "This component test the output of the integration test of the QC workflow." +authors: + - __merge__: /src/authors/jakub_majercik.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + - name: "--og_input" + type: file + required: true + description: Path to the original h5mu file. + example: foo.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +engines: + - type: docker + image: python:3.12-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu] \ No newline at end of file diff --git a/src/workflows/test_workflows/qc/script.py b/src/workflows/test_workflows/qc/script.py new file mode 100644 index 00000000..e1a0b7e1 --- /dev/null +++ b/src/workflows/test_workflows/qc/script.py @@ -0,0 +1,75 @@ +import pytest +import sys +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + + +##VIASH START +par = {"input": "input.h5mu", "og_input": "og_input.h5mu"} + +meta = { + "resources_dir": "resources_test/concat_test_data", +} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["og_input"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.n_mod == output_mudata.n_mod, ( + "Number of modalities should be the same" + ) + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Modalities should be the same" + ) + assert list(output_mudata.mod.keys()) == [ + "rna", + "atac", + ], "Modalities should be rna and atac" + + obs_cols_to_remove = [] + for top_n_vars in ("50", "100", "200", "500"): + obs_cols_to_remove.append(f"pct_of_counts_in_top_{top_n_vars}_vars") + + obs_cols_to_remove.extend( + [ + "total_counts", + "num_nonzero_vars", + "fraction_mitochondrial", + "fraction_ribosomal", + "total_counts_mitochondrial", + "total_counts_ribosomal", + "pct_mitochondrial", + "pct_ribosomal", + ] + ) + var_cols_to_remove = [ + "obs_mean", + "total_counts", + "num_nonzero_obs", + "pct_dropout", + "mitochondrial", + "ribosomal", + ] + + assert set(obs_cols_to_remove).issubset( + set(output_mudata.mod["rna"].obs.columns.to_list()) + ) + assert set(var_cols_to_remove).issubset( + set(output_mudata.mod["rna"].var.columns.to_list()) + ) + + initial_mudata = remove_annotation_column( + output_mudata, obs_cols_to_remove, axis="obs", modality_name="rna" + ) + initial_mudata = remove_annotation_column( + initial_mudata, var_cols_to_remove, axis="var", modality_name="rna" + ) + + assert_annotation_objects_equal(input_mudata, initial_mudata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) diff --git a/src/workflows/utils/errorstrat_ignore.config b/src/workflows/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/src/workflows/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/src/workflows/utils/integration_tests.config b/src/workflows/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/src/workflows/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/src/workflows/utils/labels.config b/src/workflows/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/src/workflows/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/src/workflows/utils/labels_ci.config b/src/workflows/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/src/workflows/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/.build.yaml b/target/.build.yaml new file mode 100644 index 00000000..e69de29b diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/.config.vsh.yaml b/target/_private/nextflow/workflows/multiomics/split_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..37764eb3 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/.config.vsh.yaml @@ -0,0 +1,263 @@ +name: "split_h5mu" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input & specifications" + arguments: + - type: "file" + name: "--input" + description: "Path to a single .h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_feature" + description: "The .obs column to split the mudata on." + info: null + example: + - "celltype" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--drop_obs_nan" + description: "Whether to drop all .obs columns that contain only nan values after\ + \ splitting." + info: null + direction: "input" + - type: "boolean_true" + name: "--ensure_unique_filenames" + description: "Append number suffixes to ensure unique filenames after sanitizing\ + \ obs feature values." + info: null + direction: "input" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_files" + description: "A csv containing the base filename and obs feature by which it was\ + \ split." + info: null + example: + - "sample_files.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split the samples of a single modality from a .h5mu (multimodal) sample\ + \ into seperate .h5mu files based on the values of an .obs column of this modality. " +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "concatenated_brain_filtered_feature_bc_matrix_subset.h5mu" +info: + test_dependencies: + - name: "split_h5mu_test" + namespace: "test_workflows/multiomics" +status: "enabled" +scope: + image: "private" + target: "private" +dependencies: +- name: "dataflow/split_h5mu" + alias: "split_h5mu_component" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/split_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/_private/nextflow/workflows/multiomics/split_h5mu" + executable: "target/_private/nextflow/workflows/multiomics/split_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/dataflow/split_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/main.nf b/target/_private/nextflow/workflows/multiomics/split_h5mu/main.nf new file mode 100644 index 00000000..82ff62c8 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/main.nf @@ -0,0 +1,3560 @@ +// split_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_h5mu", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input & specifications", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to a single .h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_feature", + "description" : "The .obs column to split the mudata on.", + "example" : [ + "celltype" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--drop_obs_nan", + "description" : "Whether to drop all .obs columns that contain only nan values after splitting.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--ensure_unique_filenames", + "description" : "Append number suffixes to ensure unique filenames after sanitizing obs feature values.", + "direction" : "input" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output directory containing multiple h5mu files.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_files", + "description" : "A csv containing the base filename and obs feature by which it was split.", + "example" : [ + "sample_files.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. ", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "split_h5mu_test", + "namespace" : "test_workflows/multiomics" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "private", + "target" : "private" + }, + "dependencies" : [ + { + "name" : "dataflow/split_h5mu", + "alias" : "split_h5mu_component", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/split_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/_private/nextflow/workflows/multiomics/split_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { split_h5mu as split_h5mu_component_viashalias } from "${meta.resources_dir}/../../../../../nextflow/dataflow/split_h5mu/main.nf" +split_h5mu_component = split_h5mu_component_viashalias.run(key: "split_h5mu_component") + +// inner workflow +// user-provided Nextflow code +process splitStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("stub_h5mus"), path("samples.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir stub_h5mus + touch stub_h5mus/${id}_sample_1.h5mu + touch stub_h5mus/${id}_sample_2.h5mu + touch stub_h5mus/${id}_sample_3.h5mu + echo -e "name,filename\nsample_1,stub_h5mus/${id}_sample_1.h5mu\nsample_2,stub_h5mus/${id}_sample_2.h5mu\nsample_3,stub_h5mus/${id}_sample_3.h5mu" > samples.csv + """ +} + +workflow run_wf { + take: + input_ch + + main: + split_ch = input_ch + | split_h5mu_component.run( + filter: {!workflow.stubRun}, + fromState: [ + "input": "input", + "output_compression": "output_compression", + "output_files": "output_files", + "modality": "modality", + "obs_feature": "obs_feature", + "drop_obs_nan": "drop_obs_nan", + "ensure_unique_filenames": "ensure_unique_filenames" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ] + ) + + split_stub_ch = input_ch + | filter{workflow.stubRun} + // This is not a build viash component, so we cannot use + // fromState or toState functionality + | map {id, state -> [id, state.input]} + | splitStub + | map {id, output, output_files -> + [id, ["output": output, "output_files": output_files]] + } + + output_ch = split_ch.concat(split_stub_ch) + | setState(["output", "output_files"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow.config new file mode 100644 index 00000000..39726c50 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/split_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. ' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow_labels.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/errorstrat_ignore.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/integration_tests.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels_ci.config b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_h5mu/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/.config.vsh.yaml b/target/_private/nextflow/workflows/multiomics/split_modalities/.config.vsh.yaml new file mode 100644 index 00000000..a90ab0ad --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/.config.vsh.yaml @@ -0,0 +1,234 @@ +name: "split_modalities" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the sample." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_types" + description: "A csv containing the base filename and modality type per output\ + \ file." + info: null + example: + - "types.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline to split a multimodal mudata files into several unimodal\ + \ mudata files." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: + test_dependencies: + - name: "split_modalities_test" + namespace: "test_workflows/multiomics" +status: "enabled" +scope: + image: "private" + target: "private" +dependencies: +- name: "dataflow/split_modalities" + alias: "split_modalities_component" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/split_modalities/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/_private/nextflow/workflows/multiomics/split_modalities" + executable: "target/_private/nextflow/workflows/multiomics/split_modalities/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/dataflow/split_modalities" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/main.nf b/target/_private/nextflow/workflows/multiomics/split_modalities/main.nf new file mode 100644 index 00000000..dc601eab --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/main.nf @@ -0,0 +1,3522 @@ +// split_modalities main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_modalities", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the sample.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory containing multiple h5mu files.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_types", + "description" : "A csv containing the base filename and modality type per output file.", + "example" : [ + "types.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline to split a multimodal mudata files into several unimodal mudata files.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "split_modalities_test", + "namespace" : "test_workflows/multiomics" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "private", + "target" : "private" + }, + "dependencies" : [ + { + "name" : "dataflow/split_modalities", + "alias" : "split_modalities_component", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/split_modalities/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/_private/nextflow/workflows/multiomics/split_modalities", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { split_modalities as split_modalities_component_viashalias } from "${meta.resources_dir}/../../../../../nextflow/dataflow/split_modalities/main.nf" +split_modalities_component = split_modalities_component_viashalias.run(key: "split_modalities_component") + +// inner workflow +// user-provided Nextflow code +process splitStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("stub_h5mus"), path("modalities.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir stub_h5mus + touch stub_h5mus/${id}_vdj.h5mu + touch stub_h5mus/${id}_rna.h5mu + touch stub_h5mus/${id}_prot.h5mu + echo -e "name,filename\nrna,stub_h5mus/${id}_rna.h5mu\nprot,stub_h5mus/${id}_prot.h5mu\nvdj,stub_h5mus/${id}_vdj.h5mu" > modalities.csv + """ +} + +workflow run_wf { + // Split multimodal MuData files into several unimodal MuData files. + take: + input_ch + + main: + split_ch = input_ch + | split_modalities_component.run( + filter: {!workflow.stubRun}, + fromState: ["input": "input"], + toState: [ + "output": "output", + "output_types": "output_types" + ] + ) + + split_stub_ch = input_ch + | filter{workflow.stubRun} + // This is not a build viash component, so we cannot use + // fromState or toState functionality + | map {id, state -> [id, state.input]} + | splitStub + | map {id, output, output_types -> + [id, ["output": output, "output_types": output_types]] + } + + output_ch = split_ch.concat(split_stub_ch) + | setState(["output", "output_types"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow.config b/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow.config new file mode 100644 index 00000000..88533d1f --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/split_modalities' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline to split a multimodal mudata files into several unimodal mudata files.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow_labels.config b/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/utils/errorstrat_ignore.config b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/utils/integration_tests.config b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels.config b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels_ci.config b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/_private/nextflow/workflows/multiomics/split_modalities/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/_test/executable/download/sync_test_resources/.config.vsh.yaml b/target/_test/executable/download/sync_test_resources/.config.vsh.yaml new file mode 100644 index 00000000..b9402b46 --- /dev/null +++ b/target/_test/executable/download/sync_test_resources/.config.vsh.yaml @@ -0,0 +1,244 @@ +name: "sync_test_resources" +namespace: "download" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the _viash.yaml project configuration file." + info: null + default: + - "_viash.yaml" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Path to the directory where the resources will be synced to." + info: null + default: + - "." + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean_true" + name: "--quiet" + description: "Displays the operations that would be performed using the specified\ + \ command without actually running them." + info: null + direction: "input" + - type: "boolean_true" + name: "--dryrun" + description: "Does not display the operations performed from the specified command." + info: null + direction: "input" + - type: "boolean_true" + name: "--delete" + description: "Files that exist in the destination but not in the source are deleted\ + \ during sync." + info: null + direction: "input" + - type: "string" + name: "--exclude" + description: "Exclude all files or objects from the command that matches the specified\ + \ pattern." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Sync test resources to the local filesystem" +usage: "sync_test_resources\nsync_test_resources --input _viash.yaml --output .\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "amazon/aws-cli:2.17.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "yum" + packages: + - "wget" + - type: "docker" + run: + - "wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64\ + \ -O /usr/bin/yq && \\\n chmod +x /usr/bin/yq\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/download/sync_test_resources/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/download/sync_test_resources" + executable: "target/_test/executable/download/sync_test_resources/sync_test_resources" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/download/sync_test_resources/nextflow_labels.config b/target/_test/executable/download/sync_test_resources/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/download/sync_test_resources/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/download/sync_test_resources/sync_test_resources b/target/_test/executable/download/sync_test_resources/sync_test_resources new file mode 100755 index 00000000..ca97484c --- /dev/null +++ b/target/_test/executable/download/sync_test_resources/sync_test_resources @@ -0,0 +1,1255 @@ +#!/usr/bin/env bash + +# sync_test_resources main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="sync_test_resources" +VIASH_META_FUNCTIONALITY_NAME="sync_test_resources" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM amazon/aws-cli:2.17.11 +ENTRYPOINT [] +RUN yum install -y wget && \ + yum clean all && \ + rm -rf /var/cache/yum + +RUN wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \ + chmod +x /usr/bin/yq + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component download sync_test_resources" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "sync_test_resources main" + echo "" + echo "Sync test resources to the local filesystem" + echo "" + echo "Usage:" + echo "sync_test_resources" + echo "sync_test_resources --input _viash.yaml --output ." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, file must exist" + echo " default: _viash.yaml" + echo " Path to the _viash.yaml project configuration file." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " default: ." + echo " Path to the directory where the resources will be synced to." + echo "" + echo "Arguments:" + echo " --quiet" + echo " type: boolean_true" + echo " Displays the operations that would be performed using the specified" + echo " command without actually running them." + echo "" + echo " --dryrun" + echo " type: boolean_true" + echo " Does not display the operations performed from the specified command." + echo "" + echo " --delete" + echo " type: boolean_true" + echo " Files that exist in the destination but not in the source are deleted" + echo " during sync." + echo "" + echo " --exclude" + echo " type: string, multiple values allowed" + echo " Exclude all files or objects from the command that matches the specified" + echo " pattern." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "sync_test_resources main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quiet) + [ -n "$VIASH_PAR_QUIET" ] && ViashError Bad arguments for option \'--quiet\': \'$VIASH_PAR_QUIET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUIET=true + shift 1 + ;; + --dryrun) + [ -n "$VIASH_PAR_DRYRUN" ] && ViashError Bad arguments for option \'--dryrun\': \'$VIASH_PAR_DRYRUN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DRYRUN=true + shift 1 + ;; + --delete) + [ -n "$VIASH_PAR_DELETE" ] && ViashError Bad arguments for option \'--delete\': \'$VIASH_PAR_DELETE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DELETE=true + shift 1 + ;; + --exclude) + if [ -z "$VIASH_PAR_EXCLUDE" ]; then + VIASH_PAR_EXCLUDE="$2" + else + VIASH_PAR_EXCLUDE="$VIASH_PAR_EXCLUDE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude=*) + if [ -z "$VIASH_PAR_EXCLUDE" ]; then + VIASH_PAR_EXCLUDE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_EXCLUDE="$VIASH_PAR_EXCLUDE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/download/sync_test_resources:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_INPUT+x} ]; then + VIASH_PAR_INPUT="_viash.yaml" +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="." +fi +if [ -z ${VIASH_PAR_QUIET+x} ]; then + VIASH_PAR_QUIET="false" +fi +if [ -z ${VIASH_PAR_DRYRUN+x} ]; then + VIASH_PAR_DRYRUN="false" +fi +if [ -z ${VIASH_PAR_DELETE+x} ]; then + VIASH_PAR_DELETE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_QUIET" ]]; then + if ! [[ "$VIASH_PAR_QUIET" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--quiet' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DRYRUN" ]]; then + if ! [[ "$VIASH_PAR_DRYRUN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dryrun' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DELETE" ]]; then + if ! [[ "$VIASH_PAR_DELETE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--delete' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-sync_test_resources-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_QUIET+x} ]; then echo "${VIASH_PAR_QUIET}" | sed "s#'#'\"'\"'#g;s#.*#par_quiet='&'#" ; else echo "# par_quiet="; fi ) +$( if [ ! -z ${VIASH_PAR_DRYRUN+x} ]; then echo "${VIASH_PAR_DRYRUN}" | sed "s#'#'\"'\"'#g;s#.*#par_dryrun='&'#" ; else echo "# par_dryrun="; fi ) +$( if [ ! -z ${VIASH_PAR_DELETE+x} ]; then echo "${VIASH_PAR_DELETE}" | sed "s#'#'\"'\"'#g;s#.*#par_delete='&'#" ; else echo "# par_delete="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE+x} ]; then echo "${VIASH_PAR_EXCLUDE}" | sed "s#'#'\"'\"'#g;s#.*#par_exclude='&'#" ; else echo "# par_exclude="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=( ) + +if [ "\$par_quiet" == "true" ]; then + extra_params+=( "--quiet" ) +fi +if [ "\$par_dryrun" == "true" ]; then + extra_params+=( "--dryrun" ) +fi +if [ "\$par_delete" == "true" ]; then + extra_params+=( "--delete" ) +fi + +if [ ! -z \${par_exclude+x} ]; then + IFS=";" + for var in \$par_exclude; do + unset IFS + extra_params+=( "--exclude" "\$var" ) + done +fi + +function sync_s3() { + local s3_path="\$1" + local dest_path="\$2" + AWS_EC2_METADATA_DISABLED=true \\ + aws s3 sync \\ + "\$s3_path" \\ + "\$dest_path" \\ + --no-sign-request \\ + "\${extra_params[@]}" +} + +yq e \\ + '.info.test_resources[] | "{type: " + (.type // "s3") + ", path: " + .path + ", dest: " + .dest + "}"' \\ + "\${par_input}" | \\ + while read -r line; do + type=\$(echo "\$line" | yq e '.type') + path=\$(echo "\$line" | yq e '.path') + dest=\$(echo "\$line" | yq e '.dest') + + echo "Syncing '\$path' to '\$dest'..." + + if [ "\$type" == "s3" ]; then + sync_s3 "\$path" "\$par_output/\$dest" + fi + done +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml b/target/_test/executable/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml new file mode 100644 index 00000000..869f527a --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "harmony_knn_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the harmony_knn\ + \ of workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/annotation/harmony_knn_test" + executable: "target/_test/executable/test_workflows/annotation/harmony_knn_test/harmony_knn_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/annotation/harmony_knn_test/harmony_knn_test b/target/_test/executable/test_workflows/annotation/harmony_knn_test/harmony_knn_test new file mode 100755 index 00000000..f6fdf5b3 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/harmony_knn_test/harmony_knn_test @@ -0,0 +1,1119 @@ +#!/usr/bin/env bash + +# harmony_knn_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="harmony_knn_test" +VIASH_META_FUNCTIONALITY_NAME="harmony_knn_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/annotation harmony_knn_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "harmony_knn_test main" + echo "" + echo "This component tests the output of the annotation of the harmony_knn of" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "harmony_knn_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/annotation/harmony_knn_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-harmony_knn_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_harmony", "X_leiden_harmony_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = [ + "harmonypy_integration_distances", + "harmonypy_integration_connectivities", + ] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/annotation/harmony_knn_test/nextflow_labels.config b/target/_test/executable/test_workflows/annotation/harmony_knn_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/harmony_knn_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/annotation/harmony_knn_test/setup_logger.py b/target/_test/executable/test_workflows/annotation/harmony_knn_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/harmony_knn_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml new file mode 100644 index 00000000..f4f72d3b --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "scanvi_scarches_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the scanvi annotation\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/annotation/scanvi_scarches_test" + executable: "target/_test/executable/test_workflows/annotation/scanvi_scarches_test/scanvi_scarches_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/scanvi_scarches_test b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/scanvi_scarches_test new file mode 100755 index 00000000..c16e2913 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/scanvi_scarches_test @@ -0,0 +1,1115 @@ +#!/usr/bin/env bash + +# scanvi_scarches_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scanvi_scarches_test" +VIASH_META_FUNCTIONALITY_NAME="scanvi_scarches_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/annotation scanvi_scarches_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scanvi_scarches_test main" + echo "" + echo "This component tests the output of the annotation of the scanvi annotation" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scanvi_scarches_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/annotation/scanvi_scarches_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scanvi_scarches_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest +import pandas + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scanvi", "X_scanvi_umap"] + expected_obs = ["scanvi_pred", "scanvi_probabilities"] + expected_obsp = [ + "scanvi_integration_distances", + "scanvi_integration_connectivities", + ] + expected_uns = ["scanvi_integration_neighbors"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].uns) for key in expected_uns), ( + f"Input mod['rna'] uns columns should be: {expected_uns}, found: {input_mudata.mod['rna'].uns.keys()}." + ) + + assert input_mudata.mod["rna"].obs["scanvi_pred"].dtype == "category" + assert pandas.api.types.is_float_dtype( + input_mudata.mod["rna"].obs["scanvi_probabilities"].dtype + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/setup_logger.py b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scanvi_scarches_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml new file mode 100644 index 00000000..28d77d3b --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml @@ -0,0 +1,209 @@ +name: "scgpt_annotation_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes the input file was subset for." + info: null + example: + - 400 + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the the scgpt_annotation workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/annotation/scgpt_annotation_test" + executable: "target/_test/executable/test_workflows/annotation/scgpt_annotation_test/scgpt_annotation_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/scgpt_annotation_test b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/scgpt_annotation_test new file mode 100755 index 00000000..92caa4f3 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/scgpt_annotation_test @@ -0,0 +1,1142 @@ +#!/usr/bin/env bash + +# scgpt_annotation_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scgpt_annotation_test" +VIASH_META_FUNCTIONALITY_NAME="scgpt_annotation_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/annotation scgpt_annotation_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scgpt_annotation_test main" + echo "" + echo "This component test the output of the the scgpt_annotation workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo " --n_hvg" + echo " type: integer, required parameter" + echo " example: 400" + echo " Number of highly variable genes the input file was subset for." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scgpt_annotation_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_hvg) + [ -n "$VIASH_PAR_N_HVG" ] && ViashError Bad arguments for option \'--n_hvg\': \'$VIASH_PAR_N_HVG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HVG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_hvg. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_hvg=*) + [ -n "$VIASH_PAR_N_HVG" ] && ViashError Bad arguments for option \'--n_hvg=*\': \'$VIASH_PAR_N_HVG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HVG=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/annotation/scgpt_annotation_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_N_HVG+x} ]; then + ViashError '--n_hvg' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_HVG" ]]; then + if ! [[ "$VIASH_PAR_N_HVG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_hvg' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scgpt_annotation_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_hvg': $( if [ ! -z ${VIASH_PAR_N_HVG+x} ]; then echo "int(r'${VIASH_PAR_N_HVG//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = [ + "gene_id_tokens", + "values_tokenized", + "padding_mask", + "bin_edges", + "binned_counts", + ] + expected_var = ["scgpt_filter_with_hvg", "scgpt_cross_checked_genes"] + expected_obs = ["scgpt_pred", "scgpt_probability"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obs columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].var) for key in expected_var), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + # hvg subsetting is not exact - add 10% to allowed data shape + assert ( + input_mudata.mod["rna"].obsm["binned_counts"].shape[1] + <= par["n_hvg"] + 0.1 * par["n_hvg"] + ), ( + f"Input shape should be lower or equal than --n_hvg {par['n_hvg']}, found: {input_mudata.shape[1]}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/setup_logger.py b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scgpt_annotation_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml b/target/_test/executable/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml new file mode 100644 index 00000000..b33548b2 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "scvi_knn_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the scvi_knn of\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/annotation/scvi_knn_test" + executable: "target/_test/executable/test_workflows/annotation/scvi_knn_test/scvi_knn_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/annotation/scvi_knn_test/nextflow_labels.config b/target/_test/executable/test_workflows/annotation/scvi_knn_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scvi_knn_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/annotation/scvi_knn_test/scvi_knn_test b/target/_test/executable/test_workflows/annotation/scvi_knn_test/scvi_knn_test new file mode 100755 index 00000000..0e2ee9ea --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scvi_knn_test/scvi_knn_test @@ -0,0 +1,1115 @@ +#!/usr/bin/env bash + +# scvi_knn_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scvi_knn_test" +VIASH_META_FUNCTIONALITY_NAME="scvi_knn_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/annotation scvi_knn_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scvi_knn_test main" + echo "" + echo "This component tests the output of the annotation of the scvi_knn of workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scvi_knn_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/annotation/scvi_knn_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scvi_knn_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scvi", "X_leiden_scvi_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = ["scvi_integration_connectivities", "scvi_integration_distances"] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/annotation/scvi_knn_test/setup_logger.py b/target/_test/executable/test_workflows/annotation/scvi_knn_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/annotation/scvi_knn_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml new file mode 100644 index 00000000..31cfeeff --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "bd_rhapsody_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the bd_rhapsody\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/ingestion/bd_rhapsody_test" + executable: "target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/bd_rhapsody_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/bd_rhapsody_test b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/bd_rhapsody_test new file mode 100755 index 00000000..dac75831 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/bd_rhapsody_test @@ -0,0 +1,1121 @@ +#!/usr/bin/env bash + +# bd_rhapsody_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bd_rhapsody_test" +VIASH_META_FUNCTIONALITY_NAME="bd_rhapsody_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/ingestion bd_rhapsody_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bd_rhapsody_test main" + echo "" + echo "This component test the output of the integration test of the bd_rhapsody" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bd_rhapsody_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/ingestion/bd_rhapsody_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bd_rhapsody_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import numpy as np +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_var = ["gene_name", "feature_type", "reference_file", "gene_ids"] + expected_obs = ["run_id", "library_id", "cell_id"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert "prot" in list(input_mudata.mod.keys()), "Input should contain rna modality." + # assert list(input_mudata.var.columns) == expected_var, f"Input var columns should be: {expected_var}." + assert all( + key in list(input_mudata.mod["rna"].var.columns) for key in expected_var + ), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["rna"].obs.columns) for key in expected_obs + ), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].var.columns) for key in expected_var + ), ( + f"Input mod['prot'] var columns should be: {expected_var}, found: {input_mudata.mod['prot'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].obs.columns) for key in expected_obs + ), ( + f"Input mod ['prot'] obs columns should be: {expected_obs}, found: {input_mudata.mod['prot'].obs.keys()}." + ) + assert np.array_equal( + input_mudata.mod["rna"].var["feature_type"].unique(), ["Gene Expression"] + ), "Output X should only contain Gene Expression vars." + assert np.array_equal( + input_mudata.mod["prot"].var["feature_type"].unique(), ["Antibody Capture"] + ), "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml new file mode 100644 index 00000000..2e8f71a9 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "cellranger_mapping_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ mapping workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/ingestion/cellranger_mapping_test" + executable: "target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/cellranger_mapping_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/cellranger_mapping_test b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/cellranger_mapping_test new file mode 100755 index 00000000..5fdf6aba --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/cellranger_mapping_test @@ -0,0 +1,1099 @@ +#!/usr/bin/env bash + +# cellranger_mapping_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_mapping_test" +VIASH_META_FUNCTIONALITY_NAME="cellranger_mapping_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/ingestion cellranger_mapping_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_mapping_test main" + echo "" + echo "This component test the output of the integration test of the cellranger mapping" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_mapping_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/ingestion/cellranger_mapping_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_mapping_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_colnames = ["gene_symbol", "feature_types", "genome"] + + assert list(input_mudata.mod.keys()) == ["rna"], ( + "Input should contain rna modality." + ) + assert list(input_mudata.var.columns) == expected_colnames, ( + f"Input var columns should be: {expected_colnames}." + ) + assert list(input_mudata.mod["rna"].var.columns) == expected_colnames, ( + f"Input mod['rna'] var columns should be: {expected_colnames}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml new file mode 100644 index 00000000..b67030f7 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "cellranger_multi_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ multi workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/ingestion/cellranger_multi_test" + executable: "target/_test/executable/test_workflows/ingestion/cellranger_multi_test/cellranger_multi_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/cellranger_multi_test b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/cellranger_multi_test new file mode 100755 index 00000000..09d26b64 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/cellranger_multi_test @@ -0,0 +1,1134 @@ +#!/usr/bin/env bash + +# cellranger_multi_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_multi_test" +VIASH_META_FUNCTIONALITY_NAME="cellranger_multi_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/ingestion cellranger_multi_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_multi_test main" + echo "" + echo "This component test the output of the integration test of the cellranger multi" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_multi_test main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/ingestion/cellranger_multi_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_multi_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + for input_path in par["input"]: + input_mudata = read_h5mu(input_path) + + assert list(input_mudata.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(input_mudata.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + input_mudata.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/setup_logger.py b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_multi_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml new file mode 100644 index 00000000..0e8751ff --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml @@ -0,0 +1,220 @@ +name: "cellranger_postprocessing_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_og" + description: "Path to the original h5mu file." + info: null + example: + - "foo.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--is_corrected" + description: "Whether the input file has been corrected." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ postprocessing workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test" + executable: "target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/cellranger_postprocessing_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/cellranger_postprocessing_test b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/cellranger_postprocessing_test new file mode 100755 index 00000000..8d34b335 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/cellranger_postprocessing_test @@ -0,0 +1,1167 @@ +#!/usr/bin/env bash + +# cellranger_postprocessing_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_postprocessing_test" +VIASH_META_FUNCTIONALITY_NAME="cellranger_postprocessing_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/ingestion cellranger_postprocessing_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_postprocessing_test main" + echo "" + echo "This component test the output of the integration test of the cellranger" + echo "postprocessing workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo " --input_og" + echo " type: file, required parameter, file must exist" + echo " example: foo.h5mu" + echo " Path to the original h5mu file." + echo "" + echo " --is_corrected" + echo " type: boolean, required parameter" + echo " Whether the input file has been corrected." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_postprocessing_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_og) + [ -n "$VIASH_PAR_INPUT_OG" ] && ViashError Bad arguments for option \'--input_og\': \'$VIASH_PAR_INPUT_OG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_og. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_og=*) + [ -n "$VIASH_PAR_INPUT_OG" ] && ViashError Bad arguments for option \'--input_og=*\': \'$VIASH_PAR_INPUT_OG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --is_corrected) + [ -n "$VIASH_PAR_IS_CORRECTED" ] && ViashError Bad arguments for option \'--is_corrected\': \'$VIASH_PAR_IS_CORRECTED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IS_CORRECTED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --is_corrected. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --is_corrected=*) + [ -n "$VIASH_PAR_IS_CORRECTED" ] && ViashError Bad arguments for option \'--is_corrected=*\': \'$VIASH_PAR_IS_CORRECTED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IS_CORRECTED=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/ingestion/cellranger_postprocessing_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_OG+x} ]; then + ViashError '--input_og' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_IS_CORRECTED+x} ]; then + ViashError '--is_corrected' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_OG" ] && [ ! -e "$VIASH_PAR_INPUT_OG" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_OG' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_IS_CORRECTED" ]]; then + if ! [[ "$VIASH_PAR_IS_CORRECTED" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--is_corrected' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_INPUT_OG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_OG")" ) + VIASH_PAR_INPUT_OG=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_OG") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_postprocessing_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_og': $( if [ ! -z ${VIASH_PAR_INPUT_OG+x} ]; then echo "r'${VIASH_PAR_INPUT_OG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'is_corrected': $( if [ ! -z ${VIASH_PAR_IS_CORRECTED+x} ]; then echo "r'${VIASH_PAR_IS_CORRECTED//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input_og"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Input and output should have the same modalities." + ) + + for modality, input_adata, output_adata in zip( + input_mudata.mod.keys(), input_mudata.mod.values(), output_mudata.mod.values() + ): + assert input_adata.n_obs >= output_adata.n_obs, ( + "Output should have less or equal number of observations than input." + ) + assert input_adata.n_vars == output_adata.n_vars, ( + "Output should have the same number of variables as input." + ) + if modality != "rna": + assert_annotation_objects_equal(input_adata, output_adata) + + if par["is_corrected"]: + assert "cellbender_corrected" in output_mudata.mod["rna"].layers + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_INPUT_OG" ]; then + VIASH_PAR_INPUT_OG=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_OG") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/ingestion/conversion_test/.config.vsh.yaml b/target/_test/executable/test_workflows/ingestion/conversion_test/.config.vsh.yaml new file mode 100644 index 00000000..3ad17924 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/conversion_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "conversion_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the conversion\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/ingestion/conversion_test" + executable: "target/_test/executable/test_workflows/ingestion/conversion_test/conversion_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/ingestion/conversion_test/conversion_test b/target/_test/executable/test_workflows/ingestion/conversion_test/conversion_test new file mode 100755 index 00000000..7325a4fc --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/conversion_test/conversion_test @@ -0,0 +1,1093 @@ +#!/usr/bin/env bash + +# conversion_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="conversion_test" +VIASH_META_FUNCTIONALITY_NAME="conversion_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/ingestion conversion_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "conversion_test main" + echo "" + echo "This component test the output of the integration test of the conversion" + echo "workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "conversion_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/ingestion/conversion_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-conversion_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + assert "rna" in input_mudata.mod.keys() + assert input_mudata.n_obs == 713 + assert input_mudata.mod["rna"].var["feature_types"].unique() == [ + "Gene Expression" + ], "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/ingestion/conversion_test/nextflow_labels.config b/target/_test/executable/test_workflows/ingestion/conversion_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/conversion_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/ingestion/conversion_test/setup_logger.py b/target/_test/executable/test_workflows/ingestion/conversion_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/ingestion/conversion_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml new file mode 100644 index 00000000..079ec49a --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml @@ -0,0 +1,197 @@ +name: "dimensionality_reduction_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of dimensionality_reduction." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test" + executable: "target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/dimensionality_reduction_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/dimensionality_reduction_test b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/dimensionality_reduction_test new file mode 100755 index 00000000..420f7d98 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/dimensionality_reduction_test @@ -0,0 +1,1090 @@ +#!/usr/bin/env bash + +# dimensionality_reduction_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Kai Waldrant + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="dimensionality_reduction_test" +VIASH_META_FUNCTIONALITY_NAME="dimensionality_reduction_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/multiomics dimensionality_reduction_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "dimensionality_reduction_test main" + echo "" + echo "This component test the output of the integration test of" + echo "dimensionality_reduction." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "dimensionality_reduction_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/multiomics/dimensionality_reduction_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-dimensionality_reduction_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +data = mu.read_h5mu(par["input"]) + +assert "X_umap" in data.mod["rna"].obsm, "X_umap not found in .obsm" +assert data.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {data.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in data.mod["rna"].uns +assert "pca_loadings" in data.mod["rna"].varm + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml new file mode 100644 index 00000000..35da666f --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml @@ -0,0 +1,216 @@ +name: "workflow_test" +namespace: "test_workflows/multiomics/process_batches" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to the modality summary csv." + info: null + example: + - "test.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the integration test of process_batches\ + \ test_wf." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/multiomics/process_batches/workflow_test" + executable: "target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/workflow_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/workflow_test b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/workflow_test new file mode 100755 index 00000000..77c6b873 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test/workflow_test @@ -0,0 +1,1134 @@ +#!/usr/bin/env bash + +# workflow_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Kai Waldrant + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="workflow_test" +VIASH_META_FUNCTIONALITY_NAME="workflow_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/multiomics/process_batches workflow_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "workflow_test main" + echo "" + echo "This component tests the output of the integration test of process_batches" + echo "test_wf." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: test.h5mu" + echo " Path to the modality summary csv." + echo "" + echo " --orig_input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Path to the original input file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "workflow_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --orig_input) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --orig_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --orig_input=*) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input=*\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/multiomics/process_batches/workflow_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ORIG_INPUT+x} ]; then + ViashError '--orig_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ] && [ ! -e "$VIASH_PAR_ORIG_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_ORIG_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ORIG_INPUT")" ) + VIASH_PAR_ORIG_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_ORIG_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-workflow_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check atac modality +assert_annotation_objects_equal( + input.mod["atac"], output.mod["atac"], promote_precision=True +) + +# Check rna modality +assert "X_umap" in output.mod["rna"].obsm, "X_umap not found in .obsm" +assert output.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {output.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in output.mod["rna"].uns +assert "pca_loadings" in output.mod["rna"].varm + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_PAR_ORIG_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_ORIG_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml new file mode 100644 index 00000000..f9f3380d --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml @@ -0,0 +1,216 @@ +name: "workflow_test2" +namespace: "test_workflows/multiomics/process_batches" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to the modality summary csv." + info: null + example: + - "test.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the integration test of process_batches\ + \ test_wf2." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2" + executable: "target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/workflow_test2" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/workflow_test2 b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/workflow_test2 new file mode 100755 index 00000000..f726af06 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/process_batches/workflow_test2/workflow_test2 @@ -0,0 +1,1159 @@ +#!/usr/bin/env bash + +# workflow_test2 main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Kai Waldrant + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="workflow_test2" +VIASH_META_FUNCTIONALITY_NAME="workflow_test2" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/multiomics/process_batches workflow_test2" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "workflow_test2 main" + echo "" + echo "This component tests the output of the integration test of process_batches" + echo "test_wf2." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: test.h5mu" + echo " Path to the modality summary csv." + echo "" + echo " --orig_input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Path to the original input file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "workflow_test2 main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --orig_input) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --orig_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --orig_input=*) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input=*\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/multiomics/process_batches/workflow_test2:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ORIG_INPUT+x} ]; then + ViashError '--orig_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ] && [ ! -e "$VIASH_PAR_ORIG_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_ORIG_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ORIG_INPUT")" ) + VIASH_PAR_ORIG_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_ORIG_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-workflow_test2-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check vdj_t modality +# Allow X_umap to be overwritten +input_vdj = input.mod["vdj_t"] +# del input_vdj.obsm['X_umap'] +output_vdj = output.mod["vdj_t"] +# del output_vdj.obsm['X_umap'] +assert_annotation_objects_equal(input_vdj, output_vdj, promote_precision=True) + +# Check prot modality +# Ignore the PCA layer and its derivatives, as its allowed to be overwritten for this test. +input_prot = input.mod["prot"] +del input_prot.varm["pca_loadings"] +del input_prot.obsm["X_pca"] +del input_prot.obsm["X_umap"] +output_prot = output.mod["prot"] +del output_prot.varm["pca_loadings"] +del output_prot.obsm["X_pca"] +del output_prot.obsm["X_umap"] +assert_annotation_objects_equal(input_prot, output_prot, promote_precision=True) + + +# Check rna modality +# Allow the highly variable genes and PCA + derivatives to be overwritten +input_rna = input.mod["rna"] +input_rna = remove_annotation_column(input_rna, "filter_with_hvg", "var") +del input_rna.varm["pca_loadings"] +del input_rna.obsm["X_pca"] +del input_rna.obsm["X_umap"] +del input_rna.layers["log_normalized"] +output_rna = output.mod["rna"] +output_rna = remove_annotation_column(output_rna, "filter_with_hvg", "var") +del output_rna.obsm["X_pca"] +del output_rna.varm["pca_loadings"] +del output_rna.obsm["X_umap"] +del output_rna.layers["log_normalized"] +assert_annotation_objects_equal(input_rna, output_rna, promote_precision=True) + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_PAR_ORIG_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_ORIG_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml new file mode 100644 index 00000000..bcd4a39f --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "split_h5mu_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to sample summary csv." + info: null + example: + - "output_samples.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--samples_dir" + description: "Path to the directory containing the sample h5mu files." + info: null + example: + - "/path/to/h5mu_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "/path/to/original_input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of split_h5mu." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/multiomics/split_h5mu_test" + executable: "target/_test/executable/test_workflows/multiomics/split_h5mu_test/split_h5mu_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/multiomics/split_h5mu_test/split_h5mu_test b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/split_h5mu_test new file mode 100755 index 00000000..a4f1a75f --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_h5mu_test/split_h5mu_test @@ -0,0 +1,1185 @@ +#!/usr/bin/env bash + +# split_h5mu_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Kai Waldrant + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="split_h5mu_test" +VIASH_META_FUNCTIONALITY_NAME="split_h5mu_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/multiomics split_h5mu_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "split_h5mu_test main" + echo "" + echo "This component test the output of the integration test of split_h5mu." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: output_samples.csv" + echo " Path to sample summary csv." + echo "" + echo " --samples_dir" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/h5mu_files" + echo " Path to the directory containing the sample h5mu files." + echo "" + echo " --orig_input" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/original_input.h5mu" + echo " Path to the original input file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "split_h5mu_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --samples_dir) + [ -n "$VIASH_PAR_SAMPLES_DIR" ] && ViashError Bad arguments for option \'--samples_dir\': \'$VIASH_PAR_SAMPLES_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLES_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --samples_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --samples_dir=*) + [ -n "$VIASH_PAR_SAMPLES_DIR" ] && ViashError Bad arguments for option \'--samples_dir=*\': \'$VIASH_PAR_SAMPLES_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLES_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --orig_input) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --orig_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --orig_input=*) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input=*\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/multiomics/split_h5mu_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_SAMPLES_DIR+x} ]; then + ViashError '--samples_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ORIG_INPUT+x} ]; then + ViashError '--orig_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLES_DIR" ] && [ ! -e "$VIASH_PAR_SAMPLES_DIR" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLES_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ] && [ ! -e "$VIASH_PAR_ORIG_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_ORIG_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_SAMPLES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLES_DIR")" ) + VIASH_PAR_SAMPLES_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLES_DIR") +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ORIG_INPUT")" ) + VIASH_PAR_ORIG_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_ORIG_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-split_h5mu_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import csv +import os +import mudata as mu +# from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'samples_dir': $( if [ ! -z ${VIASH_PAR_SAMPLES_DIR+x} ]; then echo "r'${VIASH_PAR_SAMPLES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +input_samples = input_mu.mod["rna"].obs["sample_id"].unique() +num_samples = input_samples.shape[0] +num_files = len(os.listdir(par["samples_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_samples == num_files, f"Expected {num_samples} files, but found {num_files}." +assert input_mu.n_mod == num_samples, ( + f"Expected {num_samples} samples in {par['orig_input']} got {num_files} samples." +) + + +# Check if the files exist and if the sample name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the sample name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + samples_fp = os.path.join(par["samples_dir"], row[1]) + assert os.path.exists(samples_fp), f"Expected {row[1]} to exist." + # Check sample is correct in the h5mu file + mod_mu = mu.read_h5mu(samples_fp) + found_samples = mod_mu["rna"].obs["sample_id"].unique() + assert found_samples.shape[0] == 1, f"Expected 1 sample ID in {row[1]}." + assert found_samples[0] == row[0], ( + f"Expected {row[0]} to be the sample ID for {row[1]}" + ) + assert row[0] in input_samples, ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +print("Test successful!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_SAMPLES_DIR" ]; then + VIASH_PAR_SAMPLES_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLES_DIR") + fi + if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_PAR_ORIG_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_ORIG_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml b/target/_test/executable/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml new file mode 100644 index 00000000..1a0a7187 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "split_modalities_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to modality summary csv." + info: null + example: + - "output_types.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--mod_dir" + description: "Path to the directory containing the modality h5mu files." + info: null + example: + - "/path/to/h5mu_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "/path/to/original_input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of split_modalities." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/multiomics/split_modalities_test" + executable: "target/_test/executable/test_workflows/multiomics/split_modalities_test/split_modalities_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/multiomics/split_modalities_test/nextflow_labels.config b/target/_test/executable/test_workflows/multiomics/split_modalities_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_modalities_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/multiomics/split_modalities_test/split_modalities_test b/target/_test/executable/test_workflows/multiomics/split_modalities_test/split_modalities_test new file mode 100755 index 00000000..45ba3e78 --- /dev/null +++ b/target/_test/executable/test_workflows/multiomics/split_modalities_test/split_modalities_test @@ -0,0 +1,1187 @@ +#!/usr/bin/env bash + +# split_modalities_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Kai Waldrant + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="split_modalities_test" +VIASH_META_FUNCTIONALITY_NAME="split_modalities_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/multiomics split_modalities_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "split_modalities_test main" + echo "" + echo "This component test the output of the integration test of split_modalities." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: output_types.csv" + echo " Path to modality summary csv." + echo "" + echo " --mod_dir" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/h5mu_files" + echo " Path to the directory containing the modality h5mu files." + echo "" + echo " --orig_input" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/original_input.h5mu" + echo " Path to the original input file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "split_modalities_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --mod_dir) + [ -n "$VIASH_PAR_MOD_DIR" ] && ViashError Bad arguments for option \'--mod_dir\': \'$VIASH_PAR_MOD_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOD_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mod_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mod_dir=*) + [ -n "$VIASH_PAR_MOD_DIR" ] && ViashError Bad arguments for option \'--mod_dir=*\': \'$VIASH_PAR_MOD_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOD_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --orig_input) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --orig_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --orig_input=*) + [ -n "$VIASH_PAR_ORIG_INPUT" ] && ViashError Bad arguments for option \'--orig_input=*\': \'$VIASH_PAR_ORIG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORIG_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/multiomics/split_modalities_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MOD_DIR+x} ]; then + ViashError '--mod_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ORIG_INPUT+x} ]; then + ViashError '--orig_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MOD_DIR" ] && [ ! -e "$VIASH_PAR_MOD_DIR" ]; then + ViashError "Input file '$VIASH_PAR_MOD_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ] && [ ! -e "$VIASH_PAR_ORIG_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_ORIG_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_MOD_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MOD_DIR")" ) + VIASH_PAR_MOD_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_MOD_DIR") +fi +if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ORIG_INPUT")" ) + VIASH_PAR_ORIG_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_ORIG_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-split_modalities_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import csv +import os +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'mod_dir': $( if [ ! -z ${VIASH_PAR_MOD_DIR+x} ]; then echo "r'${VIASH_PAR_MOD_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +num_mod = len(data) - 1 +num_files = len(os.listdir(par["mod_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_mod == num_files, f"Expected {num_mod} files, but found {num_files}." +assert input_mu.n_mod == num_mod, ( + f"Expected {num_mod} modalities in {par['orig_input']} got {input_mu.n_mod} modalities." +) + +rna_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[1][1])) +prot_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[2][1])) + +# Check if the files exist and if the modality name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the modality name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + mod_fp = os.path.join(par["mod_dir"], row[1]) + assert os.path.exists(mod_fp), f"Expected {row[1]} to exist." + # Check modality is correct in the h5mu file + mod_mu = mu.read_h5mu(mod_fp) + assert mod_mu.n_mod == 1, f"Expected 1 modality in {row[1]}." + assert row[0] in mod_mu.mod.keys(), f"Expected {row[0]} to be the mod in {row[1]}." + assert row[0] in input_mu.mod.keys(), ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +# Check if extracted modalities are equal to the original modalities +assert_annotation_objects_equal(rna_mod.mod["rna"], input_mu.mod["rna"]) +assert_annotation_objects_equal(prot_mod.mod["prot"], input_mu.mod["prot"]) + +print("Test successful!", flush=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_MOD_DIR" ]; then + VIASH_PAR_MOD_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_MOD_DIR") + fi + if [ ! -z "$VIASH_PAR_ORIG_INPUT" ]; then + VIASH_PAR_ORIG_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_ORIG_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/qc/qc_test/.config.vsh.yaml b/target/_test/executable/test_workflows/qc/qc_test/.config.vsh.yaml new file mode 100644 index 00000000..e531eb19 --- /dev/null +++ b/target/_test/executable/test_workflows/qc/qc_test/.config.vsh.yaml @@ -0,0 +1,214 @@ +name: "qc_test" +namespace: "test_workflows/qc" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--og_input" + description: "Path to the original h5mu file." + info: null + example: + - "foo.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the QC workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/qc/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/_test/executable/test_workflows/qc/qc_test" + executable: "target/_test/executable/test_workflows/qc/qc_test/qc_test" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/executable/test_workflows/qc/qc_test/nextflow_labels.config b/target/_test/executable/test_workflows/qc/qc_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/executable/test_workflows/qc/qc_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/executable/test_workflows/qc/qc_test/qc_test b/target/_test/executable/test_workflows/qc/qc_test/qc_test new file mode 100755 index 00000000..8433ae37 --- /dev/null +++ b/target/_test/executable/test_workflows/qc/qc_test/qc_test @@ -0,0 +1,1175 @@ +#!/usr/bin/env bash + +# qc_test main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="qc_test" +VIASH_META_FUNCTIONALITY_NAME="qc_test" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "viashpy==0.8.0" && \ + pip install --upgrade --no-cache-dir "git+https://github.com/openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component test_workflows/qc qc_test" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "qc_test main" + echo "" + echo "This component test the output of the integration test of the QC workflow." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: foo.final.h5mu" + echo " Path to h5mu output." + echo "" + echo " --og_input" + echo " type: file, required parameter, file must exist" + echo " example: foo.h5mu" + echo " Path to the original h5mu file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "qc_test main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --og_input) + [ -n "$VIASH_PAR_OG_INPUT" ] && ViashError Bad arguments for option \'--og_input\': \'$VIASH_PAR_OG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OG_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --og_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --og_input=*) + [ -n "$VIASH_PAR_OG_INPUT" ] && ViashError Bad arguments for option \'--og_input=*\': \'$VIASH_PAR_OG_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OG_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/test_workflows/qc/qc_test:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OG_INPUT+x} ]; then + ViashError '--og_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OG_INPUT" ] && [ ! -e "$VIASH_PAR_OG_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_OG_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OG_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OG_INPUT")" ) + VIASH_PAR_OG_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OG_INPUT") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-qc_test-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import pytest +import sys +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'og_input': $( if [ ! -z ${VIASH_PAR_OG_INPUT+x} ]; then echo "r'${VIASH_PAR_OG_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["og_input"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.n_mod == output_mudata.n_mod, ( + "Number of modalities should be the same" + ) + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Modalities should be the same" + ) + assert list(output_mudata.mod.keys()) == [ + "rna", + "atac", + ], "Modalities should be rna and atac" + + obs_cols_to_remove = [] + for top_n_vars in ("50", "100", "200", "500"): + obs_cols_to_remove.append(f"pct_of_counts_in_top_{top_n_vars}_vars") + + obs_cols_to_remove.extend( + [ + "total_counts", + "num_nonzero_vars", + "fraction_mitochondrial", + "fraction_ribosomal", + "total_counts_mitochondrial", + "total_counts_ribosomal", + "pct_mitochondrial", + "pct_ribosomal", + ] + ) + var_cols_to_remove = [ + "obs_mean", + "total_counts", + "num_nonzero_obs", + "pct_dropout", + "mitochondrial", + "ribosomal", + ] + + assert set(obs_cols_to_remove).issubset( + set(output_mudata.mod["rna"].obs.columns.to_list()) + ) + assert set(var_cols_to_remove).issubset( + set(output_mudata.mod["rna"].var.columns.to_list()) + ) + + initial_mudata = remove_annotation_column( + output_mudata, obs_cols_to_remove, axis="obs", modality_name="rna" + ) + initial_mudata = remove_annotation_column( + initial_mudata, var_cols_to_remove, axis="var", modality_name="rna" + ) + + assert_annotation_objects_equal(input_mudata, initial_mudata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OG_INPUT" ]; then + VIASH_PAR_OG_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OG_INPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/_test/executable/test_workflows/qc/qc_test/setup_logger.py b/target/_test/executable/test_workflows/qc/qc_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/executable/test_workflows/qc/qc_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/download/sync_test_resources/.config.vsh.yaml b/target/_test/nextflow/download/sync_test_resources/.config.vsh.yaml new file mode 100644 index 00000000..85848de9 --- /dev/null +++ b/target/_test/nextflow/download/sync_test_resources/.config.vsh.yaml @@ -0,0 +1,244 @@ +name: "sync_test_resources" +namespace: "download" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the _viash.yaml project configuration file." + info: null + default: + - "_viash.yaml" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Path to the directory where the resources will be synced to." + info: null + default: + - "." + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean_true" + name: "--quiet" + description: "Displays the operations that would be performed using the specified\ + \ command without actually running them." + info: null + direction: "input" + - type: "boolean_true" + name: "--dryrun" + description: "Does not display the operations performed from the specified command." + info: null + direction: "input" + - type: "boolean_true" + name: "--delete" + description: "Files that exist in the destination but not in the source are deleted\ + \ during sync." + info: null + direction: "input" + - type: "string" + name: "--exclude" + description: "Exclude all files or objects from the command that matches the specified\ + \ pattern." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Sync test resources to the local filesystem" +usage: "sync_test_resources\nsync_test_resources --input _viash.yaml --output .\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "amazon/aws-cli:2.17.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "yum" + packages: + - "wget" + - type: "docker" + run: + - "wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64\ + \ -O /usr/bin/yq && \\\n chmod +x /usr/bin/yq\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/download/sync_test_resources/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/download/sync_test_resources" + executable: "target/_test/nextflow/download/sync_test_resources/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/download/sync_test_resources/main.nf b/target/_test/nextflow/download/sync_test_resources/main.nf new file mode 100644 index 00000000..f7fa6349 --- /dev/null +++ b/target/_test/nextflow/download/sync_test_resources/main.nf @@ -0,0 +1,3924 @@ +// sync_test_resources main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "sync_test_resources", + "namespace" : "download", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the _viash.yaml project configuration file.", + "default" : [ + "_viash.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Path to the directory where the resources will be synced to.", + "default" : [ + "." + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--quiet", + "description" : "Displays the operations that would be performed using the specified command without actually running them.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--dryrun", + "description" : "Does not display the operations performed from the specified command.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--delete", + "description" : "Files that exist in the destination but not in the source are deleted during sync.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--exclude", + "description" : "Exclude all files or objects from the command that matches the specified pattern.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Sync test resources to the local filesystem", + "usage" : "sync_test_resources\nsync_test_resources --input _viash.yaml --output .\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "amazon/aws-cli:2.17.11", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "yum", + "packages" : [ + "wget" + ] + }, + { + "type" : "docker", + "run" : [ + "wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \\\\\n chmod +x /usr/bin/yq\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/download/sync_test_resources/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/download/sync_test_resources", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_QUIET+x} ]; then echo "${VIASH_PAR_QUIET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_quiet='&'#" ; else echo "# par_quiet="; fi ) +$( if [ ! -z ${VIASH_PAR_DRYRUN+x} ]; then echo "${VIASH_PAR_DRYRUN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_dryrun='&'#" ; else echo "# par_dryrun="; fi ) +$( if [ ! -z ${VIASH_PAR_DELETE+x} ]; then echo "${VIASH_PAR_DELETE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_delete='&'#" ; else echo "# par_delete="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE+x} ]; then echo "${VIASH_PAR_EXCLUDE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_exclude='&'#" ; else echo "# par_exclude="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=( ) + +if [ "\\$par_quiet" == "true" ]; then + extra_params+=( "--quiet" ) +fi +if [ "\\$par_dryrun" == "true" ]; then + extra_params+=( "--dryrun" ) +fi +if [ "\\$par_delete" == "true" ]; then + extra_params+=( "--delete" ) +fi + +if [ ! -z \\${par_exclude+x} ]; then + IFS=";" + for var in \\$par_exclude; do + unset IFS + extra_params+=( "--exclude" "\\$var" ) + done +fi + +function sync_s3() { + local s3_path="\\$1" + local dest_path="\\$2" + AWS_EC2_METADATA_DISABLED=true \\\\ + aws s3 sync \\\\ + "\\$s3_path" \\\\ + "\\$dest_path" \\\\ + --no-sign-request \\\\ + "\\${extra_params[@]}" +} + +yq e \\\\ + '.info.test_resources[] | "{type: " + (.type // "s3") + ", path: " + .path + ", dest: " + .dest + "}"' \\\\ + "\\${par_input}" | \\\\ + while read -r line; do + type=\\$(echo "\\$line" | yq e '.type') + path=\\$(echo "\\$line" | yq e '.path') + dest=\\$(echo "\\$line" | yq e '.dest') + + echo "Syncing '\\$path' to '\\$dest'..." + + if [ "\\$type" == "s3" ]; then + sync_s3 "\\$path" "\\$par_output/\\$dest" + fi + done +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/download/sync_test_resources", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/download/sync_test_resources/nextflow.config b/target/_test/nextflow/download/sync_test_resources/nextflow.config new file mode 100644 index 00000000..c1a5a0b1 --- /dev/null +++ b/target/_test/nextflow/download/sync_test_resources/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'download/sync_test_resources' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Sync test resources to the local filesystem' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/download/sync_test_resources/nextflow_labels.config b/target/_test/nextflow/download/sync_test_resources/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/download/sync_test_resources/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml new file mode 100644 index 00000000..096856d6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "harmony_knn_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the harmony_knn\ + \ of workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/annotation/harmony_knn_test" + executable: "target/_test/nextflow/test_workflows/annotation/harmony_knn_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/main.nf b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/main.nf new file mode 100644 index 00000000..7ff0efcf --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/main.nf @@ -0,0 +1,3863 @@ +// harmony_knn_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "harmony_knn_test", + "namespace" : "test_workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component tests the output of the annotation of the harmony_knn of workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/annotation/harmony_knn_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_harmony", "X_leiden_harmony_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = [ + "harmonypy_integration_distances", + "harmonypy_integration_connectivities", + ] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/annotation/harmony_knn_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow.config b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow.config new file mode 100644 index 00000000..2a7bc85d --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/annotation/harmony_knn_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component tests the output of the annotation of the harmony_knn of workflow.' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/setup_logger.py b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/harmony_knn_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml new file mode 100644 index 00000000..cadcc676 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "scanvi_scarches_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the scanvi annotation\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test" + executable: "target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/main.nf b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/main.nf new file mode 100644 index 00000000..b438733c --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/main.nf @@ -0,0 +1,3859 @@ +// scanvi_scarches_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scanvi_scarches_test", + "namespace" : "test_workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component tests the output of the annotation of the scanvi annotation workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/annotation/scanvi_scarches/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest +import pandas + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scanvi", "X_scanvi_umap"] + expected_obs = ["scanvi_pred", "scanvi_probabilities"] + expected_obsp = [ + "scanvi_integration_distances", + "scanvi_integration_connectivities", + ] + expected_uns = ["scanvi_integration_neighbors"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].uns) for key in expected_uns), ( + f"Input mod['rna'] uns columns should be: {expected_uns}, found: {input_mudata.mod['rna'].uns.keys()}." + ) + + assert input_mudata.mod["rna"].obs["scanvi_pred"].dtype == "category" + assert pandas.api.types.is_float_dtype( + input_mudata.mod["rna"].obs["scanvi_probabilities"].dtype + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/annotation/scanvi_scarches_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow.config b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow.config new file mode 100644 index 00000000..5127f60d --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/annotation/scanvi_scarches_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component tests the output of the annotation of the scanvi annotation workflow.' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/setup_logger.py b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scanvi_scarches_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml new file mode 100644 index 00000000..150f0282 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/.config.vsh.yaml @@ -0,0 +1,209 @@ +name: "scgpt_annotation_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes the input file was subset for." + info: null + example: + - 400 + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the the scgpt_annotation workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test" + executable: "target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf new file mode 100644 index 00000000..e642387c --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/main.nf @@ -0,0 +1,3873 @@ +// scgpt_annotation_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scgpt_annotation_test", + "namespace" : "test_workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_hvg", + "description" : "Number of highly variable genes the input file was subset for.", + "example" : [ + 400 + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the the scgpt_annotation workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/annotation/scgpt/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_hvg': $( if [ ! -z ${VIASH_PAR_N_HVG+x} ]; then echo "int(r'${VIASH_PAR_N_HVG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = [ + "gene_id_tokens", + "values_tokenized", + "padding_mask", + "bin_edges", + "binned_counts", + ] + expected_var = ["scgpt_filter_with_hvg", "scgpt_cross_checked_genes"] + expected_obs = ["scgpt_pred", "scgpt_probability"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obs columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].var) for key in expected_var), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + # hvg subsetting is not exact - add 10% to allowed data shape + assert ( + input_mudata.mod["rna"].obsm["binned_counts"].shape[1] + <= par["n_hvg"] + 0.1 * par["n_hvg"] + ), ( + f"Input shape should be lower or equal than --n_hvg {par['n_hvg']}, found: {input_mudata.shape[1]}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/annotation/scgpt_annotation_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow.config b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow.config new file mode 100644 index 00000000..da556e47 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/annotation/scgpt_annotation_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the the scgpt_annotation workflow.' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/setup_logger.py b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scgpt_annotation_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml new file mode 100644 index 00000000..2cc105a6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "scvi_knn_test" +namespace: "test_workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the annotation of the scvi_knn of\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/annotation/scvi_knn_test" + executable: "target/_test/nextflow/test_workflows/annotation/scvi_knn_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/main.nf b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/main.nf new file mode 100644 index 00000000..a630ff50 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/main.nf @@ -0,0 +1,3860 @@ +// scvi_knn_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scvi_knn_test", + "namespace" : "test_workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component tests the output of the annotation of the scvi_knn of workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/annotation/scvi_knn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/annotation/scvi_knn_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_scvi", "X_leiden_scvi_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + expected_obsp = ["scvi_integration_connectivities", "scvi_integration_distances"] + expected_mod = ["rna", "prot"] + + assert all(key in list(input_mudata.mod) for key in expected_mod), ( + f"Input modalities should be: {expected_mod}, found: {input_mudata.mod.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), ( + f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), ( + f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}." + ) + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category", ( + "Cell type predictions should be of dtype category." + ) + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64", ( + "Cell type probabilities should be of dtype float64." + ) + + assert input_mudata.mod["rna"].shape[0] == input_mudata.mod["prot"].shape[0], ( + "Number of observations should be equal in all modalities." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/annotation/scvi_knn_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow.config b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow.config new file mode 100644 index 00000000..cbbc4a45 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/annotation/scvi_knn_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component tests the output of the annotation of the scvi_knn of workflow.' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/setup_logger.py b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/annotation/scvi_knn_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml new file mode 100644 index 00000000..9aaad88a --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "bd_rhapsody_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the bd_rhapsody\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test" + executable: "target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/main.nf b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/main.nf new file mode 100644 index 00000000..083df59a --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/main.nf @@ -0,0 +1,3865 @@ +// bd_rhapsody_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bd_rhapsody_test", + "namespace" : "test_workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the bd_rhapsody workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/ingestion/bd_rhapsody/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import numpy as np +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_var = ["gene_name", "feature_type", "reference_file", "gene_ids"] + expected_obs = ["run_id", "library_id", "cell_id"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert "prot" in list(input_mudata.mod.keys()), "Input should contain rna modality." + # assert list(input_mudata.var.columns) == expected_var, f"Input var columns should be: {expected_var}." + assert all( + key in list(input_mudata.mod["rna"].var.columns) for key in expected_var + ), ( + f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["rna"].obs.columns) for key in expected_obs + ), ( + f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].var.columns) for key in expected_var + ), ( + f"Input mod['prot'] var columns should be: {expected_var}, found: {input_mudata.mod['prot'].var.keys()}." + ) + assert all( + key in list(input_mudata.mod["prot"].obs.columns) for key in expected_obs + ), ( + f"Input mod ['prot'] obs columns should be: {expected_obs}, found: {input_mudata.mod['prot'].obs.keys()}." + ) + assert np.array_equal( + input_mudata.mod["rna"].var["feature_type"].unique(), ["Gene Expression"] + ), "Output X should only contain Gene Expression vars." + assert np.array_equal( + input_mudata.mod["prot"].var["feature_type"].unique(), ["Antibody Capture"] + ), "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/ingestion/bd_rhapsody_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow.config b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow.config new file mode 100644 index 00000000..c781c2ff --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/ingestion/bd_rhapsody_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the bd_rhapsody workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/bd_rhapsody_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml new file mode 100644 index 00000000..25d34a4c --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "cellranger_mapping_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ mapping workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test" + executable: "target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/main.nf b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/main.nf new file mode 100644 index 00000000..79afd0b7 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/main.nf @@ -0,0 +1,3843 @@ +// cellranger_mapping_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_mapping_test", + "namespace" : "test_workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the cellranger mapping workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/ingestion/cellranger_mapping/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_colnames = ["gene_symbol", "feature_types", "genome"] + + assert list(input_mudata.mod.keys()) == ["rna"], ( + "Input should contain rna modality." + ) + assert list(input_mudata.var.columns) == expected_colnames, ( + f"Input var columns should be: {expected_colnames}." + ) + assert list(input_mudata.mod["rna"].var.columns) == expected_colnames, ( + f"Input mod['rna'] var columns should be: {expected_colnames}." + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/ingestion/cellranger_mapping_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow.config new file mode 100644 index 00000000..ce0dab16 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/ingestion/cellranger_mapping_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the cellranger mapping workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_mapping_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml new file mode 100644 index 00000000..b01c9e90 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "cellranger_multi_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ multi workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test" + executable: "target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/main.nf b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/main.nf new file mode 100644 index 00000000..ad10b605 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/main.nf @@ -0,0 +1,3847 @@ +// cellranger_multi_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_multi_test", + "namespace" : "test_workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the cellranger multi workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/ingestion/cellranger_multi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + for input_path in par["input"]: + input_mudata = read_h5mu(input_path) + + assert list(input_mudata.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(input_mudata.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + input_mudata.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/ingestion/cellranger_multi_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow.config new file mode 100644 index 00000000..c790a3d8 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/ingestion/cellranger_multi_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the cellranger multi workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/setup_logger.py b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_multi_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml new file mode 100644 index 00000000..0a73151b --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/.config.vsh.yaml @@ -0,0 +1,220 @@ +name: "cellranger_postprocessing_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_og" + description: "Path to the original h5mu file." + info: null + example: + - "foo.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--is_corrected" + description: "Whether the input file has been corrected." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the cellranger\ + \ postprocessing workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test" + executable: "target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/main.nf b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/main.nf new file mode 100644 index 00000000..0c351430 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/main.nf @@ -0,0 +1,3878 @@ +// cellranger_postprocessing_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_postprocessing_test", + "namespace" : "test_workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_og", + "description" : "Path to the original h5mu file.", + "example" : [ + "foo.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--is_corrected", + "description" : "Whether the input file has been corrected.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the cellranger postprocessing workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/ingestion/cellranger_postprocessing/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_og': $( if [ ! -z ${VIASH_PAR_INPUT_OG+x} ]; then echo "r'${VIASH_PAR_INPUT_OG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'is_corrected': $( if [ ! -z ${VIASH_PAR_IS_CORRECTED+x} ]; then echo "r'${VIASH_PAR_IS_CORRECTED//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input_og"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Input and output should have the same modalities." + ) + + for modality, input_adata, output_adata in zip( + input_mudata.mod.keys(), input_mudata.mod.values(), output_mudata.mod.values() + ): + assert input_adata.n_obs >= output_adata.n_obs, ( + "Output should have less or equal number of observations than input." + ) + assert input_adata.n_vars == output_adata.n_vars, ( + "Output should have the same number of variables as input." + ) + if modality != "rna": + assert_annotation_objects_equal(input_adata, output_adata) + + if par["is_corrected"]: + assert "cellbender_corrected" in output_mudata.mod["rna"].layers + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/ingestion/cellranger_postprocessing_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow.config new file mode 100644 index 00000000..c490499c --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/ingestion/cellranger_postprocessing_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the cellranger postprocessing workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/cellranger_postprocessing_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/ingestion/conversion_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/ingestion/conversion_test/.config.vsh.yaml new file mode 100644 index 00000000..fc13c115 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/conversion_test/.config.vsh.yaml @@ -0,0 +1,200 @@ +name: "conversion_test" +namespace: "test_workflows/ingestion" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the conversion\ + \ workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/ingestion/conversion_test" + executable: "target/_test/nextflow/test_workflows/ingestion/conversion_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/ingestion/conversion_test/main.nf b/target/_test/nextflow/test_workflows/ingestion/conversion_test/main.nf new file mode 100644 index 00000000..3f4fefed --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/conversion_test/main.nf @@ -0,0 +1,3837 @@ +// conversion_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "conversion_test", + "namespace" : "test_workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the conversion workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/ingestion/conversion/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/ingestion/conversion_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu +import sys +import pytest + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + assert "rna" in input_mudata.mod.keys() + assert input_mudata.n_obs == 713 + assert input_mudata.mod["rna"].var["feature_types"].unique() == [ + "Gene Expression" + ], "Output X should only contain Gene Expression vars." + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/ingestion/conversion_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow.config b/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow.config new file mode 100644 index 00000000..d74c57d6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/ingestion/conversion_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the conversion workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/conversion_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/ingestion/conversion_test/setup_logger.py b/target/_test/nextflow/test_workflows/ingestion/conversion_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/ingestion/conversion_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml new file mode 100644 index 00000000..3b8acfa9 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/.config.vsh.yaml @@ -0,0 +1,197 @@ +name: "dimensionality_reduction_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of dimensionality_reduction." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test" + executable: "target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/main.nf b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/main.nf new file mode 100644 index 00000000..ab27cc35 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/main.nf @@ -0,0 +1,3832 @@ +// dimensionality_reduction_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Kai Waldrant + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "dimensionality_reduction_test", + "namespace" : "test_workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Kai Waldrant", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of dimensionality_reduction.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/multiomics/dimensionality_reduction/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +data = mu.read_h5mu(par["input"]) + +assert "X_umap" in data.mod["rna"].obsm, "X_umap not found in .obsm" +assert data.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {data.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in data.mod["rna"].uns +assert "pca_loadings" in data.mod["rna"].varm + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/multiomics/dimensionality_reduction_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow.config b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow.config new file mode 100644 index 00000000..05886219 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/multiomics/dimensionality_reduction_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of dimensionality_reduction.' + author = 'Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/dimensionality_reduction_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml new file mode 100644 index 00000000..f21cdad3 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/.config.vsh.yaml @@ -0,0 +1,216 @@ +name: "workflow_test" +namespace: "test_workflows/multiomics/process_batches" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to the modality summary csv." + info: null + example: + - "test.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the integration test of process_batches\ + \ test_wf." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test" + executable: "target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/main.nf b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/main.nf new file mode 100644 index 00000000..d560dd91 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/main.nf @@ -0,0 +1,3869 @@ +// workflow_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Kai Waldrant + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "workflow_test", + "namespace" : "test_workflows/multiomics/process_batches", + "version" : "main", + "authors" : [ + { + "name" : "Kai Waldrant", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the modality summary csv.", + "example" : [ + "test.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--orig_input", + "description" : "Path to the original input file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component tests the output of the integration test of process_batches test_wf.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/multiomics/process_batches/workflow_test/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check atac modality +assert_annotation_objects_equal( + input.mod["atac"], output.mod["atac"], promote_precision=True +) + +# Check rna modality +assert "X_umap" in output.mod["rna"].obsm, "X_umap not found in .obsm" +assert output.mod["rna"].obsm["X_umap"].shape[1] == 2, ( + f"X_umap has wrong shape expected 2 n_comp but got {output.mod['rna'].obsm['X_umap'].shape[1]}" +) +assert "pca_variance" in output.mod["rna"].uns +assert "pca_loadings" in output.mod["rna"].varm + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/multiomics/process_batches/workflow_test", + "tag" : "main" + }, + "label" : [ + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow.config b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow.config new file mode 100644 index 00000000..1fd95331 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/multiomics/process_batches/workflow_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component tests the output of the integration test of process_batches test_wf.' + author = 'Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml new file mode 100644 index 00000000..d7071ca1 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/.config.vsh.yaml @@ -0,0 +1,216 @@ +name: "workflow_test2" +namespace: "test_workflows/multiomics/process_batches" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to the modality summary csv." + info: null + example: + - "test.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component tests the output of the integration test of process_batches\ + \ test_wf2." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2" + executable: "target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/main.nf b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/main.nf new file mode 100644 index 00000000..433741c2 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/main.nf @@ -0,0 +1,3894 @@ +// workflow_test2 main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Kai Waldrant + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "workflow_test2", + "namespace" : "test_workflows/multiomics/process_batches", + "version" : "main", + "authors" : [ + { + "name" : "Kai Waldrant", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the modality summary csv.", + "example" : [ + "test.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--orig_input", + "description" : "Path to the original input file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component tests the output of the integration test of process_batches test_wf2.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + +print("Loading data", flush=True) +input = mu.read_h5mu(par["orig_input"]) +output = mu.read_h5mu(par["input"]) + +assert input.n_mod == output.n_mod, "Number of modalities differ" +assert input.mod.keys() == output.mod.keys(), "Modalities differ" + +# Check vdj_t modality +# Allow X_umap to be overwritten +input_vdj = input.mod["vdj_t"] +# del input_vdj.obsm['X_umap'] +output_vdj = output.mod["vdj_t"] +# del output_vdj.obsm['X_umap'] +assert_annotation_objects_equal(input_vdj, output_vdj, promote_precision=True) + +# Check prot modality +# Ignore the PCA layer and its derivatives, as its allowed to be overwritten for this test. +input_prot = input.mod["prot"] +del input_prot.varm["pca_loadings"] +del input_prot.obsm["X_pca"] +del input_prot.obsm["X_umap"] +output_prot = output.mod["prot"] +del output_prot.varm["pca_loadings"] +del output_prot.obsm["X_pca"] +del output_prot.obsm["X_umap"] +assert_annotation_objects_equal(input_prot, output_prot, promote_precision=True) + + +# Check rna modality +# Allow the highly variable genes and PCA + derivatives to be overwritten +input_rna = input.mod["rna"] +input_rna = remove_annotation_column(input_rna, "filter_with_hvg", "var") +del input_rna.varm["pca_loadings"] +del input_rna.obsm["X_pca"] +del input_rna.obsm["X_umap"] +del input_rna.layers["log_normalized"] +output_rna = output.mod["rna"] +output_rna = remove_annotation_column(output_rna, "filter_with_hvg", "var") +del output_rna.obsm["X_pca"] +del output_rna.varm["pca_loadings"] +del output_rna.obsm["X_umap"] +del output_rna.layers["log_normalized"] +assert_annotation_objects_equal(input_rna, output_rna, promote_precision=True) + + +print("Test successful!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/multiomics/process_batches/workflow_test2", + "tag" : "main" + }, + "label" : [ + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow.config b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow.config new file mode 100644 index 00000000..d8c5c0c7 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/multiomics/process_batches/workflow_test2' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component tests the output of the integration test of process_batches test_wf2.' + author = 'Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/process_batches/workflow_test2/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml new file mode 100644 index 00000000..fdb1b7fb --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "split_h5mu_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to sample summary csv." + info: null + example: + - "output_samples.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--samples_dir" + description: "Path to the directory containing the sample h5mu files." + info: null + example: + - "/path/to/h5mu_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "/path/to/original_input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of split_h5mu." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/multiomics/split_h5mu_test" + executable: "target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/main.nf b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/main.nf new file mode 100644 index 00000000..c7c1ef8f --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/main.nf @@ -0,0 +1,3898 @@ +// split_h5mu_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Kai Waldrant + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_h5mu_test", + "namespace" : "test_workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Kai Waldrant", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to sample summary csv.", + "example" : [ + "output_samples.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--samples_dir", + "description" : "Path to the directory containing the sample h5mu files.", + "example" : [ + "/path/to/h5mu_files" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--orig_input", + "description" : "Path to the original input file.", + "example" : [ + "/path/to/original_input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of split_h5mu.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/multiomics/split_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import csv +import os +import mudata as mu +# from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'samples_dir': $( if [ ! -z ${VIASH_PAR_SAMPLES_DIR+x} ]; then echo "r'${VIASH_PAR_SAMPLES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +input_samples = input_mu.mod["rna"].obs["sample_id"].unique() +num_samples = input_samples.shape[0] +num_files = len(os.listdir(par["samples_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_samples == num_files, f"Expected {num_samples} files, but found {num_files}." +assert input_mu.n_mod == num_samples, ( + f"Expected {num_samples} samples in {par['orig_input']} got {num_files} samples." +) + + +# Check if the files exist and if the sample name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the sample name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + samples_fp = os.path.join(par["samples_dir"], row[1]) + assert os.path.exists(samples_fp), f"Expected {row[1]} to exist." + # Check sample is correct in the h5mu file + mod_mu = mu.read_h5mu(samples_fp) + found_samples = mod_mu["rna"].obs["sample_id"].unique() + assert found_samples.shape[0] == 1, f"Expected 1 sample ID in {row[1]}." + assert found_samples[0] == row[0], ( + f"Expected {row[0]} to be the sample ID for {row[1]}" + ) + assert row[0] in input_samples, ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +print("Test successful!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/multiomics/split_h5mu_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow.config b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow.config new file mode 100644 index 00000000..4ff7a333 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/multiomics/split_h5mu_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of split_h5mu.' + author = 'Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_h5mu_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml new file mode 100644 index 00000000..8589bd0c --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "split_modalities_test" +namespace: "test_workflows/multiomics" +version: "main" +authors: +- name: "Kai Waldrant" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to modality summary csv." + info: null + example: + - "output_types.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--mod_dir" + description: "Path to the directory containing the modality h5mu files." + info: null + example: + - "/path/to/h5mu_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--orig_input" + description: "Path to the original input file." + info: null + example: + - "/path/to/original_input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of split_modalities." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/multiomics/split_modalities_test" + executable: "target/_test/nextflow/test_workflows/multiomics/split_modalities_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/main.nf b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/main.nf new file mode 100644 index 00000000..b3dc5212 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/main.nf @@ -0,0 +1,3900 @@ +// split_modalities_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Kai Waldrant + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_modalities_test", + "namespace" : "test_workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Kai Waldrant", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to modality summary csv.", + "example" : [ + "output_types.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--mod_dir", + "description" : "Path to the directory containing the modality h5mu files.", + "example" : [ + "/path/to/h5mu_files" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--orig_input", + "description" : "Path to the original input file.", + "example" : [ + "/path/to/original_input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of split_modalities.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/multiomics/split_modalities/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/multiomics/split_modalities_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import csv +import os +import mudata as mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'mod_dir': $( if [ ! -z ${VIASH_PAR_MOD_DIR+x} ]; then echo "r'${VIASH_PAR_MOD_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'orig_input': $( if [ ! -z ${VIASH_PAR_ORIG_INPUT+x} ]; then echo "r'${VIASH_PAR_ORIG_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +print("Loading data", flush=True) +with open(par["input"], "r", encoding="utf-8") as f: + reader = csv.reader(f) + data = list(reader) + +input_mu = mu.read_h5mu(par["orig_input"]) + +num_mod = len(data) - 1 +num_files = len(os.listdir(par["mod_dir"])) + +# Check if the number of files is equal to the number of lines in the csv +assert num_mod == num_files, f"Expected {num_mod} files, but found {num_files}." +assert input_mu.n_mod == num_mod, ( + f"Expected {num_mod} modalities in {par['orig_input']} got {input_mu.n_mod} modalities." +) + +rna_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[1][1])) +prot_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[2][1])) + +# Check if the files exist and if the modality name is in the file name +for i, row in enumerate(data): + if i == 0: + continue + # Check if the files exist and if the modality name is in the file name + assert row[0] in row[1], f"Expected {row[0]} to be in {row[1]}." + mod_fp = os.path.join(par["mod_dir"], row[1]) + assert os.path.exists(mod_fp), f"Expected {row[1]} to exist." + # Check modality is correct in the h5mu file + mod_mu = mu.read_h5mu(mod_fp) + assert mod_mu.n_mod == 1, f"Expected 1 modality in {row[1]}." + assert row[0] in mod_mu.mod.keys(), f"Expected {row[0]} to be the mod in {row[1]}." + assert row[0] in input_mu.mod.keys(), ( + f"Expected {row[0]} to be a mod in {par['orig_input']}." + ) + +# Check if extracted modalities are equal to the original modalities +assert_annotation_objects_equal(rna_mod.mod["rna"], input_mu.mod["rna"]) +assert_annotation_objects_equal(prot_mod.mod["prot"], input_mu.mod["prot"]) + +print("Test successful!", flush=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/multiomics/split_modalities_test", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow.config b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow.config new file mode 100644 index 00000000..5b76b395 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/multiomics/split_modalities_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of split_modalities.' + author = 'Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/multiomics/split_modalities_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/qc/qc_test/.config.vsh.yaml b/target/_test/nextflow/test_workflows/qc/qc_test/.config.vsh.yaml new file mode 100644 index 00000000..fa6c1030 --- /dev/null +++ b/target/_test/nextflow/test_workflows/qc/qc_test/.config.vsh.yaml @@ -0,0 +1,214 @@ +name: "qc_test" +namespace: "test_workflows/qc" +version: "main" +authors: +- name: "Jakub Majercik" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to h5mu output." + info: null + example: + - "foo.final.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--og_input" + description: "Path to the original h5mu file." + info: null + example: + - "foo.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component test the output of the integration test of the QC workflow." +info: null +status: "enabled" +scope: + image: "test" + target: "test" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/workflows/test_workflows/qc/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/_test/nextflow/test_workflows/qc/qc_test" + executable: "target/_test/nextflow/test_workflows/qc/qc_test/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/_test/nextflow/test_workflows/qc/qc_test/main.nf b/target/_test/nextflow/test_workflows/qc/qc_test/main.nf new file mode 100644 index 00000000..0ee37cc9 --- /dev/null +++ b/target/_test/nextflow/test_workflows/qc/qc_test/main.nf @@ -0,0 +1,3911 @@ +// qc_test main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "qc_test", + "namespace" : "test_workflows/qc", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to h5mu output.", + "example" : [ + "foo.final.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--og_input", + "description" : "Path to the original h5mu file.", + "example" : [ + "foo.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component test the output of the integration test of the QC workflow.", + "status" : "enabled", + "scope" : { + "image" : "test", + "target" : "test" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/test_workflows/qc/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/_test/nextflow/test_workflows/qc/qc_test", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import pytest +import sys +from mudata import read_h5mu +from openpipeline_testutils.asserters import assert_annotation_objects_equal +from openpipeline_testutils.utils import remove_annotation_column + + +##VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'og_input': $( if [ ! -z ${VIASH_PAR_OG_INPUT+x} ]; then echo "r'${VIASH_PAR_OG_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["og_input"]) + output_mudata = read_h5mu(par["input"]) + + assert input_mudata.n_mod == output_mudata.n_mod, ( + "Number of modalities should be the same" + ) + assert input_mudata.mod.keys() == output_mudata.mod.keys(), ( + "Modalities should be the same" + ) + assert list(output_mudata.mod.keys()) == [ + "rna", + "atac", + ], "Modalities should be rna and atac" + + obs_cols_to_remove = [] + for top_n_vars in ("50", "100", "200", "500"): + obs_cols_to_remove.append(f"pct_of_counts_in_top_{top_n_vars}_vars") + + obs_cols_to_remove.extend( + [ + "total_counts", + "num_nonzero_vars", + "fraction_mitochondrial", + "fraction_ribosomal", + "total_counts_mitochondrial", + "total_counts_ribosomal", + "pct_mitochondrial", + "pct_ribosomal", + ] + ) + var_cols_to_remove = [ + "obs_mean", + "total_counts", + "num_nonzero_obs", + "pct_dropout", + "mitochondrial", + "ribosomal", + ] + + assert set(obs_cols_to_remove).issubset( + set(output_mudata.mod["rna"].obs.columns.to_list()) + ) + assert set(var_cols_to_remove).issubset( + set(output_mudata.mod["rna"].var.columns.to_list()) + ) + + initial_mudata = remove_annotation_column( + output_mudata, obs_cols_to_remove, axis="obs", modality_name="rna" + ) + initial_mudata = remove_annotation_column( + initial_mudata, var_cols_to_remove, axis="var", modality_name="rna" + ) + + assert_annotation_objects_equal(input_mudata, initial_mudata) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/test_workflows/qc/qc_test", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/_test/nextflow/test_workflows/qc/qc_test/nextflow.config b/target/_test/nextflow/test_workflows/qc/qc_test/nextflow.config new file mode 100644 index 00000000..f9a6a9de --- /dev/null +++ b/target/_test/nextflow/test_workflows/qc/qc_test/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'test_workflows/qc/qc_test' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component test the output of the integration test of the QC workflow.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/_test/nextflow/test_workflows/qc/qc_test/nextflow_labels.config b/target/_test/nextflow/test_workflows/qc/qc_test/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/qc/qc_test/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/_test/nextflow/test_workflows/qc/qc_test/setup_logger.py b/target/_test/nextflow/test_workflows/qc/qc_test/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/_test/nextflow/test_workflows/qc/qc_test/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/celltypist/.config.vsh.yaml b/target/executable/annotate/celltypist/.config.vsh.yaml new file mode 100644 index 00000000..57b2faba --- /dev/null +++ b/target/executable/annotate/celltypist/.config.vsh.yaml @@ -0,0 +1,471 @@ +name: "celltypist" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data containing log normalized counts to\ + \ be used for cell type annotation if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + default: + - "cell_ontology_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--feature_selection" + description: "Whether to perform feature selection." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--majority_voting" + description: "Whether to refine the predicted labels by running the majority voting\ + \ classifier after over-clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--C" + description: "Inverse of regularization strength in logistic regression." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations before reaching the minimum of the\ + \ cost function." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--use_SGD" + description: "Whether to use the stochastic gradient descent algorithm." + info: null + direction: "input" + - type: "double" + name: "--min_prop" + description: "\"For the dominant cell type within a subcluster, the minimum proportion\ + \ of cells required to \nsupport naming of the subcluster by this cell type.\ + \ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\ + \ this proportion threshold will be assigned 'Heterogeneous'.\"\n" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "celltypist_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "celltypist_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of logistic regression classifiers optimised by the stochastic gradient descent\ + \ algorithm." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + upgrade: true + - type: "python" + user: false + packages: + - "celltypist==1.6.3" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/celltypist/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/celltypist" + executable: "target/executable/annotate/celltypist/celltypist" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/celltypist/celltypist b/target/executable/annotate/celltypist/celltypist new file mode 100755 index 00000000..82cda7d8 --- /dev/null +++ b/target/executable/annotate/celltypist/celltypist @@ -0,0 +1,1702 @@ +#!/usr/bin/env bash + +# celltypist main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="celltypist" +VIASH_META_FUNCTIONALITY_NAME="celltypist" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scanpy~=1.10.4" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "celltypist==1.6.3" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component annotate celltypist" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "celltypist main" + echo "" + echo "Automated cell type annotation tool for scRNA-seq datasets on the basis of" + echo "logistic regression classifiers optimised by the stochastic gradient descent" + echo "algorithm." + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input data containing log normalized counts to be used" + echo " for cell type annotation if .X is not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the input data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data to be used for cell type annotation if" + echo " .X is not to be used. Data are expected to be processed in the same way" + echo " as the --input query dataset." + echo "" + echo " --reference_obs_target" + echo " type: string" + echo " default: cell_ontology_class" + echo " The name of the adata obs column in the reference data containing cell" + echo " type annotations." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the reference data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo "Model arguments:" + echo " Model arguments." + echo "" + echo " --model" + echo " type: file, file must exist" + echo " example: pretrained_model.pkl" + echo " Pretrained model in pkl format. If not provided, the model will be" + echo " trained on the reference data and --reference should be provided." + echo "" + echo " --feature_selection" + echo " type: boolean" + echo " default: false" + echo " Whether to perform feature selection." + echo "" + echo " --majority_voting" + echo " type: boolean" + echo " default: false" + echo " Whether to refine the predicted labels by running the majority voting" + echo " classifier after over-clustering." + echo "" + echo " --C" + echo " type: double" + echo " default: 1.0" + echo " Inverse of regularization strength in logistic regression." + echo "" + echo " --max_iter" + echo " type: integer" + echo " default: 1000" + echo " Maximum number of iterations before reaching the minimum of the cost" + echo " function." + echo "" + echo " --use_SGD" + echo " type: boolean_true" + echo " Whether to use the stochastic gradient descent algorithm." + echo "" + echo " --min_prop" + echo " type: double" + echo " default: 0.0" + echo " \"For the dominant cell type within a subcluster, the minimum proportion" + echo " of cells required to" + echo " support naming of the subcluster by this cell type. Ignored if" + echo " majority_voting is set to False." + echo " Subcluster that fails to pass this proportion threshold will be assigned" + echo " 'Heterogeneous'.\"" + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obs_predictions" + echo " type: string" + echo " default: celltypist_pred" + echo " In which \`.obs\` slots to store the predicted information." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: celltypist_probability" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "celltypist main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_target) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_target. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_target=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target=*\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_input) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_input=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input=*\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --feature_selection) + [ -n "$VIASH_PAR_FEATURE_SELECTION" ] && ViashError Bad arguments for option \'--feature_selection\': \'$VIASH_PAR_FEATURE_SELECTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_SELECTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_selection. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_selection=*) + [ -n "$VIASH_PAR_FEATURE_SELECTION" ] && ViashError Bad arguments for option \'--feature_selection=*\': \'$VIASH_PAR_FEATURE_SELECTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_SELECTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --majority_voting) + [ -n "$VIASH_PAR_MAJORITY_VOTING" ] && ViashError Bad arguments for option \'--majority_voting\': \'$VIASH_PAR_MAJORITY_VOTING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAJORITY_VOTING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --majority_voting. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --majority_voting=*) + [ -n "$VIASH_PAR_MAJORITY_VOTING" ] && ViashError Bad arguments for option \'--majority_voting=*\': \'$VIASH_PAR_MAJORITY_VOTING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAJORITY_VOTING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --C) + [ -n "$VIASH_PAR_C" ] && ViashError Bad arguments for option \'--C\': \'$VIASH_PAR_C\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_C="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --C. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --C=*) + [ -n "$VIASH_PAR_C" ] && ViashError Bad arguments for option \'--C=*\': \'$VIASH_PAR_C\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_C=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_iter) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_iter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_iter=*) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter=*\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --use_SGD) + [ -n "$VIASH_PAR_USE_SGD" ] && ViashError Bad arguments for option \'--use_SGD\': \'$VIASH_PAR_USE_SGD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_SGD=true + shift 1 + ;; + --min_prop) + [ -n "$VIASH_PAR_MIN_PROP" ] && ViashError Bad arguments for option \'--min_prop\': \'$VIASH_PAR_MIN_PROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_PROP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_prop. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_prop=*) + [ -n "$VIASH_PAR_MIN_PROP" ] && ViashError Bad arguments for option \'--min_prop=*\': \'$VIASH_PAR_MIN_PROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_PROP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/celltypist:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then + VIASH_PAR_REFERENCE_OBS_TARGET="cell_ontology_class" +fi +if [ -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then + VIASH_PAR_FEATURE_SELECTION="false" +fi +if [ -z ${VIASH_PAR_MAJORITY_VOTING+x} ]; then + VIASH_PAR_MAJORITY_VOTING="false" +fi +if [ -z ${VIASH_PAR_C+x} ]; then + VIASH_PAR_C="1.0" +fi +if [ -z ${VIASH_PAR_MAX_ITER+x} ]; then + VIASH_PAR_MAX_ITER="1000" +fi +if [ -z ${VIASH_PAR_USE_SGD+x} ]; then + VIASH_PAR_USE_SGD="false" +fi +if [ -z ${VIASH_PAR_MIN_PROP+x} ]; then + VIASH_PAR_MIN_PROP="0.0" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="celltypist_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="celltypist_probability" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FEATURE_SELECTION" ]]; then + if ! [[ "$VIASH_PAR_FEATURE_SELECTION" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--feature_selection' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAJORITY_VOTING" ]]; then + if ! [[ "$VIASH_PAR_MAJORITY_VOTING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--majority_voting' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_C" ]]; then + if ! [[ "$VIASH_PAR_C" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--C' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_ITER" ]]; then + if ! [[ "$VIASH_PAR_MAX_ITER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_iter' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_SGD" ]]; then + if ! [[ "$VIASH_PAR_USE_SGD" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_SGD' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_PROP" ]]; then + if ! [[ "$VIASH_PAR_MIN_PROP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_prop' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-celltypist-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import celltypist +import mudata as mu +import anndata as ad +import pandas as pd +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'feature_selection': $( if [ ! -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then echo "r'${VIASH_PAR_FEATURE_SELECTION//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'majority_voting': $( if [ ! -z ${VIASH_PAR_MAJORITY_VOTING+x} ]; then echo "r'${VIASH_PAR_MAJORITY_VOTING//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'C': $( if [ ! -z ${VIASH_PAR_C+x} ]; then echo "float(r'${VIASH_PAR_C//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'use_SGD': $( if [ ! -z ${VIASH_PAR_USE_SGD+x} ]; then echo "r'${VIASH_PAR_USE_SGD//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'min_prop': $( if [ ! -z ${VIASH_PAR_MIN_PROP+x} ]; then echo "float(r'${VIASH_PAR_MIN_PROP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def check_celltypist_format(indata): + if np.abs(np.expm1(indata[0]).sum() - 10000) > 1: + return False + return True + + +def main(par): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Provide correct format of query data for celltypist annotation + ## Sanitize gene names and set as index + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + ## Fetch lognormalized counts + lognorm_counts = ( + input_modality.layers[par["input_layer"]].copy() + if par["input_layer"] + else input_modality.X.copy() + ) + ## Create AnnData object + input_modality = ad.AnnData( + X=lognorm_counts, var=pd.DataFrame(index=input_modality.var.index) + ) + + if par["model"]: + logger.info("Loading CellTypist model") + model = celltypist.models.Model.load(par["model"]) + cross_check_genes( + input_modality.var.index, + model.features, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + elif par["reference"]: + reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]] + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + # CellTypist requires query gene names to be in index + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # Ensure enough overlap between genes in query and reference + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + + labels = reference_modality.obs[par["reference_obs_target"]] + + logger.info("Training CellTypist model on reference") + model = celltypist.train( + reference_matrix, + labels=labels, + genes=reference_modality.var.index, + C=par["C"], + max_iter=par["max_iter"], + use_SGD=par["use_SGD"], + feature_selection=par["feature_selection"], + check_expression=True, + ) + + logger.info("Predicting CellTypist annotations") + predictions = celltypist.annotate( + input_modality, model, majority_voting=par["majority_voting"] + ) + + input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[ + "predicted_labels" + ].values + input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max( + axis=1 + ).values + + # copy observations back to input data (with full set of features) + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/celltypist/cross_check_genes.py b/target/executable/annotate/celltypist/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/executable/annotate/celltypist/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/executable/annotate/celltypist/nextflow_labels.config b/target/executable/annotate/celltypist/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/celltypist/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/celltypist/set_var_index.py b/target/executable/annotate/celltypist/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/annotate/celltypist/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/annotate/celltypist/setup_logger.py b/target/executable/annotate/celltypist/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/celltypist/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/celltypist/subset_vars.py b/target/executable/annotate/celltypist/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/annotate/celltypist/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/annotate/onclass/.config.vsh.yaml b/target/executable/annotate/onclass/.config.vsh.yaml new file mode 100644 index 00000000..15dfad5a --- /dev/null +++ b/target/executable/annotate/onclass/.config.vsh.yaml @@ -0,0 +1,442 @@ +name: "onclass" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Ontology" + description: "Ontology input files" + arguments: + - type: "file" + name: "--cl_nlp_emb_file" + description: "The .nlp.emb file with the cell type embeddings." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cl_ontology_file" + description: "The .ontology file with the cell type ontology." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cl_obo_file" + description: "The .obo file with the cell type ontology." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + example: + - "cell_ontology_class" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unknown_celltype" + description: "Label for unknown cell types.\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "onclass_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "onclass_prob" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments" + arguments: + - type: "string" + name: "--model" + description: "\"Pretrained model path without a file extension. If not provided,\ + \ the model will be trained \non the reference data and --reference should be\ + \ provided. The path namespace should contain:\n - a .npz or .pkl file\n -\ + \ a .data file\n - a .meta file\n - a .index file\ne.g. /path/to/model/pretrained_model_target1\ + \ as saved by OnClass.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations for training the model." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "OnClass is a python package for single-cell cell type annotation. It\ + \ uses the Cell Ontology to capture the cell type similarity. \nThese similarities\ + \ enable OnClass to annotate cell types that are never seen in the training data.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "OnClass~=1.3" + - "tensorflow" + - "obonet" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/onclass/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/onclass" + executable: "target/executable/annotate/onclass/onclass" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/onclass/cross_check_genes.py b/target/executable/annotate/onclass/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/executable/annotate/onclass/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/executable/annotate/onclass/nextflow_labels.config b/target/executable/annotate/onclass/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/onclass/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/onclass/onclass b/target/executable/annotate/onclass/onclass new file mode 100755 index 00000000..ee3d8021 --- /dev/null +++ b/target/executable/annotate/onclass/onclass @@ -0,0 +1,1742 @@ +#!/usr/bin/env bash + +# onclass main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="onclass" +VIASH_META_FUNCTIONALITY_NAME="onclass" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "OnClass~=1.3" "tensorflow" "obonet" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component annotate onclass" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "onclass main" + echo "" + echo "OnClass is a python package for single-cell cell type annotation. It uses the" + echo "Cell Ontology to capture the cell type similarity." + echo "These similarities enable OnClass to annotate cell types that are never seen in" + echo "the training data." + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input data to be used for cell type annotation if .X is" + echo " not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the input data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo "Ontology:" + echo " Ontology input files" + echo "" + echo " --cl_nlp_emb_file" + echo " type: file, required parameter, file must exist" + echo " The .nlp.emb file with the cell type embeddings." + echo "" + echo " --cl_ontology_file" + echo " type: file, required parameter, file must exist" + echo " The .ontology file with the cell type ontology." + echo "" + echo " --cl_obo_file" + echo " type: file, required parameter, file must exist" + echo " The .obo file with the cell type ontology." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data to be used for cell type annotation if" + echo " .X is not to be used." + echo "" + echo " --reference_obs_target" + echo " type: string, required parameter" + echo " example: cell_ontology_class" + echo " The name of the adata obs column in the reference data containing cell" + echo " type annotations." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the reference data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo " --unknown_celltype" + echo " type: string" + echo " default: Unknown" + echo " Label for unknown cell types." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obs_predictions" + echo " type: string" + echo " default: onclass_pred" + echo " In which \`.obs\` slots to store the predicted information." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: onclass_prob" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Model arguments:" + echo " Model arguments" + echo "" + echo " --model" + echo " type: string" + echo " \"Pretrained model path without a file extension. If not provided, the" + echo " model will be trained" + echo " on the reference data and --reference should be provided. The path" + echo " namespace should contain:" + echo " - a .npz or .pkl file" + echo " - a .data file" + echo " - a .meta file" + echo " - a .index file" + echo " e.g. /path/to/model/pretrained_model_target1 as saved by OnClass.\"" + echo "" + echo " --max_iter" + echo " type: integer" + echo " default: 30" + echo " Maximum number of iterations for training the model." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "onclass main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cl_nlp_emb_file) + [ -n "$VIASH_PAR_CL_NLP_EMB_FILE" ] && ViashError Bad arguments for option \'--cl_nlp_emb_file\': \'$VIASH_PAR_CL_NLP_EMB_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_NLP_EMB_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cl_nlp_emb_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cl_nlp_emb_file=*) + [ -n "$VIASH_PAR_CL_NLP_EMB_FILE" ] && ViashError Bad arguments for option \'--cl_nlp_emb_file=*\': \'$VIASH_PAR_CL_NLP_EMB_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_NLP_EMB_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cl_ontology_file) + [ -n "$VIASH_PAR_CL_ONTOLOGY_FILE" ] && ViashError Bad arguments for option \'--cl_ontology_file\': \'$VIASH_PAR_CL_ONTOLOGY_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_ONTOLOGY_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cl_ontology_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cl_ontology_file=*) + [ -n "$VIASH_PAR_CL_ONTOLOGY_FILE" ] && ViashError Bad arguments for option \'--cl_ontology_file=*\': \'$VIASH_PAR_CL_ONTOLOGY_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_ONTOLOGY_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cl_obo_file) + [ -n "$VIASH_PAR_CL_OBO_FILE" ] && ViashError Bad arguments for option \'--cl_obo_file\': \'$VIASH_PAR_CL_OBO_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_OBO_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cl_obo_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cl_obo_file=*) + [ -n "$VIASH_PAR_CL_OBO_FILE" ] && ViashError Bad arguments for option \'--cl_obo_file=*\': \'$VIASH_PAR_CL_OBO_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CL_OBO_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_target) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_target. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_target=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target=*\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_input) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_input=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input=*\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --unknown_celltype) + [ -n "$VIASH_PAR_UNKNOWN_CELLTYPE" ] && ViashError Bad arguments for option \'--unknown_celltype\': \'$VIASH_PAR_UNKNOWN_CELLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKNOWN_CELLTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --unknown_celltype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --unknown_celltype=*) + [ -n "$VIASH_PAR_UNKNOWN_CELLTYPE" ] && ViashError Bad arguments for option \'--unknown_celltype=*\': \'$VIASH_PAR_UNKNOWN_CELLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKNOWN_CELLTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_iter) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_iter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_iter=*) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter=*\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/onclass:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_CL_NLP_EMB_FILE+x} ]; then + ViashError '--cl_nlp_emb_file' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_CL_ONTOLOGY_FILE+x} ]; then + ViashError '--cl_ontology_file' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_CL_OBO_FILE+x} ]; then + ViashError '--cl_obo_file' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then + ViashError '--reference_obs_target' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_UNKNOWN_CELLTYPE+x} ]; then + VIASH_PAR_UNKNOWN_CELLTYPE="Unknown" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="onclass_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="onclass_prob" +fi +if [ -z ${VIASH_PAR_MAX_ITER+x} ]; then + VIASH_PAR_MAX_ITER="30" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CL_NLP_EMB_FILE" ] && [ ! -e "$VIASH_PAR_CL_NLP_EMB_FILE" ]; then + ViashError "Input file '$VIASH_PAR_CL_NLP_EMB_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CL_ONTOLOGY_FILE" ] && [ ! -e "$VIASH_PAR_CL_ONTOLOGY_FILE" ]; then + ViashError "Input file '$VIASH_PAR_CL_ONTOLOGY_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CL_OBO_FILE" ] && [ ! -e "$VIASH_PAR_CL_OBO_FILE" ]; then + ViashError "Input file '$VIASH_PAR_CL_OBO_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_ITER" ]]; then + if ! [[ "$VIASH_PAR_MAX_ITER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_iter' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_CL_NLP_EMB_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CL_NLP_EMB_FILE")" ) + VIASH_PAR_CL_NLP_EMB_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_CL_NLP_EMB_FILE") +fi +if [ ! -z "$VIASH_PAR_CL_ONTOLOGY_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CL_ONTOLOGY_FILE")" ) + VIASH_PAR_CL_ONTOLOGY_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_CL_ONTOLOGY_FILE") +fi +if [ ! -z "$VIASH_PAR_CL_OBO_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CL_OBO_FILE")" ) + VIASH_PAR_CL_OBO_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_CL_OBO_FILE") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-onclass-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import numpy as np +from OnClass.OnClassModel import OnClassModel +import obonet +from typing import Dict, List, Tuple + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cl_nlp_emb_file': $( if [ ! -z ${VIASH_PAR_CL_NLP_EMB_FILE+x} ]; then echo "r'${VIASH_PAR_CL_NLP_EMB_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cl_ontology_file': $( if [ ! -z ${VIASH_PAR_CL_ONTOLOGY_FILE+x} ]; then echo "r'${VIASH_PAR_CL_ONTOLOGY_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cl_obo_file': $( if [ ! -z ${VIASH_PAR_CL_OBO_FILE+x} ]; then echo "r'${VIASH_PAR_CL_OBO_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'unknown_celltype': $( if [ ! -z ${VIASH_PAR_UNKNOWN_CELLTYPE+x} ]; then echo "r'${VIASH_PAR_UNKNOWN_CELLTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def map_celltype_to_ontology_id( + cl_obo_file: str, +) -> Tuple[Dict[str, str], Dict[str, str]]: + """ + Map cell type names to ontology IDs and vice versa. + + Parameters + ---------- + cl_obo_file : str + Path to the cell ontology file. + + Returns + ------- + Tuple[Dict[str, str], Dict[str, str]] + A tuple of two dictionaries. The first dictionary maps cell ontology IDs to cell type names. + The second dictionary maps cell type names to cell ontology IDs. + """ + graph = obonet.read_obo(cl_obo_file) + cl_id_to_name = {id_: data.get("name") for id_, data in graph.nodes(data=True)} + cl_id_to_name = {k: v for k, v in cl_id_to_name.items() if v is not None} + name_to_cl_id = {v: k for k, v in cl_id_to_name.items()} + return cl_id_to_name, name_to_cl_id + + +def cell_type_prediction( + model: OnClassModel, + input_matrix: np.array, + input_features: List[str], + id_to_name: dict, +) -> Tuple[List[str], List[float]]: + """ + Predict cell types for input data and save results to Anndata obj. + + Parameters + ---------- + model : OnClassModel + The OnClass model. + input_matrix : np.array + The input data matrix. + input_modality : ad.AnnData + The input data Anndata object. + id_to_name : dict + Dictionary mapping cell ontology IDs to cell type names. + + Returns + ------- + predictions: List[str] + The predicted cell types. + probabilities: List[float] + Probabilities of the predicted cell types. + """ + corr_test_feature = model.ProcessTestFeature( + test_feature=input_matrix, + test_genes=input_features, + log_transform=False, + ) + onclass_pred = model.Predict( + corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0 + ) + pred_label = [model.i2co[ind] for ind in onclass_pred[2]] + pred_cell_type_label = [id_to_name[id] for id in pred_label] + prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1) + + return pred_cell_type_label, prob_cell_type_label + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + # Onclass needs dense matrix format + input_matrix = input_matrix.toarray() + + id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"]) + + if par["model"]: + logger.info("Predicting cell types using pre-trained model") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + model.BuildModel(use_pretrain=par["model"], ngene=None) + cross_check_genes( + model.genes, input_modality.var.index, par["input_reference_gene_overlap"] + ) + + elif par["reference"]: + logger.info("Reading reference data") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + # Onclass needs dense matrix format + reference_matrix = reference_matrix.toarray() + + logger.info("Training a model from reference...") + + labels = reference_modality.obs[par["reference_obs_target"]].tolist() + labels_cl = [ + name_to_id[label] if label in name_to_id else par["unknown_celltype"] + for label in labels + ] + + _ = model.EmbedCellTypes(labels_cl) + corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature( + train_feature=reference_matrix, + train_label=labels_cl, + train_genes=reference_modality.var.index, + test_feature=input_matrix, + test_genes=input_modality.var.index, + log_transform=False, + ) + model.BuildModel(ngene=len(corr_train_genes)) + model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"]) + + logger.info("Predicting cell types") + predictions, probabilities = cell_type_prediction( + model, input_matrix, input_modality.var.index, id_to_name + ) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_CL_NLP_EMB_FILE" ]; then + VIASH_PAR_CL_NLP_EMB_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_CL_NLP_EMB_FILE") + fi + if [ ! -z "$VIASH_PAR_CL_ONTOLOGY_FILE" ]; then + VIASH_PAR_CL_ONTOLOGY_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_CL_ONTOLOGY_FILE") + fi + if [ ! -z "$VIASH_PAR_CL_OBO_FILE" ]; then + VIASH_PAR_CL_OBO_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_CL_OBO_FILE") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/onclass/set_var_index.py b/target/executable/annotate/onclass/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/annotate/onclass/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/annotate/onclass/setup_logger.py b/target/executable/annotate/onclass/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/onclass/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/onclass/subset_vars.py b/target/executable/annotate/onclass/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/annotate/onclass/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/annotate/popv/.config.vsh.yaml b/target/executable/annotate/popv/.config.vsh.yaml new file mode 100644 index 00000000..e7c792f4 --- /dev/null +++ b/target/executable/annotate/popv/.config.vsh.yaml @@ -0,0 +1,408 @@ +name: "popv" +namespace: "annotate" +version: "main" +authors: +- name: "Matthias Beyens" + roles: + - "author" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input (aka query) dataset." + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Which layer to use. If no value is provided, the counts are assumed\ + \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "Key in obs field of input adata for batch information. If no value\ + \ is provided, batch label is assumed to be unknown." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_subset" + description: "Subset the input object with this column." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "Key in obs field of input adata for label information. This is only\ + \ used for training scANVI. Unlabelled cells should be set to `\"unknown_celltype_label\"\ + `." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unknown_celltype_label" + description: "If `input_obs_label` is specified, cells with this value will be\ + \ treated as unknown and will be predicted by the model." + info: null + default: + - "unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "User-provided reference tissue. The data that will be used as reference\ + \ to call cell types." + info: null + example: + - "TS_Bladder_filtered.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "Which layer to use. If no value is provided, the counts are assumed\ + \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_label" + description: "Key in obs field of reference AnnData with cell-type information." + info: null + default: + - "cell_ontology_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch" + description: "Key in obs field of input adata for batch information." + info: null + default: + - "donor_assay" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Other arguments." + arguments: + - type: "string" + name: "--methods" + description: "Methods to call cell types. By default, runs to knn_on_scvi and\ + \ scanvi." + info: null + example: + - "knn_on_scvi" + - "scanvi" + required: true + choices: + - "celltypist" + - "knn_on_bbknn" + - "knn_on_scanorama" + - "knn_on_scvi" + - "onclass" + - "rf" + - "scanvi" + - "svm" + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs popular major vote cell typing on single cell sequence data\ + \ using multiple algorithms. Note that this is a one-shot version of PopV." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "CFLAGS=\"-mno-avx512f -mno-avx2\"" + - "CPPFLAGS=\"-mno-avx512f -mno-avx2\"" + - type: "apt" + packages: + - "procps" + - "git" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "popv~=0.4.2" + - "numpy<2" + - "setuptools" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "docker" + run: + - "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/popv/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/popv" + executable: "target/executable/annotate/popv/popv" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/popv/nextflow_labels.config b/target/executable/annotate/popv/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/popv/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/popv/popv b/target/executable/annotate/popv/popv new file mode 100755 index 00000000..d7852570 --- /dev/null +++ b/target/executable/annotate/popv/popv @@ -0,0 +1,1616 @@ +#!/usr/bin/env bash + +# popv main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Matthias Beyens (author) +# * Robrecht Cannoodt (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="popv" +VIASH_META_FUNCTIONALITY_NAME="popv" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +ENV CFLAGS="-mno-avx512f -mno-avx2" +ENV CPPFLAGS="-mno-avx512f -mno-avx2" +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git build-essential && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "popv~=0.4.2" "numpy<2" "setuptools" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git + +LABEL org.opencontainers.image.authors="Matthias Beyens, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component annotate popv" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "popv main" + echo "" + echo "Performs popular major vote cell typing on single cell sequence data using" + echo "multiple algorithms. Note that this is a one-shot version of PopV." + echo "" + echo "Inputs:" + echo " Arguments related to the input (aka query) dataset." + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Which layer to use. If no value is provided, the counts are assumed to" + echo " be in the \`.X\` slot. Otherwise, count data is expected to be in" + echo " \`.layers[input_layer]\`." + echo "" + echo " --input_obs_batch" + echo " type: string" + echo " Key in obs field of input adata for batch information. If no value is" + echo " provided, batch label is assumed to be unknown." + echo "" + echo " --input_var_subset" + echo " type: string" + echo " Subset the input object with this column." + echo "" + echo " --input_obs_label" + echo " type: string" + echo " Key in obs field of input adata for label information. This is only used" + echo " for training scANVI. Unlabelled cells should be set to" + echo " \`\"unknown_celltype_label\"\`." + echo "" + echo " --unknown_celltype_label" + echo " type: string" + echo " default: unknown" + echo " If \`input_obs_label\` is specified, cells with this value will be treated" + echo " as unknown and will be predicted by the model." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: TS_Bladder_filtered.h5ad" + echo " User-provided reference tissue. The data that will be used as reference" + echo " to call cell types." + echo "" + echo " --reference_layer" + echo " type: string" + echo " Which layer to use. If no value is provided, the counts are assumed to" + echo " be in the \`.X\` slot. Otherwise, count data is expected to be in" + echo " \`.layers[reference_layer]\`." + echo "" + echo " --reference_obs_label" + echo " type: string" + echo " default: cell_ontology_class" + echo " Key in obs field of reference AnnData with cell-type information." + echo "" + echo " --reference_obs_batch" + echo " type: string" + echo " default: donor_assay" + echo " Key in obs field of input adata for batch information." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " Other arguments." + echo "" + echo " --methods" + echo " type: string, required parameter, multiple values allowed" + echo " example: knn_on_scvi;scanvi" + echo " choices: [ celltypist, knn_on_bbknn, knn_on_scanorama, knn_on_scvi," + echo "onclass, rf, scanvi, svm ]" + echo " Methods to call cell types. By default, runs to knn_on_scvi and scanvi." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "popv main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_batch) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_batch=*) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch=*\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_subset) + [ -n "$VIASH_PAR_INPUT_VAR_SUBSET" ] && ViashError Bad arguments for option \'--input_var_subset\': \'$VIASH_PAR_INPUT_VAR_SUBSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_SUBSET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_subset. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_subset=*) + [ -n "$VIASH_PAR_INPUT_VAR_SUBSET" ] && ViashError Bad arguments for option \'--input_var_subset=*\': \'$VIASH_PAR_INPUT_VAR_SUBSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_SUBSET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_label) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_label=*) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label=*\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --unknown_celltype_label) + [ -n "$VIASH_PAR_UNKNOWN_CELLTYPE_LABEL" ] && ViashError Bad arguments for option \'--unknown_celltype_label\': \'$VIASH_PAR_UNKNOWN_CELLTYPE_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKNOWN_CELLTYPE_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --unknown_celltype_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --unknown_celltype_label=*) + [ -n "$VIASH_PAR_UNKNOWN_CELLTYPE_LABEL" ] && ViashError Bad arguments for option \'--unknown_celltype_label=*\': \'$VIASH_PAR_UNKNOWN_CELLTYPE_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKNOWN_CELLTYPE_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_label) + [ -n "$VIASH_PAR_REFERENCE_OBS_LABEL" ] && ViashError Bad arguments for option \'--reference_obs_label\': \'$VIASH_PAR_REFERENCE_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_label=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_LABEL" ] && ViashError Bad arguments for option \'--reference_obs_label=*\': \'$VIASH_PAR_REFERENCE_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_batch) + [ -n "$VIASH_PAR_REFERENCE_OBS_BATCH" ] && ViashError Bad arguments for option \'--reference_obs_batch\': \'$VIASH_PAR_REFERENCE_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_batch=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_BATCH" ] && ViashError Bad arguments for option \'--reference_obs_batch=*\': \'$VIASH_PAR_REFERENCE_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --methods) + if [ -z "$VIASH_PAR_METHODS" ]; then + VIASH_PAR_METHODS="$2" + else + VIASH_PAR_METHODS="$VIASH_PAR_METHODS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --methods. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --methods=*) + if [ -z "$VIASH_PAR_METHODS" ]; then + VIASH_PAR_METHODS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_METHODS="$VIASH_PAR_METHODS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/popv:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_METHODS+x} ]; then + ViashError '--methods' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL+x} ]; then + VIASH_PAR_UNKNOWN_CELLTYPE_LABEL="unknown" +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then + VIASH_PAR_REFERENCE_OBS_LABEL="cell_ontology_class" +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then + VIASH_PAR_REFERENCE_OBS_BATCH="donor_assay" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_METHODS" ]; then + VIASH_PAR_METHODS_CHOICES=("celltypist;knn_on_bbknn;knn_on_scanorama;knn_on_scvi;onclass;rf;scanvi;svm") + IFS=';' + set -f + for val in $VIASH_PAR_METHODS; do + if ! [[ ";${VIASH_PAR_METHODS_CHOICES[*]};" =~ ";${val};" ]]; then + ViashError '--methods' specified value of \'${val}\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-popv-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import re +import tempfile +import typing +import numpy as np +import mudata as mu +import anndata as ad +import popv + +# todo: is this still needed? +from torch.cuda import is_available as cuda_is_available + +try: + from torch.backends.mps import is_available as mps_is_available +except ModuleNotFoundError: + # Older pytorch versions + # MacOS GPUs + def mps_is_available(): + return False + + +# where to find the obo files +cl_obo_folder = "/opt/PopV/resources/ontology/" + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_subset': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_SUBSET+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_SUBSET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'unknown_celltype_label': $( if [ ! -z ${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL+x} ]; then echo "r'${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_label': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_batch': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'methods': $( if [ ! -z ${VIASH_PAR_METHODS+x} ]; then echo "r'${VIASH_PAR_METHODS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +use_gpu = cuda_is_available() +logger.info("GPU enabled? %s", use_gpu) + + +# Helper functions +def get_X( + adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str] +): + """Fetch the counts data from X or a layer. Subset columns by var_index if so desired.""" + if var_index: + adata = adata[:, var_index] + if layer: + return adata.layers[layer] + else: + return adata.X + + +def get_obs(adata: ad.AnnData, obs_par_names): + """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch.""" + obs_columns = [par[x] for x in obs_par_names if par[x]] + return adata.obs[obs_columns] + + +def get_var(adata: ad.AnnData, var_index: list[str]): + """Fetch the var dataframe. Subset rows by var_index if so desired.""" + return adata.var.loc[var_index] + + +def main(par, meta): + assert len(par["methods"]) >= 1, ( + "Please, specify at least one method for cell typing." + ) + logger.info("Cell typing methods: {}".format(par["methods"])) + + ### PREPROCESSING REFERENCE ### + logger.info("### PREPROCESSING REFERENCE ###") + + # take a look at reference data + logger.info("Reading reference data '%s'", par["reference"]) + reference = ad.read_h5ad(par["reference"]) + + logger.info("Setting reference var index to Ensembl IDs") + reference.var["gene_symbol"] = list(reference.var.index) + reference.var.index = [ + re.sub("\\\\.[0-9]+\$", "", s) for s in reference.var["ensemblid"] + ] + + logger.info("Detect number of samples per label") + min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size()) + n_samples_per_label = np.max((min_celltype_size, 100)) + + ### PREPROCESSING INPUT ### + logger.info("### PREPROCESSING INPUT ###") + logger.info("Reading '%s'", par["input"]) + input = mu.read_h5mu(par["input"]) + input_modality = input.mod[par["modality"]] + + # subset with var column + if par["input_var_subset"]: + logger.info("Subset input with .var['%s']", par["input_var_subset"]) + assert par["input_var_subset"] in input_modality.var, ( + f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var" + ) + input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]] + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + logger.info("Detecting common vars based on ensembl ids") + common_ens_ids = list( + set(reference.var.index).intersection(set(input_modality.var.index)) + ) + + logger.info(" reference n_vars: %i", reference.n_vars) + logger.info(" input n_vars: %i", input_modality.n_vars) + logger.info(" intersect n_vars: %i", len(common_ens_ids)) + assert len(common_ens_ids) >= 100, "The intersection of genes is too small." + + # subset input objects to make sure popv is using the data we expect + input_modality = ad.AnnData( + X=get_X(input_modality, par["input_layer"], common_ens_ids), + obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]), + var=get_var(input_modality, common_ens_ids), + ) + reference = ad.AnnData( + X=get_X(reference, par["reference_layer"], common_ens_ids), + obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]), + var=get_var(reference, common_ens_ids), + ) + + # remove layers that + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir: + logger.info("Run PopV processing") + pq = popv.preprocessing.Process_Query( + # input + query_adata=input_modality, + query_labels_key=par["input_obs_label"], + query_batch_key=par["input_obs_batch"], + query_layers_key=None, # this is taken care of by subset + # reference + ref_adata=reference, + ref_labels_key=par["reference_obs_label"], + ref_batch_key=par["reference_obs_batch"], + # options + unknown_celltype_label=par["unknown_celltype_label"], + n_samples_per_label=n_samples_per_label, + # pretrained model + # Might need to be parameterized at some point + prediction_mode="retrain", + pretrained_scvi_path=None, + # outputs + # Might need to be parameterized at some point + save_path_trained_models=temp_dir, + # hardcoded values + cl_obo_folder=cl_obo_folder, + accelerator="cuda" if use_gpu else "cpu", + ) + method_kwargs = {} + if "scanorama" in par["methods"]: + method_kwargs["scanorama"] = {"approx": False} + logger.info("Annotate data") + popv.annotation.annotate_data( + adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs + ) + + popv_input = pq.adata[input_modality.obs_names] + + # select columns starting with "popv_" + popv_obs_cols = popv_input.obs.columns[ + popv_input.obs.columns.str.startswith("popv_") + ] + + # create new data frame with selected columns + df_popv = popv_input.obs[popv_obs_cols] + + # remove prefix from column names + df_popv.columns = df_popv.columns.str.replace("popv_", "") + + # store output in mudata .obsm + input.mod[par["modality"]].obsm["popv_output"] = df_popv + + # copy important output in mudata .obs + for col in ["popv_prediction"]: + if col in popv_input.obs.columns: + input.mod[par["modality"]].obs[col] = popv_input.obs[col] + + # code to explore how the output differs from the original + # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]: + # old_keys = set(getattr(pq_adata_orig, attr).keys()) + # new_keys = set(getattr(pq.adata, attr).keys()) + # diff_keys = list(new_keys.difference(old_keys)) + # diff_keys.sort() + # print(f"{attr}:", flush=True) + # for key in diff_keys: + # print(f" {key}", flush=True) + + # write output + logger.info("Writing %s", par["output"]) + input.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/popv/setup_logger.py b/target/executable/annotate/popv/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/popv/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/random_forest_annotation/.config.vsh.yaml b/target/executable/annotate/random_forest_annotation/.config.vsh.yaml new file mode 100644 index 00000000..519c47ea --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/.config.vsh.yaml @@ -0,0 +1,457 @@ +name: "random_forest_annotation" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "Key in obs field of reference modality with cell-type information." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "random_forest_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "random_forest_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_estimators" + description: "Number of trees in the random forest." + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_depth" + description: "Maximum depth of the trees in the random forest. \nIf not provided,\ + \ the nodes are expanded until all leaves only contain a single sample.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--criterion" + description: "The function to measure the quality of a split." + info: null + default: + - "gini" + required: false + choices: + - "gini" + - "entropy" + - "log_loss" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--class_weight" + description: "Weights associated with classes.\nThe `balanced` mode uses the values\ + \ of y to automatically adjust weights inversely proportional to class frequencies\ + \ in the input data.\nThe `balanced_subsample` mode is the same as `balanced`\ + \ except that weights are computed based on the bootstrap sample for every tree\ + \ grown.\nThe `uniform` mode gives all classes a weight of one.\n" + info: null + default: + - "balanced_subsample" + required: false + choices: + - "balanced" + - "balanced_subsample" + - "uniform" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--max_features" + description: "The number of features to consider when looking for the best split.\ + \ The value can either be a positive integer or one of `sqrt`, `log2` or `all`.\n\ + If integer: consider max_features features at each split.\nIf `sqrt`: max_features\ + \ is the squareroot of all input features.\nIf `log2`: max_features is the log2\ + \ of all input features.\nIf `all`: max features equals all input features.\n" + info: null + default: + - "200" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of random forest." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scikit-learn==1.4.2" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/random_forest_annotation/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/random_forest_annotation" + executable: "target/executable/annotate/random_forest_annotation/random_forest_annotation" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/random_forest_annotation/cross_check_genes.py b/target/executable/annotate/random_forest_annotation/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/executable/annotate/random_forest_annotation/nextflow_labels.config b/target/executable/annotate/random_forest_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/random_forest_annotation/random_forest_annotation b/target/executable/annotate/random_forest_annotation/random_forest_annotation new file mode 100755 index 00000000..580c0633 --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/random_forest_annotation @@ -0,0 +1,1694 @@ +#!/usr/bin/env bash + +# random_forest_annotation main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="random_forest_annotation" +VIASH_META_FUNCTIONALITY_NAME="random_forest_annotation" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scikit-learn==1.4.2" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component annotate random_forest_annotation" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "random_forest_annotation main" + echo "" + echo "Automated cell type annotation tool for scRNA-seq datasets on the basis of" + echo "random forest." + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input data to be used for cell type annotation if .X is" + echo " not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the input data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data to be used for cell type annotation if" + echo " .X is not to be used. Data are expected to be processed in the same way" + echo " as the --input query dataset." + echo "" + echo " --reference_obs_target" + echo " type: string, required parameter" + echo " Key in obs field of reference modality with cell-type information." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the reference data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obs_predictions" + echo " type: string" + echo " default: random_forest_pred" + echo " In which \`.obs\` slots to store the predicted information." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: random_forest_probability" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Model arguments:" + echo " Model arguments." + echo "" + echo " --model" + echo " type: file, file must exist" + echo " example: pretrained_model.pkl" + echo " Pretrained model in pkl format. If not provided, the model will be" + echo " trained on the reference data and --reference should be provided." + echo "" + echo " --n_estimators" + echo " type: integer" + echo " default: 100" + echo " Number of trees in the random forest." + echo "" + echo " --max_depth" + echo " type: integer" + echo " Maximum depth of the trees in the random forest." + echo " If not provided, the nodes are expanded until all leaves only contain a" + echo " single sample." + echo "" + echo " --criterion" + echo " type: string" + echo " default: gini" + echo " choices: [ gini, entropy, log_loss ]" + echo " The function to measure the quality of a split." + echo "" + echo " --class_weight" + echo " type: string" + echo " default: balanced_subsample" + echo " choices: [ balanced, balanced_subsample, uniform ]" + echo " Weights associated with classes." + echo " The \`balanced\` mode uses the values of y to automatically adjust weights" + echo " inversely proportional to class frequencies in the input data." + echo " The \`balanced_subsample\` mode is the same as \`balanced\` except that" + echo " weights are computed based on the bootstrap sample for every tree grown." + echo " The \`uniform\` mode gives all classes a weight of one." + echo "" + echo " --max_features" + echo " type: string" + echo " default: 200" + echo " The number of features to consider when looking for the best split. The" + echo " value can either be a positive integer or one of \`sqrt\`, \`log2\` or" + echo " \`all\`." + echo " If integer: consider max_features features at each split." + echo " If \`sqrt\`: max_features is the squareroot of all input features." + echo " If \`log2\`: max_features is the log2 of all input features." + echo " If \`all\`: max features equals all input features." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "random_forest_annotation main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_target) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_target. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_target=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target=*\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_input) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_input=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input=*\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_estimators) + [ -n "$VIASH_PAR_N_ESTIMATORS" ] && ViashError Bad arguments for option \'--n_estimators\': \'$VIASH_PAR_N_ESTIMATORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_ESTIMATORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_estimators. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_estimators=*) + [ -n "$VIASH_PAR_N_ESTIMATORS" ] && ViashError Bad arguments for option \'--n_estimators=*\': \'$VIASH_PAR_N_ESTIMATORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_ESTIMATORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_depth) + [ -n "$VIASH_PAR_MAX_DEPTH" ] && ViashError Bad arguments for option \'--max_depth\': \'$VIASH_PAR_MAX_DEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DEPTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_depth. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_depth=*) + [ -n "$VIASH_PAR_MAX_DEPTH" ] && ViashError Bad arguments for option \'--max_depth=*\': \'$VIASH_PAR_MAX_DEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DEPTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --criterion) + [ -n "$VIASH_PAR_CRITERION" ] && ViashError Bad arguments for option \'--criterion\': \'$VIASH_PAR_CRITERION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CRITERION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --criterion. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --criterion=*) + [ -n "$VIASH_PAR_CRITERION" ] && ViashError Bad arguments for option \'--criterion=*\': \'$VIASH_PAR_CRITERION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CRITERION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --class_weight) + [ -n "$VIASH_PAR_CLASS_WEIGHT" ] && ViashError Bad arguments for option \'--class_weight\': \'$VIASH_PAR_CLASS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLASS_WEIGHT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --class_weight. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --class_weight=*) + [ -n "$VIASH_PAR_CLASS_WEIGHT" ] && ViashError Bad arguments for option \'--class_weight=*\': \'$VIASH_PAR_CLASS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLASS_WEIGHT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_features) + [ -n "$VIASH_PAR_MAX_FEATURES" ] && ViashError Bad arguments for option \'--max_features\': \'$VIASH_PAR_MAX_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_features=*) + [ -n "$VIASH_PAR_MAX_FEATURES" ] && ViashError Bad arguments for option \'--max_features=*\': \'$VIASH_PAR_MAX_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/random_forest_annotation:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then + ViashError '--reference_obs_target' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="random_forest_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="random_forest_probability" +fi +if [ -z ${VIASH_PAR_N_ESTIMATORS+x} ]; then + VIASH_PAR_N_ESTIMATORS="100" +fi +if [ -z ${VIASH_PAR_CRITERION+x} ]; then + VIASH_PAR_CRITERION="gini" +fi +if [ -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then + VIASH_PAR_CLASS_WEIGHT="balanced_subsample" +fi +if [ -z ${VIASH_PAR_MAX_FEATURES+x} ]; then + VIASH_PAR_MAX_FEATURES="200" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_ESTIMATORS" ]]; then + if ! [[ "$VIASH_PAR_N_ESTIMATORS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_estimators' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_DEPTH" ]]; then + if ! [[ "$VIASH_PAR_MAX_DEPTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_depth' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CRITERION" ]; then + VIASH_PAR_CRITERION_CHOICES=("gini;entropy;log_loss") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CRITERION_CHOICES[*]};" =~ ";$VIASH_PAR_CRITERION;" ]]; then + ViashError '--criterion' specified value of \'$VIASH_PAR_CRITERION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CLASS_WEIGHT" ]; then + VIASH_PAR_CLASS_WEIGHT_CHOICES=("balanced;balanced_subsample;uniform") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CLASS_WEIGHT_CHOICES[*]};" =~ ";$VIASH_PAR_CLASS_WEIGHT;" ]]; then + ViashError '--class_weight' specified value of \'$VIASH_PAR_CLASS_WEIGHT\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-random_forest_annotation-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import numpy as np +from sklearn.ensemble import RandomForestClassifier +import pickle + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_estimators': $( if [ ! -z ${VIASH_PAR_N_ESTIMATORS+x} ]; then echo "int(r'${VIASH_PAR_N_ESTIMATORS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_depth': $( if [ ! -z ${VIASH_PAR_MAX_DEPTH+x} ]; then echo "int(r'${VIASH_PAR_MAX_DEPTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'criterion': $( if [ ! -z ${VIASH_PAR_CRITERION+x} ]; then echo "r'${VIASH_PAR_CRITERION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'class_weight': $( if [ ! -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then echo "r'${VIASH_PAR_CLASS_WEIGHT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'max_features': $( if [ ! -z ${VIASH_PAR_MAX_FEATURES+x} ]; then echo "r'${VIASH_PAR_MAX_FEATURES//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + # Handle max_features parameter + max_features_conversion = { + "all": None, + "sqrt": "sqrt", + "log2": "log2", + } + try: + max_features = max_features_conversion.get( + par["max_features"], int(par["max_features"]) + ) + except ValueError: + raise ValueError( + f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'" + ) + + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = RandomForestClassifier( + n_estimators=par["n_estimators"], + criterion=par["criterion"], + max_depth=par["max_depth"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + max_features=max_features, + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + + logger.info("Writing output data") + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/random_forest_annotation/set_var_index.py b/target/executable/annotate/random_forest_annotation/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/annotate/random_forest_annotation/setup_logger.py b/target/executable/annotate/random_forest_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/random_forest_annotation/subset_vars.py b/target/executable/annotate/random_forest_annotation/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/annotate/random_forest_annotation/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/annotate/scanvi/.config.vsh.yaml b/target/executable/annotate/scanvi/.config.vsh.yaml new file mode 100644 index 00000000..250d5aa8 --- /dev/null +++ b/target/executable/annotate/scanvi/.config.vsh.yaml @@ -0,0 +1,480 @@ +name: "scanvi" +namespace: "annotate" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file. Note that this needs to be the exact same dataset\ + \ as the --scvi_model was trained on." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes that were used to train\ + \ the scVi model. By default, do not subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_labels" + description: ".obs field containing the labels" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unlabeled_category" + description: "Value in the --obs_labels field that indicates unlabeled observations\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scVI Model" + arguments: + - type: "file" + name: "--scvi_model" + description: "Pretrained SCVI reference model to initialize the SCANVI model with." + info: null + example: + - "scvi_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Folder where the state of the trained model will be saved to." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scanvi_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_predictions" + description: "In which .obs slot to store the predicted labels." + info: null + default: + - "scanvi_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_probabilities" + description: "In which. obs slot to store the probabilities of the predicted labels." + info: null + default: + - "scanvi_proba" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scANVI training arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "scANVI () is a semi-supervised model for single-cell transcriptomics\ + \ data. scANVI is an scVI extension that can leverage the cell type knowledge for\ + \ a subset of the cells present in the data sets to infer the states of the rest\ + \ of the cells.\nThis component will instantiate a scANVI model from a pre-trained\ + \ scVI model, integrate the data and perform label prediction.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scvi_model" +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "jax[cuda]" + - "scvi-tools~=1.3.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/scanvi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/scanvi" + executable: "target/executable/annotate/scanvi/scanvi" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/scanvi/compress_h5mu.py b/target/executable/annotate/scanvi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/annotate/scanvi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/annotate/scanvi/nextflow_labels.config b/target/executable/annotate/scanvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/scanvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/scanvi/scanvi b/target/executable/annotate/scanvi/scanvi new file mode 100755 index 00000000..1b61ab6e --- /dev/null +++ b/target/executable/annotate/scanvi/scanvi @@ -0,0 +1,1748 @@ +#!/usr/bin/env bash + +# scanvi main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer) +# * Jakub Majercik (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scanvi" +VIASH_META_FUNCTIONALITY_NAME="scanvi" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:25.05-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "jax[cuda]" "scvi-tools~=1.3.1" + +LABEL org.opencontainers.image.authors="Dorien Roosen, Jakub Majercik, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component annotate scanvi" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scanvi main" + echo "" + echo "scANVI () is a semi-supervised model for single-cell transcriptomics data." + echo "scANVI is an scVI extension that can leverage the cell type knowledge for a" + echo "subset of the cells present in the data sets to infer the states of the rest of" + echo "the cells." + echo "This component will instantiate a scANVI model from a pre-trained scVI model," + echo "integrate the data and perform label prediction." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file. Note that this needs to be the exact same dataset as" + echo " the --scvi_model was trained on." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. If None, X is used" + echo "" + echo " --var_input" + echo " type: string" + echo " .var column containing highly variable genes that were used to train the" + echo " scVi model. By default, do not subset genes." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " .var column containing gene names. By default, use the index." + echo "" + echo " --obs_labels" + echo " type: string, required parameter" + echo " .obs field containing the labels" + echo "" + echo " --unlabeled_category" + echo " type: string" + echo " default: Unknown" + echo " Value in the --obs_labels field that indicates unlabeled observations" + echo "" + echo "scVI Model:" + echo " --scvi_model" + echo " type: file, required parameter, file must exist" + echo " example: scvi_model.pt" + echo " Pretrained SCVI reference model to initialize the SCANVI model with." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --output_model" + echo " type: file, output, file must exist" + echo " Folder where the state of the trained model will be saved to." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_scanvi_integrated" + echo " In which .obsm slot to store the resulting integrated embedding." + echo "" + echo " --obs_output_predictions" + echo " type: string" + echo " default: scanvi_pred" + echo " In which .obs slot to store the predicted labels." + echo "" + echo " --obs_output_probabilities" + echo " type: string" + echo " default: scanvi_proba" + echo " In which. obs slot to store the probabilities of the predicted labels." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "scANVI training arguments:" + echo " --early_stopping" + echo " type: boolean" + echo " Whether to perform early stopping with respect to the validation set." + echo "" + echo " --early_stopping_monitor" + echo " type: string" + echo " default: elbo_validation" + echo " choices: [ elbo_validation, reconstruction_loss_validation," + echo "kl_local_validation ]" + echo " Metric logged during validation set epoch." + echo "" + echo " --early_stopping_patience" + echo " type: integer" + echo " default: 45" + echo " min: 1" + echo " Number of validation epochs with no improvement after which training" + echo " will be stopped." + echo "" + echo " --early_stopping_min_delta" + echo " type: double" + echo " default: 0.0" + echo " min: 0.0" + echo " Minimum change in the monitored quantity to qualify as an improvement," + echo " i.e. an absolute change of less than min_delta, will count as no" + echo " improvement." + echo "" + echo " --max_epochs" + echo " type: integer" + echo " Number of passes through the dataset, defaults to (20000 / number of" + echo " cells) * 400 or 400; whichever is smallest." + echo "" + echo " --reduce_lr_on_plateau" + echo " type: boolean" + echo " default: true" + echo " Whether to monitor validation loss and reduce learning rate when" + echo " validation set \`lr_scheduler_metric\` plateaus." + echo "" + echo " --lr_factor" + echo " type: double" + echo " default: 0.6" + echo " min: 0.0" + echo " Factor to reduce learning rate." + echo "" + echo " --lr_patience" + echo " type: double" + echo " default: 30.0" + echo " min: 0.0" + echo " Number of epochs with no improvement after which learning rate will be" + echo " reduced." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scanvi main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_labels) + [ -n "$VIASH_PAR_OBS_LABELS" ] && ViashError Bad arguments for option \'--obs_labels\': \'$VIASH_PAR_OBS_LABELS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABELS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_labels. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_labels=*) + [ -n "$VIASH_PAR_OBS_LABELS" ] && ViashError Bad arguments for option \'--obs_labels=*\': \'$VIASH_PAR_OBS_LABELS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABELS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --unlabeled_category) + [ -n "$VIASH_PAR_UNLABELED_CATEGORY" ] && ViashError Bad arguments for option \'--unlabeled_category\': \'$VIASH_PAR_UNLABELED_CATEGORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNLABELED_CATEGORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --unlabeled_category. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --unlabeled_category=*) + [ -n "$VIASH_PAR_UNLABELED_CATEGORY" ] && ViashError Bad arguments for option \'--unlabeled_category=*\': \'$VIASH_PAR_UNLABELED_CATEGORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNLABELED_CATEGORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scvi_model) + [ -n "$VIASH_PAR_SCVI_MODEL" ] && ViashError Bad arguments for option \'--scvi_model\': \'$VIASH_PAR_SCVI_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCVI_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scvi_model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scvi_model=*) + [ -n "$VIASH_PAR_SCVI_MODEL" ] && ViashError Bad arguments for option \'--scvi_model=*\': \'$VIASH_PAR_SCVI_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCVI_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_model) + [ -n "$VIASH_PAR_OUTPUT_MODEL" ] && ViashError Bad arguments for option \'--output_model\': \'$VIASH_PAR_OUTPUT_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_model=*) + [ -n "$VIASH_PAR_OUTPUT_MODEL" ] && ViashError Bad arguments for option \'--output_model=*\': \'$VIASH_PAR_OUTPUT_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_output_predictions) + [ -n "$VIASH_PAR_OBS_OUTPUT_PREDICTIONS" ] && ViashError Bad arguments for option \'--obs_output_predictions\': \'$VIASH_PAR_OBS_OUTPUT_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_output_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_output_predictions=*) + [ -n "$VIASH_PAR_OBS_OUTPUT_PREDICTIONS" ] && ViashError Bad arguments for option \'--obs_output_predictions=*\': \'$VIASH_PAR_OBS_OUTPUT_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_output_probabilities) + [ -n "$VIASH_PAR_OBS_OUTPUT_PROBABILITIES" ] && ViashError Bad arguments for option \'--obs_output_probabilities\': \'$VIASH_PAR_OBS_OUTPUT_PROBABILITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PROBABILITIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_output_probabilities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_output_probabilities=*) + [ -n "$VIASH_PAR_OBS_OUTPUT_PROBABILITIES" ] && ViashError Bad arguments for option \'--obs_output_probabilities=*\': \'$VIASH_PAR_OBS_OUTPUT_PROBABILITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PROBABILITIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping=*) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping=*\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_monitor) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_monitor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_monitor=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor=*\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_patience) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_patience=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience=*\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_min_delta) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_min_delta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_min_delta=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta=*\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_epochs) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_epochs=*) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs=*\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reduce_lr_on_plateau) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reduce_lr_on_plateau. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reduce_lr_on_plateau=*) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau=*\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_factor) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_factor=*) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor=*\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_patience) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_patience=*) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience=*\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/scanvi:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_LABELS+x} ]; then + ViashError '--obs_labels' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_SCVI_MODEL+x} ]; then + ViashError '--scvi_model' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_UNLABELED_CATEGORY+x} ]; then + VIASH_PAR_UNLABELED_CATEGORY="Unknown" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_scanvi_integrated" +fi +if [ -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then + VIASH_PAR_OBS_OUTPUT_PREDICTIONS="scanvi_pred" +fi +if [ -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then + VIASH_PAR_OBS_OUTPUT_PROBABILITIES="scanvi_proba" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR="elbo_validation" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then + VIASH_PAR_EARLY_STOPPING_PATIENCE="45" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="0.0" +fi +if [ -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then + VIASH_PAR_REDUCE_LR_ON_PLATEAU="true" +fi +if [ -z ${VIASH_PAR_LR_FACTOR+x} ]; then + VIASH_PAR_LR_FACTOR="0.6" +fi +if [ -z ${VIASH_PAR_LR_PATIENCE+x} ]; then + VIASH_PAR_LR_PATIENCE="30.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SCVI_MODEL" ] && [ ! -e "$VIASH_PAR_SCVI_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_SCVI_MODEL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EARLY_STOPPING" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--early_stopping' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_PATIENCE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--early_stopping_patience' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EARLY_STOPPING_PATIENCE -lt 1 ]]; then + ViashError '--early_stopping_patience' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--early_stopping_min_delta' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_EARLY_STOPPING_MIN_DELTA '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_EARLY_STOPPING_MIN_DELTA -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--early_stopping_min_delta' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_MAX_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_MAX_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ]]; then + if ! [[ "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--reduce_lr_on_plateau' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LR_FACTOR" ]]; then + if ! [[ "$VIASH_PAR_LR_FACTOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_factor' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_FACTOR '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_FACTOR -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_factor' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_LR_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_LR_PATIENCE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_patience' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_PATIENCE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_PATIENCE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_patience' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_EARLY_STOPPING_MONITOR" ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES=("elbo_validation;reconstruction_loss_validation;kl_local_validation") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES[*]};" =~ ";$VIASH_PAR_EARLY_STOPPING_MONITOR;" ]]; then + ViashError '--early_stopping_monitor' specified value of \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_MODEL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_MODEL")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_SCVI_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SCVI_MODEL")" ) + VIASH_PAR_SCVI_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_SCVI_MODEL") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_MODEL")" ) + VIASH_PAR_OUTPUT_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_MODEL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_MODEL" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scanvi-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata +import scvi +import numpy as np + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_labels': $( if [ ! -z ${VIASH_PAR_OBS_LABELS+x} ]; then echo "r'${VIASH_PAR_OBS_LABELS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'unlabeled_category': $( if [ ! -z ${VIASH_PAR_UNLABELED_CATEGORY+x} ]; then echo "r'${VIASH_PAR_UNLABELED_CATEGORY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'scvi_model': $( if [ ! -z ${VIASH_PAR_SCVI_MODEL+x} ]; then echo "r'${VIASH_PAR_SCVI_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_model': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_output_predictions': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_output_probabilities': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PROBABILITIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input data...") + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}") + scvi_model = scvi.model.SCVI.load( + par["scvi_model"], + adata_subset, + accelerator="auto", + device="auto", + ) + + logger.info("Instantiating scANVI model from scVI model...") + vae_uns = scvi.model.SCANVI.from_scvi_model( + scvi_model, + unlabeled_category=par["unlabeled_category"], + labels_key=par["obs_labels"], + adata=adata_subset, + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + logger.info("Training scANVI model...") + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + logger.info("Performing scANVI integration...") + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + logger.info("Performing scANVI prediction...") + adata.obs[par["obs_output_predictions"]] = vae_uns.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_uns.predict(soft=True), axis=1 + ) + + logger.info("Writing output data...") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_SCVI_MODEL" ]; then + VIASH_PAR_SCVI_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_SCVI_MODEL") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ]; then + VIASH_PAR_OUTPUT_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_MODEL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ] && [ ! -e "$VIASH_PAR_OUTPUT_MODEL" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_MODEL' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/scanvi/set_var_index.py b/target/executable/annotate/scanvi/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/annotate/scanvi/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/annotate/scanvi/setup_logger.py b/target/executable/annotate/scanvi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/scanvi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/scanvi/subset_vars.py b/target/executable/annotate/scanvi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/annotate/scanvi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/annotate/singler/.config.vsh.yaml b/target/executable/annotate/singler/.config.vsh.yaml new file mode 100644 index 00000000..13e5beb0 --- /dev/null +++ b/target/executable/annotate/singler/.config.vsh.yaml @@ -0,0 +1,519 @@ +name: "singler" +namespace: "annotate" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data containing log normalized counts to\ + \ be used for cell type annotation if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata .var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_clusters" + description: "The name of the adata .obs column containing cluster identities\ + \ of the observations. \nIf provided, annoation is performed on the aggregated\ + \ cluster profiles, \notherwise it defaults to annotation per observation.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data containing lognormalized couns to\ + \ be used for cell type annotation if .X is not to be used. Data are expected\ + \ to be processed in the same way as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing a boolean mask corresponding to genes to\ + \ be used for marker selection. By default, do not subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Arguments related to the training of and classification with the SingleR\ + \ model" + arguments: + - type: "integer" + name: "--de_n_genes" + description: "The number of differentially expressed genes across labels to be\ + \ calculated from the reference.\nDefaults to 500 * (2/3) ^ log2(N) where N\ + \ is the number of unique labels when if `--de_method` is set to `classic`,\n\ + otherwise, defaults to 10.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--de_method" + description: "Method to detect differentially expressed genes between pairs of\ + \ labels." + info: null + default: + - "classic" + required: false + choices: + - "classic" + - "t" + - "wilcox" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--quantile" + description: "The quantile of the correlation distribution to use to compute the\ + \ score per label." + info: null + default: + - 0.8 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--fine_tune" + description: "Whether finetuning should be performed to improve the resolution.\ + \ \nIf set to True, an additional finetuning step is performed after initial\ + \ classification, \nnew marker genes are calculated based on all cells with\ + \ a score higher then the maximum score minus `--fine_tuning_thershold`,\nand\ + \ the calculation of the scores is repeated.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fine_tuning_threshold" + description: "The maximum difference from the maximum correlation to use in fine-tuning\n" + info: null + default: + - 0.05 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--prune" + description: "Whether label pruning should be performed. If set to True, an additional\ + \ output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,\ + \ containing labels where 'low-quality' labels are replaced with NA's. Labels\ + \ are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`)\ + \ fall more than 3 median absolute deviations below the median for that label\ + \ type." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slot to store the predicted labels. If `--fine_tune\ + \ False`, this is based only on the maximum entry in `--output_obsm_scores`.\n" + info: null + default: + - "singler_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predicted\ + \ labels.\n" + info: null + default: + - "singler_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_delta_next" + description: "In which `.obs` slot to store the delta between the best and next-best\ + \ score. If `--fine_tune True`, this is reported for scores after fine-tuning.\n" + info: null + default: + - "singler_delta_next" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_pruned_predictions" + description: "In which `.obs` slot to store the pruned labels, where low-quality\ + \ labels are replaced with NA's. Only added if `--prune True`.\n" + info: null + default: + - "singler_pruned_labels" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_scores" + description: "In which `.obsm` slot to store the matrix of prediction correlations\ + \ at the specified quantile for each label (column) in each cell (row).\n" + info: null + default: + - "singler_scores" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "SingleR performs reference-based cell type annotation for single-cell\ + \ RNA-seq data \nby computing Spearman correlations between test cells and reference\ + \ samples with known labels, \nusing marker genes to assign the most similar cell\ + \ type label to each new cell.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "r" + cran: + - "anndata" + - "reticulate" + - "SingleR" + bioc: + - "scrapper" + - "bit64" + bioc_force_install: false + warnings_as_errors: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/singler/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/singler" + executable: "target/executable/annotate/singler/singler" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/singler/nextflow_labels.config b/target/executable/annotate/singler/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/singler/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/singler/singler b/target/executable/annotate/singler/singler new file mode 100755 index 00000000..c5f83de2 --- /dev/null +++ b/target/executable/annotate/singler/singler @@ -0,0 +1,1868 @@ +#!/usr/bin/env bash + +# singler main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="singler" +VIASH_META_FUNCTIONALITY_NAME="singler" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:22.04 +ENTRYPOINT [] +ENV RETICULATE_PYTHON=/usr/bin/python +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev python3 python3-pip python3-dev python-is-python3 && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("scrapper", quietly = TRUE)) BiocManager::install("scrapper")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("bit64", quietly = TRUE)) BiocManager::install("bit64")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("anndata", "reticulate", "SingleR"), repos = "https://cran.rstudio.com")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component annotate singler" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "singler main" + echo "" + echo "SingleR performs reference-based cell type annotation for single-cell RNA-seq" + echo "data" + echo "by computing Spearman correlations between test cells and reference samples with" + echo "known labels," + echo "using marker genes to assign the most similar cell type label to each new cell." + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input data containing log normalized counts to be used" + echo " for cell type annotation if .X is not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the adata .var column in the input data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_obs_clusters" + echo " type: string" + echo " The name of the adata .obs column containing cluster identities of the" + echo " observations." + echo " If provided, annoation is performed on the aggregated cluster profiles," + echo " otherwise it defaults to annotation per observation." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data containing lognormalized couns to be" + echo " used for cell type annotation if .X is not to be used. Data are expected" + echo " to be processed in the same way as the --input query dataset." + echo "" + echo " --reference_obs_target" + echo " type: string, required parameter" + echo " The name of the adata obs column in the reference data containing cell" + echo " type annotations." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the reference data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_var_input" + echo " type: string" + echo " .var column containing a boolean mask corresponding to genes to be used" + echo " for marker selection. By default, do not subset genes." + echo "" + echo "Arguments:" + echo " Arguments related to the training of and classification with the SingleR" + echo " model" + echo "" + echo " --de_n_genes" + echo " type: integer" + echo " min: 1" + echo " The number of differentially expressed genes across labels to be" + echo " calculated from the reference." + echo " Defaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels" + echo " when if \`--de_method\` is set to \`classic\`," + echo " otherwise, defaults to 10." + echo "" + echo " --de_method" + echo " type: string" + echo " default: classic" + echo " choices: [ classic, t, wilcox ]" + echo " Method to detect differentially expressed genes between pairs of labels." + echo "" + echo " --quantile" + echo " type: double" + echo " default: 0.8" + echo " min: 0.0" + echo " max: 1.0" + echo " The quantile of the correlation distribution to use to compute the score" + echo " per label." + echo "" + echo " --fine_tune" + echo " type: boolean" + echo " default: true" + echo " Whether finetuning should be performed to improve the resolution." + echo " If set to True, an additional finetuning step is performed after initial" + echo " classification," + echo " new marker genes are calculated based on all cells with a score higher" + echo " then the maximum score minus \`--fine_tuning_thershold\`," + echo " and the calculation of the scores is repeated." + echo "" + echo " --fine_tuning_threshold" + echo " type: double" + echo " default: 0.05" + echo " The maximum difference from the maximum correlation to use in" + echo " fine-tuning" + echo "" + echo " --prune" + echo " type: boolean" + echo " default: true" + echo " Whether label pruning should be performed. If set to True, an additional" + echo " output .obs field \`--output_obs_pruned_predictions\` will be added to the" + echo " \`--output\`, containing labels where 'low-quality' labels are replaced" + echo " with NA's. Labels are considered 'low-quality' when their delta score" + echo " (stored in \`--output_obs_delta_next\`) fall more than 3 median absolute" + echo " deviations below the median for that label type." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obs_predictions" + echo " type: string" + echo " default: singler_pred" + echo " In which \`.obs\` slot to store the predicted labels. If \`--fine_tune" + echo " False\`, this is based only on the maximum entry in" + echo " \`--output_obsm_scores\`." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: singler_probability" + echo " In which \`.obs\` slots to store the probability of the predicted labels." + echo "" + echo " --output_obs_delta_next" + echo " type: string" + echo " default: singler_delta_next" + echo " In which \`.obs\` slot to store the delta between the best and next-best" + echo " score. If \`--fine_tune True\`, this is reported for scores after" + echo " fine-tuning." + echo "" + echo " --output_obs_pruned_predictions" + echo " type: string" + echo " default: singler_pruned_labels" + echo " In which \`.obs\` slot to store the pruned labels, where low-quality" + echo " labels are replaced with NA's. Only added if \`--prune True\`." + echo "" + echo " --output_obsm_scores" + echo " type: string" + echo " default: singler_scores" + echo " In which \`.obsm\` slot to store the matrix of prediction correlations at" + echo " the specified quantile for each label (column) in each cell (row)." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "singler main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_clusters) + [ -n "$VIASH_PAR_INPUT_OBS_CLUSTERS" ] && ViashError Bad arguments for option \'--input_obs_clusters\': \'$VIASH_PAR_INPUT_OBS_CLUSTERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_CLUSTERS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_clusters. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_clusters=*) + [ -n "$VIASH_PAR_INPUT_OBS_CLUSTERS" ] && ViashError Bad arguments for option \'--input_obs_clusters=*\': \'$VIASH_PAR_INPUT_OBS_CLUSTERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_CLUSTERS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_target) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_target. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_target=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target=*\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_input) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_input=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input=*\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --de_n_genes) + [ -n "$VIASH_PAR_DE_N_GENES" ] && ViashError Bad arguments for option \'--de_n_genes\': \'$VIASH_PAR_DE_N_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_N_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --de_n_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --de_n_genes=*) + [ -n "$VIASH_PAR_DE_N_GENES" ] && ViashError Bad arguments for option \'--de_n_genes=*\': \'$VIASH_PAR_DE_N_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_N_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --de_method) + [ -n "$VIASH_PAR_DE_METHOD" ] && ViashError Bad arguments for option \'--de_method\': \'$VIASH_PAR_DE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --de_method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --de_method=*) + [ -n "$VIASH_PAR_DE_METHOD" ] && ViashError Bad arguments for option \'--de_method=*\': \'$VIASH_PAR_DE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantile) + [ -n "$VIASH_PAR_QUANTILE" ] && ViashError Bad arguments for option \'--quantile\': \'$VIASH_PAR_QUANTILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantile=*) + [ -n "$VIASH_PAR_QUANTILE" ] && ViashError Bad arguments for option \'--quantile=*\': \'$VIASH_PAR_QUANTILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fine_tune) + [ -n "$VIASH_PAR_FINE_TUNE" ] && ViashError Bad arguments for option \'--fine_tune\': \'$VIASH_PAR_FINE_TUNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINE_TUNE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fine_tune. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fine_tune=*) + [ -n "$VIASH_PAR_FINE_TUNE" ] && ViashError Bad arguments for option \'--fine_tune=*\': \'$VIASH_PAR_FINE_TUNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINE_TUNE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fine_tuning_threshold) + [ -n "$VIASH_PAR_FINE_TUNING_THRESHOLD" ] && ViashError Bad arguments for option \'--fine_tuning_threshold\': \'$VIASH_PAR_FINE_TUNING_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINE_TUNING_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fine_tuning_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fine_tuning_threshold=*) + [ -n "$VIASH_PAR_FINE_TUNING_THRESHOLD" ] && ViashError Bad arguments for option \'--fine_tuning_threshold=*\': \'$VIASH_PAR_FINE_TUNING_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINE_TUNING_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prune) + [ -n "$VIASH_PAR_PRUNE" ] && ViashError Bad arguments for option \'--prune\': \'$VIASH_PAR_PRUNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PRUNE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prune. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prune=*) + [ -n "$VIASH_PAR_PRUNE" ] && ViashError Bad arguments for option \'--prune=*\': \'$VIASH_PAR_PRUNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PRUNE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_delta_next) + [ -n "$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT" ] && ViashError Bad arguments for option \'--output_obs_delta_next\': \'$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_DELTA_NEXT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_delta_next. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_delta_next=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT" ] && ViashError Bad arguments for option \'--output_obs_delta_next=*\': \'$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_DELTA_NEXT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_pruned_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_pruned_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_pruned_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_pruned_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_pruned_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obsm_scores) + [ -n "$VIASH_PAR_OUTPUT_OBSM_SCORES" ] && ViashError Bad arguments for option \'--output_obsm_scores\': \'$VIASH_PAR_OUTPUT_OBSM_SCORES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_SCORES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obsm_scores. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obsm_scores=*) + [ -n "$VIASH_PAR_OUTPUT_OBSM_SCORES" ] && ViashError Bad arguments for option \'--output_obsm_scores=*\': \'$VIASH_PAR_OUTPUT_OBSM_SCORES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_SCORES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/singler:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then + ViashError '--reference_obs_target' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_DE_METHOD+x} ]; then + VIASH_PAR_DE_METHOD="classic" +fi +if [ -z ${VIASH_PAR_QUANTILE+x} ]; then + VIASH_PAR_QUANTILE="0.8" +fi +if [ -z ${VIASH_PAR_FINE_TUNE+x} ]; then + VIASH_PAR_FINE_TUNE="true" +fi +if [ -z ${VIASH_PAR_FINE_TUNING_THRESHOLD+x} ]; then + VIASH_PAR_FINE_TUNING_THRESHOLD="0.05" +fi +if [ -z ${VIASH_PAR_PRUNE+x} ]; then + VIASH_PAR_PRUNE="true" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="singler_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="singler_probability" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_DELTA_NEXT+x} ]; then + VIASH_PAR_OUTPUT_OBS_DELTA_NEXT="singler_delta_next" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS="singler_pruned_labels" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBSM_SCORES+x} ]; then + VIASH_PAR_OUTPUT_OBSM_SCORES="singler_scores" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DE_N_GENES" ]]; then + if ! [[ "$VIASH_PAR_DE_N_GENES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--de_n_genes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_DE_N_GENES -lt 1 ]]; then + ViashError '--de_n_genes' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_QUANTILE" ]]; then + if ! [[ "$VIASH_PAR_QUANTILE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--quantile' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_QUANTILE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--quantile' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_QUANTILE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--quantile' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--quantile' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_QUANTILE '<=' 1.0 | bc` -eq 1 ]]; then + ViashError '--quantile' has to be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_QUANTILE -v n2=1.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--quantile' has be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--quantile' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_FINE_TUNE" ]]; then + if ! [[ "$VIASH_PAR_FINE_TUNE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--fine_tune' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FINE_TUNING_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_FINE_TUNING_THRESHOLD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--fine_tuning_threshold' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PRUNE" ]]; then + if ! [[ "$VIASH_PAR_PRUNE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--prune' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_DE_METHOD" ]; then + VIASH_PAR_DE_METHOD_CHOICES=("classic;t;wilcox") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_DE_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_DE_METHOD;" ]]; then + ViashError '--de_method' specified value of \'$VIASH_PAR_DE_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-singler-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(SingleR) +library(Matrix) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_layer" = $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_LAYER" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_var_gene_names" = $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_obs_clusters" = $( if [ ! -z ${VIASH_PAR_INPUT_OBS_CLUSTERS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_OBS_CLUSTERS" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_reference_gene_overlap" = $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "reference" = $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_layer" = $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_LAYER" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_obs_target" = $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_OBS_TARGET" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_var_gene_names" = $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_var_input" = $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_VAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "de_n_genes" = $( if [ ! -z ${VIASH_PAR_DE_N_GENES+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_DE_N_GENES" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "de_method" = $( if [ ! -z ${VIASH_PAR_DE_METHOD+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_DE_METHOD" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "quantile" = $( if [ ! -z ${VIASH_PAR_QUANTILE+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_QUANTILE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "fine_tune" = $( if [ ! -z ${VIASH_PAR_FINE_TUNE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_FINE_TUNE" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ), + "fine_tuning_threshold" = $( if [ ! -z ${VIASH_PAR_FINE_TUNING_THRESHOLD+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_FINE_TUNING_THRESHOLD" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "prune" = $( if [ ! -z ${VIASH_PAR_PRUNE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_PRUNE" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_predictions" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_probability" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_delta_next" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_DELTA_NEXT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_pruned_predictions" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obsm_scores" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_SCORES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBSM_SCORES" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_compression" = $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_COMPRESSION" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + +get_layer <- function(adata, layer, var_gene_names) { + # find data + data <- + if (is.null(layer)) { + adata\$X + } else { + adata\$layers[[layer]] + } + + # check if data is available + if (is.null(data)) { + if (is.null(layer)) { + stop("No layer specified and no .X slot available in the AnnData object.") + } else { + stop( + "Requested layer '", + layer, + "' is not available in the AnnData object. Available layers: ", + paste(names(adata\$layers), collapse = ", ") + ) + } + } + + # Set matrix dimnames + input_gene_names <- sanitize_gene_names(adata, var_gene_names) + dimnames(data) <- list(adata\$obs_names, input_gene_names) + + # return output + data +} + +sanitize_gene_names <- function(adata, gene_symbol = NULL) { + if (is.null(gene_symbol)) { + gene_names <- adata\$var_names + } else { + gene_names <- adata\$var[[gene_symbol]] + } + # Remove version numbers (dot followed by digits at end of string) + sanitized <- gsub("\\\\.[0-9]+\$", "", gene_names) + sanitized +} + +subset_vars <- function(adata, subset_col) { + # Check if column exists + if (!subset_col %in% colnames(adata\$var)) { + stop( + "Requested to use .var column '", + subset_col, + "' as a selection of genes, but the column is not available." + ) + } + + # Get the column + subset_values <- adata\$var[[subset_col]] + + # Check for NA values + if (any(is.na(subset_values))) { + stop( + "The .var column \`", + subset_col, + "\` contains NA values. Can not subset data." + ) + } + + # Check if it's logical/boolean + if (!is.logical(subset_values)) { + stop( + "Expected dtype of .var column '", + subset_col, + "' to be \`logical\`, but found ", + class(subset_values)[1], + ". Can not subset data." + ) + } + + # Subset and return copy + adata_subset <- adata[, subset_values]\$copy() + adata_subset +} + +# Read input data +cat("Reading input file\\n") +input_mdata <- mudata\$read_h5mu(par\$input) +input_adata <- input_mdata\$mod[[par\$modality]] +input_matrix <- Matrix::t( + get_layer(input_adata, par\$input_layer, par\$input_var_gene_names) +) + +# Read reference +cat("Reading reference file\\n") +ref_adata <- mudata\$read_h5ad(par\$reference, mod = "rna") +if (!is.null(par\$reference_var_input)) { + ref_adata <- subset_vars(ref_adata, par\$reference_var_input) +} +ref_matrix <- Matrix::t( + get_layer(ref_adata, par\$reference_layer, par\$reference_var_gene_names) +) + +# Check overlap genes +cat("Checking overlap of genes between input and reference file\\n") +common_ens_ids <- intersect(rownames(ref_matrix), rownames(input_matrix)) +if (length(common_ens_ids) < par\$input_reference_gene_overlap) { + stop( + "The intersection of genes between the query and reference is too small.\\n", + paste0("Expected overlap: ", par\$input_reference_gene_overlap), + paste0("Detected overlap: ", length(common_ens_ids)) + ) +} + +# Calculate CPU cores +n_workers <- meta\$cpus %||% max(1, parallel::detectCores() - 1) + +# Assign target labels +if (!par\$reference_obs_target %in% colnames(ref_adata\$obs)) { + stop( + "Requested to use .obs column '", + par\$reference_obs_target, + "' as target labels, but the column is not available." + ) +} else { + target_labels <- ref_adata\$obs[[par\$reference_obs_target]] +} + +cat("Performing SingleR cell type prediction\\n") +predictions <- SingleR( + test = input_matrix, + ref = ref_matrix, + labels = target_labels, + clusters = par\$input_obs_clusters, + genes = "de", + de.method = par\$de_method, + de.n = par\$de_n_genes, + quantile = par\$quantile, + fine.tune = par\$fine_tune, + tune.thresh = par\$fine_tuning_threshold, + prune = par\$prune, + num.threads = n_workers +) + +cat("Writing output data\\n") +# Writing output slots +input_adata\$obs[[par\$output_obs_predictions]] <- + predictions\$labels +input_adata\$obs[[par\$output_obs_probability]] <- + apply(predictions\$scores, 1, max) +input_adata\$obs[[par\$output_obs_delta_next]] <- + predictions\$delta.next +input_adata\$obs[[par\$output_obs_pruned_predictions]] <- + predictions\$pruned.labels +input_adata\$obsm[[par\$output_obsm_scores]] <- + predictions\$scores +# Writing output H5MU +input_mdata\$write(par\$output, compression = par\$output_compression) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/annotate/svm_annotation/.config.vsh.yaml b/target/executable/annotate/svm_annotation/.config.vsh.yaml new file mode 100644 index 00000000..013482a3 --- /dev/null +++ b/target/executable/annotate/svm_annotation/.config.vsh.yaml @@ -0,0 +1,440 @@ +name: "svm_annotation" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "Key in .obs attribute of reference modality with cell-type information.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_prediction" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "svm_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "svm_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--feature_selection" + description: "Whether to perform feature selection." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations for the SVM." + info: null + default: + - 5000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--c_reg" + description: "Regularization parameter for the SVM." + info: null + default: + - 1.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--class_weight" + description: "\"Class weights for the SVM. The `uniform` mode gives all classes\ + \ a weight of one. \nThe `balanced` mode (default) uses the values of y to\ + \ automatically adjust weights inversely \nproportional to class frequencies\ + \ in the input data as n_samples / (n_classes * np.bincount(y))\"\n" + info: null + default: + - "balanced" + required: false + choices: + - "balanced" + - "uniform" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of SVMs." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scikit-learn==1.5.2" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/svm_annotation/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/annotate/svm_annotation" + executable: "target/executable/annotate/svm_annotation/svm_annotation" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/annotate/svm_annotation/cross_check_genes.py b/target/executable/annotate/svm_annotation/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/executable/annotate/svm_annotation/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/executable/annotate/svm_annotation/nextflow_labels.config b/target/executable/annotate/svm_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/annotate/svm_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/annotate/svm_annotation/set_var_index.py b/target/executable/annotate/svm_annotation/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/annotate/svm_annotation/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/annotate/svm_annotation/setup_logger.py b/target/executable/annotate/svm_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/annotate/svm_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/annotate/svm_annotation/subset_vars.py b/target/executable/annotate/svm_annotation/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/annotate/svm_annotation/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/annotate/svm_annotation/svm_annotation b/target/executable/annotate/svm_annotation/svm_annotation new file mode 100755 index 00000000..d82c6ff0 --- /dev/null +++ b/target/executable/annotate/svm_annotation/svm_annotation @@ -0,0 +1,1665 @@ +#!/usr/bin/env bash + +# svm_annotation main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="svm_annotation" +VIASH_META_FUNCTIONALITY_NAME="svm_annotation" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scikit-learn==1.5.2" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component annotate svm_annotation" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "svm_annotation main" + echo "" + echo "Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs." + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input data to be used for cell type annotation if .X is" + echo " not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the input data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data to be used for cell type annotation if" + echo " .X is not to be used. Data are expected to be processed in the same way" + echo " as the --input query dataset." + echo "" + echo " --reference_obs_target" + echo " type: string, required parameter" + echo " Key in .obs attribute of reference modality with cell-type information." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the adata var column in the reference data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obs_prediction" + echo " type: string" + echo " default: svm_pred" + echo " In which \`.obs\` slots to store the predicted information." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: svm_probability" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Model arguments:" + echo " Model arguments." + echo "" + echo " --model" + echo " type: file, file must exist" + echo " example: pretrained_model.pkl" + echo " Pretrained model in pkl format. If not provided, the model will be" + echo " trained on the reference data and --reference should be provided." + echo "" + echo " --feature_selection" + echo " type: boolean" + echo " default: true" + echo " Whether to perform feature selection." + echo "" + echo " --max_iter" + echo " type: integer" + echo " default: 5000" + echo " min: 1" + echo " Maximum number of iterations for the SVM." + echo "" + echo " --c_reg" + echo " type: double" + echo " default: 1.0" + echo " min: 0.0" + echo " Regularization parameter for the SVM." + echo "" + echo " --class_weight" + echo " type: string" + echo " default: balanced" + echo " choices: [ balanced, uniform ]" + echo " \"Class weights for the SVM. The \`uniform\` mode gives all classes a" + echo " weight of one." + echo " The \`balanced\` mode (default) uses the values of y to automatically" + echo " adjust weights inversely" + echo " proportional to class frequencies in the input data as n_samples /" + echo " (n_classes * np.bincount(y))\"" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "svm_annotation main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_target) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_target. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_target=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_TARGET" ] && ViashError Bad arguments for option \'--reference_obs_target=*\': \'$VIASH_PAR_REFERENCE_OBS_TARGET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_TARGET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_input) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_input=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_INPUT" ] && ViashError Bad arguments for option \'--reference_var_input=*\': \'$VIASH_PAR_REFERENCE_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_prediction) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTION" ] && ViashError Bad arguments for option \'--output_obs_prediction\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_prediction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_prediction=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTION" ] && ViashError Bad arguments for option \'--output_obs_prediction=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --feature_selection) + [ -n "$VIASH_PAR_FEATURE_SELECTION" ] && ViashError Bad arguments for option \'--feature_selection\': \'$VIASH_PAR_FEATURE_SELECTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_SELECTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_selection. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_selection=*) + [ -n "$VIASH_PAR_FEATURE_SELECTION" ] && ViashError Bad arguments for option \'--feature_selection=*\': \'$VIASH_PAR_FEATURE_SELECTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_SELECTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_iter) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_iter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_iter=*) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter=*\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --c_reg) + [ -n "$VIASH_PAR_C_REG" ] && ViashError Bad arguments for option \'--c_reg\': \'$VIASH_PAR_C_REG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_C_REG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --c_reg. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --c_reg=*) + [ -n "$VIASH_PAR_C_REG" ] && ViashError Bad arguments for option \'--c_reg=*\': \'$VIASH_PAR_C_REG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_C_REG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --class_weight) + [ -n "$VIASH_PAR_CLASS_WEIGHT" ] && ViashError Bad arguments for option \'--class_weight\': \'$VIASH_PAR_CLASS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLASS_WEIGHT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --class_weight. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --class_weight=*) + [ -n "$VIASH_PAR_CLASS_WEIGHT" ] && ViashError Bad arguments for option \'--class_weight=*\': \'$VIASH_PAR_CLASS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLASS_WEIGHT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/annotate/svm_annotation:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then + ViashError '--reference_obs_target' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTION+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTION="svm_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="svm_probability" +fi +if [ -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then + VIASH_PAR_FEATURE_SELECTION="true" +fi +if [ -z ${VIASH_PAR_MAX_ITER+x} ]; then + VIASH_PAR_MAX_ITER="5000" +fi +if [ -z ${VIASH_PAR_C_REG+x} ]; then + VIASH_PAR_C_REG="1.0" +fi +if [ -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then + VIASH_PAR_CLASS_WEIGHT="balanced" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FEATURE_SELECTION" ]]; then + if ! [[ "$VIASH_PAR_FEATURE_SELECTION" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--feature_selection' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_ITER" ]]; then + if ! [[ "$VIASH_PAR_MAX_ITER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_iter' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MAX_ITER -lt 1 ]]; then + ViashError '--max_iter' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_C_REG" ]]; then + if ! [[ "$VIASH_PAR_C_REG" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--c_reg' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_C_REG '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--c_reg' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_C_REG -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--c_reg' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--c_reg' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CLASS_WEIGHT" ]; then + VIASH_PAR_CLASS_WEIGHT_CHOICES=("balanced;uniform") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CLASS_WEIGHT_CHOICES[*]};" =~ ";$VIASH_PAR_CLASS_WEIGHT;" ]]; then + ViashError '--class_weight' specified value of \'$VIASH_PAR_CLASS_WEIGHT\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-svm_annotation-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import numpy as np +from sklearn.calibration import CalibratedClassifierCV +from sklearn import svm +import pickle + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_prediction': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'feature_selection': $( if [ ! -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then echo "r'${VIASH_PAR_FEATURE_SELECTION//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'c_reg': $( if [ ! -z ${VIASH_PAR_C_REG+x} ]; then echo "float(r'${VIASH_PAR_C_REG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'class_weight': $( if [ ! -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then echo "r'${VIASH_PAR_CLASS_WEIGHT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = CalibratedClassifierCV( + svm.LinearSVC( + C=par["c_reg"], + max_iter=par["max_iter"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + dual="auto", + ) + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_prediction"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/cluster/leiden/.config.vsh.yaml b/target/executable/cluster/leiden/.config.vsh.yaml new file mode 100644 index 00000000..3167f72e --- /dev/null +++ b/target/executable/cluster/leiden/.config.vsh.yaml @@ -0,0 +1,305 @@ +name: "leiden" +namespace: "cluster" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot the neighbor connectivities can be found." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_name" + description: "Name of the .obsm key under which to add the cluster labels.\nThe\ + \ name of the columns in the matrix will correspond to the resolutions.\n" + info: null + default: + - "leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--resolution" + description: "A parameter value controlling the coarseness of the clustering.\ + \ Higher values lead to more clusters.\nMultiple values will result in clustering\ + \ being performed multiple times.\n" + info: null + default: + - 1.0 + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cluster cells using the [Leiden algorithm] [Traag18] implemented in\ + \ the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain\ + \ algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15]\ + \ [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn`\ + \ first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in\ + \ large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven\ + \ Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with\ + \ Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing\ + \ well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale\ + \ single-cell gene expression data analysis, Genome Biology. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.13-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "leidenalg~=0.10.0" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/cluster/leiden/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/cluster/leiden" + executable: "target/executable/cluster/leiden/leiden" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/cluster/leiden/compress_h5mu.py b/target/executable/cluster/leiden/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/cluster/leiden/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/cluster/leiden/leiden b/target/executable/cluster/leiden/leiden new file mode 100755 index 00000000..8e33c945 --- /dev/null +++ b/target/executable/cluster/leiden/leiden @@ -0,0 +1,1594 @@ +#!/usr/bin/env bash + +# leiden main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="leiden" +VIASH_META_FUNCTIONALITY_NAME="leiden" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.13-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "leidenalg~=0.10.0" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer" +LABEL org.opencontainers.image.description="Companion container for running component cluster leiden" +LABEL org.opencontainers.image.created="2025-08-22T07:23:09Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "leiden main" + echo "" + echo "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy" + echo "framework] [Wolf18]." + echo "Leiden is an improved version of the [Louvain algorithm] [Blondel08]." + echo "It has been proposed for single-cell analysis by [Levine15] [Levine15]." + echo "This requires having ran \`neighbors/find_neighbors\` or \`neighbors/bbknn\` first." + echo "" + echo "[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large" + echo "networks, J. Stat. Mech." + echo "[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML" + echo "Reveals Progenitor-like Cells that Correlate with Prognosis, Cell." + echo "[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing" + echo "well-connected communities arXiv." + echo "[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression" + echo "data analysis, Genome Biology." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obsp_connectivities" + echo " type: string" + echo " default: connectivities" + echo " In which .obsp slot the neighbor connectivities can be found." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output file." + echo "" + echo " --obsm_name" + echo " type: string" + echo " default: leiden" + echo " Name of the .obsm key under which to add the cluster labels." + echo " The name of the columns in the matrix will correspond to the" + echo " resolutions." + echo "" + echo " --resolution" + echo " type: double, required parameter, multiple values allowed" + echo " default: 1.0" + echo " A parameter value controlling the coarseness of the clustering. Higher" + echo " values lead to more clusters." + echo " Multiple values will result in clustering being performed multiple" + echo " times." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "leiden main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsp_connectivities) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsp_connectivities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsp_connectivities=*) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities=*\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_name) + [ -n "$VIASH_PAR_OBSM_NAME" ] && ViashError Bad arguments for option \'--obsm_name\': \'$VIASH_PAR_OBSM_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_name=*) + [ -n "$VIASH_PAR_OBSM_NAME" ] && ViashError Bad arguments for option \'--obsm_name=*\': \'$VIASH_PAR_OBSM_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --resolution) + if [ -z "$VIASH_PAR_RESOLUTION" ]; then + VIASH_PAR_RESOLUTION="$2" + else + VIASH_PAR_RESOLUTION="$VIASH_PAR_RESOLUTION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --resolution. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --resolution=*) + if [ -z "$VIASH_PAR_RESOLUTION" ]; then + VIASH_PAR_RESOLUTION=$(ViashRemoveFlags "$1") + else + VIASH_PAR_RESOLUTION="$VIASH_PAR_RESOLUTION;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/cluster/leiden:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_RESOLUTION+x} ]; then + ViashError '--resolution' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then + VIASH_PAR_OBSP_CONNECTIVITIES="connectivities" +fi +if [ -z ${VIASH_PAR_OBSM_NAME+x} ]; then + VIASH_PAR_OBSM_NAME="leiden" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [ -n "$VIASH_PAR_RESOLUTION" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_RESOLUTION; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--resolution' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-leiden-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import signal +import os +import time +import logging +import logging.handlers +import warnings +import mudata as mu +import pandas as pd +import scanpy as sc +import numpy as np +import numpy.typing as npt +import anndata as ad +from multiprocessing import managers, shared_memory, get_context +from concurrent.futures import ProcessPoolExecutor, process, as_completed +from scipy.sparse import csr_matrix +from pathlib import Path +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_name': $( if [ ! -z ${VIASH_PAR_OBSM_NAME+x} ]; then echo "r'${VIASH_PAR_OBSM_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resolution': $( if [ ! -z ${VIASH_PAR_RESOLUTION+x} ]; then echo "list(map(float, r'${VIASH_PAR_RESOLUTION//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) + +from compress_h5mu import compress_h5mu + +_shared_logger_name = "leiden" + + +# Function to check available space in /dev/shm +def get_available_shared_memory(): + shm_path = "/dev/shm" + shm_stats = os.statvfs(shm_path) + return shm_stats.f_bsize * shm_stats.f_bavail + + +class SharedNumpyMatrix: + def __init__( + self, + shared_memory: shared_memory.SharedMemory, + dtype: npt.DTypeLike, + shape: tuple[int, int], + ) -> None: + self._memory = shared_memory + self._dtype = dtype + self._shape = shape + + @classmethod + def from_numpy( + cls, memory_manager: managers.SharedMemoryManager, array: npt.ArrayLike + ): + available_shared_memory = get_available_shared_memory() + n_bytes_required = array.nbytes + if available_shared_memory < n_bytes_required: + raise ValueError( + "Not enough shared memory (/dev/shm) is available to load the data. " + f"Required amount: {n_bytes_required}, available: {available_shared_memory}." + ) + shm = memory_manager.SharedMemory(size=array.nbytes) + array_in_shared_memory = np.ndarray( + array.shape, dtype=array.dtype, buffer=shm.buf + ) + # Copy the data into shared memory + array_in_shared_memory[:] = array[:] + return cls(shm, array.dtype, array.shape) + + def to_numpy(self): + return np.ndarray(self._shape, dtype=self._dtype, buffer=self._memory.buf) + + def close(self): + self._memory.close() + + +class SharedCsrMatrix: + def __init__( + self, + data: SharedNumpyMatrix, + indices: SharedNumpyMatrix, + indptr: SharedNumpyMatrix, + shape: npt.DTypeLike, + ): + self._data = data + self._indices = indices + self._indptr = indptr + self._shape = shape + + @classmethod + def from_csr_matrix( + cls, memory_manager: managers.SharedMemoryManager, csr_matrix_obj: csr_matrix + ): + return cls( + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.data), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indices), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indptr), + csr_matrix_obj.shape, + ) + + def to_csr_matrix(self): + return csr_matrix( + (self._data.to_numpy(), self._indices.to_numpy(), self._indptr.to_numpy()), + shape=self._shape, + copy=False, + ) + + def close(self): + self._data.close() + self._indices.close() + self._indptr.close() + + +def create_empty_anndata_with_connectivities(connectivities, obs_names): + empty_anndata = ad.AnnData( + np.zeros((connectivities.shape[0], 1)), obs=pd.DataFrame(index=list(obs_names)) + ) + empty_anndata.obsp["connectivities"] = connectivities + return empty_anndata + + +def run_single_resolution(shared_csr_matrix, obs_names, resolution): + logger = logging.getLogger(_shared_logger_name) + logger.info( + "Process with PID '%s' for resolution '%s' started", os.getpid(), resolution + ) + try: + connectivities = shared_csr_matrix.to_csr_matrix() + adata = create_empty_anndata_with_connectivities(connectivities, obs_names) + with warnings.catch_warnings(): + # In the future, the default backend for leiden will be igraph instead of leidenalg. + warnings.simplefilter(action="ignore", category=FutureWarning) + adata_out = sc.tl.leiden( + adata, + resolution=resolution, + key_added=str(resolution), + obsp="connectivities", + copy=True, + ) + logger.info(f"Returning result for resolution {resolution}") + return adata_out.obs[str(resolution)] + finally: + obs_names.shm.close() + shared_csr_matrix.close() + + +def init_worker(parent_process_id, exit_event, log_queue, log_level): + import os + import threading + import time + + pid = os.getpid() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Initializing process %s", pid) + + def exit_if_orphaned(): + logger.info( + "Starting orphanned process checker for process %s, parent process %s.", + pid, + parent_process_id, + ) + while True: + # Check if parent process is gone + try: + # If sig is 0, then no signal is sent, but error checking is still performed; + # this can be used to check for the existence of a process ID + os.kill(parent_process_id, 0) + except ProcessLookupError: + logger.info("Parent process is gone, shutting down %s", pid) + # Kill self + os.kill(pid, signal.SIGTERM) + time.sleep(0.2) + # Parent process requested exit + try: + exit_event_set = exit_event.wait(timeout=1) + except BrokenPipeError: + logger.info( + "Checking for shutdown resulted in BrokenPipeError, " + "parent process is most likely gone. Shutting down %s", + pid, + ) + os.kill(pid, signal.SIGTERM) + else: + if exit_event_set: + logger.info("Exit event set, shutting down %s", pid) + os.kill(pid, signal.SIGTERM) + time.sleep(1) + + threading.Thread(target=exit_if_orphaned, daemon=True).start() + logger.info( + "Initialization of process %s is complete, process is now waiting for work.", + pid, + ) + + +def main(): + with managers.SyncManager() as syncm: + log_level = logging.INFO + log_format = "%(name)s:%(levelname)s:%(asctime)s: %(message)s" + formatter = logging.Formatter(log_format) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + log_queue = syncm.Queue() + log_listener = logging.handlers.QueueListener(log_queue, console_handler) + log_listener.start() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Reading %s.", par["input"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"], backed="r") + logger.info("Processing modality '%s'.", par["modality"]) + try: + connectivities = adata.obsp[par["obsp_connectivities"]] + except KeyError: + raise ValueError( + f'Could not find .obsp key "{par["obsp_connectivities"]}" ' + "in modality {par['modality']}" + ) + + # An event that, when triggered, will kill the child processes that are still running + exit_early_event = syncm.Event() + with managers.SharedMemoryManager() as smm: + # anndata converts the index to strings, so no worries that it cannot be stored in ShareableList + # because it has an unsupported dtype. It should always be string... + index_contents = adata.obs.index.to_list() + assert all([isinstance(item, str) for item in index_contents]) + obs_names = smm.ShareableList(index_contents) + + shared_csr_matrix = SharedCsrMatrix.from_csr_matrix(smm, connectivities) + results = {} + n_workers = ( + meta["cpus"] - 2 if (meta["cpus"] and (meta["cpus"] - 2) > 0) else 1 + ) + logger.info(f"Requesting {n_workers} workers") + executor = ProcessPoolExecutor( + max_workers=n_workers, + max_tasks_per_child=1, + mp_context=get_context("spawn"), + initializer=init_worker, + initargs=((os.getpid(), exit_early_event, log_queue, log_level)), + ) + pending_futures = { + executor.submit( + run_single_resolution, shared_csr_matrix, obs_names, resolution + ): resolution + for resolution in par["resolution"] + } + try: + logger.info("All futures sheduled") + for done_future in as_completed(pending_futures): + resolution = pending_futures[done_future] + data = done_future.result() + logger.info(f"Processed resolution '{resolution}'") + results[str(resolution)] = data + except process.BrokenProcessPool: + # This assumes that one of the child processses was killed by the kernel + # because the oom killer was activated. This the is the most likely scenario, + # other causes could be: + # * Subprocess terminates without raising a proper exception. + # * The code of the process handling the communication is broke (i.e. a python bug) + # * The return data could not be pickled. + logger.error("BrokenProcessPool is raised") + executor.shutdown(wait=False, cancel_futures=True) + time.sleep(3) + exit_early_event.set() + time.sleep(3) + sys.exit(137) + finally: + logger.info("Closing shared resources in main process") + shared_csr_matrix.close() + obs_names.shm.close() + logger.info("Shared resources closed") + logger.info( + "Shutting down logging queue and re-enabling logging from main thread." + ) + log_listener.enqueue_sentinel() + log_listener.stop() + stdout_handler = logging.StreamHandler() + stdout_handler.setFormatter(formatter) + logger.addHandler(stdout_handler) + logger.info("Re-enabled logging in main thread.") + logger.info("Waiting for shutdown of processes") + executor.shutdown() + logger.info("Executor shut down.") + adata.obsm[par["obsm_name"]] = pd.DataFrame(results) + + output_file = Path(par["output"]) + logger.info("Writing output to %s.", par["output"]) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if par["output_compression"] + else output_file + ) + shutil.copyfile(par["input"], output_file_uncompressed) + mu.write_h5ad( + filename=output_file_uncompressed, mod=par["modality"], data=adata + ) + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, + output_file, + compression=par["output_compression"], + ) + output_file_uncompressed.unlink() + logger.info("Finished.") + log_listener.enqueue_sentinel() + time.sleep(3) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/cluster/leiden/nextflow_labels.config b/target/executable/cluster/leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/cluster/leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/cluster/leiden/setup_logger.py b/target/executable/cluster/leiden/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/cluster/leiden/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/compression/compress_h5mu/.config.vsh.yaml b/target/executable/compression/compress_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..9e4bdf83 --- /dev/null +++ b/target/executable/compression/compress_h5mu/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "compress_h5mu" +namespace: "compression" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "location of output file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Compress a MuData file. \n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/compression/compress_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/compression/compress_h5mu" + executable: "target/executable/compression/compress_h5mu/compress_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/compression/compress_h5mu/compress_h5mu b/target/executable/compression/compress_h5mu/compress_h5mu new file mode 100755 index 00000000..2492a182 --- /dev/null +++ b/target/executable/compression/compress_h5mu/compress_h5mu @@ -0,0 +1,1160 @@ +#!/usr/bin/env bash + +# compress_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="compress_h5mu" +VIASH_META_FUNCTIONALITY_NAME="compress_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component compression compress_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "compress_h5mu main" + echo "" + echo "Compress a MuData file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: sample_path" + echo " Path to the input .h5mu." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " location of output file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "compress_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/compression/compress_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-compress_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu + +if __name__ == "__main__": + compress_h5mu(par["input"], par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/compression/compress_h5mu/compress_h5mu.py b/target/executable/compression/compress_h5mu/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/compression/compress_h5mu/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/compression/compress_h5mu/nextflow_labels.config b/target/executable/compression/compress_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/compression/compress_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/compression/tar_extract/.config.vsh.yaml b/target/executable/compression/tar_extract/.config.vsh.yaml new file mode 100644 index 00000000..c88489e8 --- /dev/null +++ b/target/executable/compression/tar_extract/.config.vsh.yaml @@ -0,0 +1,219 @@ +name: "tar_extract" +namespace: "compression" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input file" + info: null + example: + - "input.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Folder to restore file(s) to." + info: null + example: + - "output_folder" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--strip_components" + alternatives: + - "-s" + description: "Strip this amount of leading components from file names on extraction.\ + \ For example, to extract only 'myfile.txt' from an archive containing the structure\ + \ `this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--exclude" + alternatives: + - "-e" + description: "Prevents any file or member whose name matches the shell wildcard\ + \ (pattern) from being extracted." + info: null + example: + - "docs/figures" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Extract files from a tar archive" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "LICENSE" +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/compression/tar_extract/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/compression/tar_extract" + executable: "target/executable/compression/tar_extract/tar_extract" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/compression/tar_extract/nextflow_labels.config b/target/executable/compression/tar_extract/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/compression/tar_extract/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/compression/tar_extract/tar_extract b/target/executable/compression/tar_extract/tar_extract new file mode 100755 index 00000000..69cb65c1 --- /dev/null +++ b/target/executable/compression/tar_extract/tar_extract @@ -0,0 +1,1180 @@ +#!/usr/bin/env bash + +# tar_extract main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="tar_extract" +VIASH_META_FUNCTIONALITY_NAME="tar_extract" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:latest +ENTRYPOINT [] +LABEL org.opencontainers.image.description="Companion container for running component compression tar_extract" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "tar_extract main" + echo "" + echo "Extract files from a tar archive" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.tar.gz" + echo " Input file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output_folder" + echo " Folder to restore file(s) to." + echo "" + echo " -s, --strip_components" + echo " type: integer" + echo " example: 1" + echo " Strip this amount of leading components from file names on extraction." + echo " For example, to extract only 'myfile.txt' from an archive containing the" + echo " structure \`this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'." + echo "" + echo " -e, --exclude" + echo " type: string" + echo " example: docs/figures" + echo " Prevents any file or member whose name matches the shell wildcard" + echo " (pattern) from being extracted." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "tar_extract main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --strip_components) + [ -n "$VIASH_PAR_STRIP_COMPONENTS" ] && ViashError Bad arguments for option \'--strip_components\': \'$VIASH_PAR_STRIP_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRIP_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --strip_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --strip_components=*) + [ -n "$VIASH_PAR_STRIP_COMPONENTS" ] && ViashError Bad arguments for option \'--strip_components=*\': \'$VIASH_PAR_STRIP_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRIP_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + -s) + [ -n "$VIASH_PAR_STRIP_COMPONENTS" ] && ViashError Bad arguments for option \'-s\': \'$VIASH_PAR_STRIP_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRIP_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude) + [ -n "$VIASH_PAR_EXCLUDE" ] && ViashError Bad arguments for option \'--exclude\': \'$VIASH_PAR_EXCLUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude=*) + [ -n "$VIASH_PAR_EXCLUDE" ] && ViashError Bad arguments for option \'--exclude=*\': \'$VIASH_PAR_EXCLUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -e) + [ -n "$VIASH_PAR_EXCLUDE" ] && ViashError Bad arguments for option \'-e\': \'$VIASH_PAR_EXCLUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -e. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/compression/tar_extract:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_STRIP_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_STRIP_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--strip_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-tar_extract-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/usr/bin/env bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_STRIP_COMPONENTS+x} ]; then echo "${VIASH_PAR_STRIP_COMPONENTS}" | sed "s#'#'\"'\"'#g;s#.*#par_strip_components='&'#" ; else echo "# par_strip_components="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE+x} ]; then echo "${VIASH_PAR_EXCLUDE}" | sed "s#'#'\"'\"'#g;s#.*#par_exclude='&'#" ; else echo "# par_exclude="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=() +mkdir -p \$par_output # Create output directory if it doesn't exist already + +if [ "\$par_strip_components" != "" ]; then + extra_params+=("--strip-components=\$par_strip_components") +fi + +if [ "\$par_exclude" != "" ]; then + extra_params+=("--exclude=\$par_exclude") +fi + +echo "Extracting \$par_input to \$par_output..." +echo "" +tar "\${extra_params[@]}" -xvf "\$par_input" -C "\$par_output" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_10xh5_to_h5mu/.config.vsh.yaml b/target/executable/convert/from_10xh5_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..5ba69f88 --- /dev/null +++ b/target/executable/convert/from_10xh5_to_h5mu/.config.vsh.yaml @@ -0,0 +1,352 @@ +name: "from_10xh5_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "A 10x h5 file as generated by Cell Ranger." + info: null + example: + - "raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_metrics_summary" + description: "A metrics summary csv file as generated by Cell Ranger." + info: null + example: + - "metrics_cellranger.h5" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: + slots: + mod: + - name: "rna" + required: true + description: "Gene expression counts." + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + - name: "prot" + required: false + description: "Protein abundancy" + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + - name: "vdj" + required: false + description: "VDJ transcript counts" + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_cellranger" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--min_genes" + description: "Minimum number of counts required for a cell to pass filtering." + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts" + description: "Minimum number of genes expressed required for a cell to pass filtering." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a 10x h5 into an h5mu file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_10xh5_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_10xh5_to_h5mu" + executable: "target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu b/target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu new file mode 100755 index 00000000..2d9f7a4e --- /dev/null +++ b/target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu @@ -0,0 +1,1314 @@ +#!/usr/bin/env bash + +# from_10xh5_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_10xh5_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="from_10xh5_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_10xh5_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:03Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_10xh5_to_h5mu main" + echo "" + echo "Converts a 10x h5 into an h5mu file." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: raw_feature_bc_matrix.h5" + echo " A 10x h5 file as generated by Cell Ranger." + echo "" + echo " --input_metrics_summary" + echo " type: file, file must exist" + echo " example: metrics_cellranger.h5" + echo " A metrics summary csv file as generated by Cell Ranger." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --uns_metrics" + echo " type: string" + echo " default: metrics_cellranger" + echo " Name of the .uns slot under which to QC metrics (if any)." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --min_genes" + echo " type: integer" + echo " example: 100" + echo " Minimum number of counts required for a cell to pass filtering." + echo "" + echo " --min_counts" + echo " type: integer" + echo " example: 1000" + echo " Minimum number of genes expressed required for a cell to pass filtering." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_10xh5_to_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_metrics_summary) + [ -n "$VIASH_PAR_INPUT_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--input_metrics_summary\': \'$VIASH_PAR_INPUT_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_METRICS_SUMMARY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_metrics_summary. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_metrics_summary=*) + [ -n "$VIASH_PAR_INPUT_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--input_metrics_summary=*\': \'$VIASH_PAR_INPUT_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_METRICS_SUMMARY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_metrics) + [ -n "$VIASH_PAR_UNS_METRICS" ] && ViashError Bad arguments for option \'--uns_metrics\': \'$VIASH_PAR_UNS_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_METRICS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_metrics=*) + [ -n "$VIASH_PAR_UNS_METRICS" ] && ViashError Bad arguments for option \'--uns_metrics=*\': \'$VIASH_PAR_UNS_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_METRICS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_genes) + [ -n "$VIASH_PAR_MIN_GENES" ] && ViashError Bad arguments for option \'--min_genes\': \'$VIASH_PAR_MIN_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_genes=*) + [ -n "$VIASH_PAR_MIN_GENES" ] && ViashError Bad arguments for option \'--min_genes=*\': \'$VIASH_PAR_MIN_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_counts) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_counts=*) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts=*\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_10xh5_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_UNS_METRICS+x} ]; then + VIASH_PAR_UNS_METRICS="metrics_cellranger" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_METRICS_SUMMARY" ] && [ ! -e "$VIASH_PAR_INPUT_METRICS_SUMMARY" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_METRICS_SUMMARY' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_GENES" ]]; then + if ! [[ "$VIASH_PAR_MIN_GENES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_genes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_INPUT_METRICS_SUMMARY" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_METRICS_SUMMARY")" ) + VIASH_PAR_INPUT_METRICS_SUMMARY=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_METRICS_SUMMARY") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_10xh5_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata +import scanpy as sc +import sys +import pandas as pd + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_metrics_summary': $( if [ ! -z ${VIASH_PAR_INPUT_METRICS_SUMMARY+x} ]; then echo "r'${VIASH_PAR_INPUT_METRICS_SUMMARY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_metrics': $( if [ ! -z ${VIASH_PAR_UNS_METRICS+x} ]; then echo "r'${VIASH_PAR_UNS_METRICS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_genes': $( if [ ! -z ${VIASH_PAR_MIN_GENES+x} ]; then echo "int(r'${VIASH_PAR_MIN_GENES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_h5(par["input"], gex_only=False) + +# set the gene ids as var_names +logger.info("Renaming var columns") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# parse metrics summary file and store in .uns +if par["input_metrics_summary"] and par["uns_metrics"]: + logger.info("Reading metrics summary file '%s'", par["input_metrics_summary"]) + + def read_percentage(val): + try: + return float(val.strip("%")) / 100 + except AttributeError: + return val + + metrics_summary = pd.read_csv( + par["input_metrics_summary"], decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + logger.info("Storing metrics summary in .uns['%s']", par["uns_metrics"]) + adata.uns[par["uns_metrics"]] = metrics_summary +else: + is_none = ( + "input_metrics_summary" if not par["input_metrics_summary"] else "uns_metrics" + ) + logger.info("Not storing metrics summary because par['%s'] is None", is_none) + +# might perform basic filtering to get rid of some data +# applicable when starting from the raw counts +if par["min_genes"]: + logger.info("Filtering with min_genes=%d", par["min_genes"]) + sc.pp.filter_cells(adata, min_genes=par["min_genes"]) + +if par["min_counts"]: + logger.info("Filtering with min_counts=%d", par["min_counts"]) + sc.pp.filter_cells(adata, min_counts=par["min_counts"]) + +# generate output +logger.info("Convert to mudata") +mdata = mudata.MuData(adata) + +# override root .obs and .uns +mdata.obs = adata.obs +mdata.uns = adata.uns + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_INPUT_METRICS_SUMMARY" ]; then + VIASH_PAR_INPUT_METRICS_SUMMARY=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_METRICS_SUMMARY") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_10xh5_to_h5mu/nextflow_labels.config b/target/executable/convert/from_10xh5_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_10xh5_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_10xh5_to_h5mu/setup_logger.py b/target/executable/convert/from_10xh5_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_10xh5_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_10xmtx_to_h5mu/.config.vsh.yaml b/target/executable/convert/from_10xmtx_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..b17cceda --- /dev/null +++ b/target/executable/convert/from_10xmtx_to_h5mu/.config.vsh.yaml @@ -0,0 +1,253 @@ +name: "from_10xmtx_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input mtx folder" + info: null + example: + - "input_dir_containing_gz_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a 10x mtx into an h5mu file.\n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_10xmtx_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_10xmtx_to_h5mu" + executable: "target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu b/target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu new file mode 100755 index 00000000..cea91613 --- /dev/null +++ b/target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu @@ -0,0 +1,1181 @@ +#!/usr/bin/env bash + +# from_10xmtx_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_10xmtx_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="from_10xmtx_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_10xmtx_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_10xmtx_to_h5mu main" + echo "" + echo "Converts a 10x mtx into an h5mu file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input_dir_containing_gz_files" + echo " Input mtx folder" + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_10xmtx_to_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_10xmtx_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_10xmtx_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_mtx(par["input"], gex_only=False) + +logger.info("Renaming keys.") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# generate output +logger.info("Convert to mudata") +mdata = mu.MuData(adata) + +# override root .obs +mdata.obs = adata.obs + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_10xmtx_to_h5mu/nextflow_labels.config b/target/executable/convert/from_10xmtx_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_10xmtx_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_10xmtx_to_h5mu/setup_logger.py b/target/executable/convert/from_10xmtx_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_10xmtx_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml new file mode 100644 index 00000000..76e8f27c --- /dev/null +++ b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml @@ -0,0 +1,228 @@ +name: "from_bd_to_10x_molecular_barcode_tags" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input SAM or BAM file." + info: null + example: + - "input.bam" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output alignment file." + info: null + example: + - "output.sam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--bam" + description: "Output a BAM file." + info: null + direction: "input" + - type: "integer" + name: "--threads" + alternatives: + - "-t" + description: "Number of threads" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the molecular barcode sequence SAM tag from BD format (MA) to\ + \ 10X format (UB).\n" +test_resources: +- type: "bash_script" + path: "run_test.sh" + is_executable: true +- type: "file" + path: "output_raw" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "samtools" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_bd_to_10x_molecular_barcode_tags" + executable: "target/executable/convert/from_bd_to_10x_molecular_barcode_tags/from_bd_to_10x_molecular_barcode_tags" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/from_bd_to_10x_molecular_barcode_tags b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/from_bd_to_10x_molecular_barcode_tags new file mode 100755 index 00000000..21790525 --- /dev/null +++ b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/from_bd_to_10x_molecular_barcode_tags @@ -0,0 +1,1233 @@ +#!/usr/bin/env bash + +# from_bd_to_10x_molecular_barcode_tags main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_bd_to_10x_molecular_barcode_tags" +VIASH_META_FUNCTIONALITY_NAME="from_bd_to_10x_molecular_barcode_tags" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:latest +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y samtools && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component convert from_bd_to_10x_molecular_barcode_tags" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_bd_to_10x_molecular_barcode_tags main" + echo "" + echo "Convert the molecular barcode sequence SAM tag from BD format (MA) to 10X format" + echo "(UB)." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.bam" + echo " Input SAM or BAM file." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.sam" + echo " Output alignment file." + echo "" + echo " --bam" + echo " type: boolean_true" + echo " Output a BAM file." + echo "" + echo " -t, --threads" + echo " type: integer" + echo " Number of threads" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_bd_to_10x_molecular_barcode_tags main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=true + shift 1 + ;; + --threads) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --threads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --threads=*) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads=*\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + -t) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'-t\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_bd_to_10x_molecular_barcode_tags:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_BAM+x} ]; then + VIASH_PAR_BAM="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_BAM" ]]; then + if ! [[ "$VIASH_PAR_BAM" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--bam' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THREADS" ]]; then + if ! [[ "$VIASH_PAR_THREADS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--threads' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_bd_to_10x_molecular_barcode_tags-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "${VIASH_PAR_THREADS}" | sed "s#'#'\"'\"'#g;s#.*#par_threads='&'#" ; else echo "# par_threads="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Sam tags added by BD Rhapsody Pipeline +# From: https://www.bd.com/documents/guides/user-guides/GMX_BD-Rhapsody-genomics-informatics_UG_EN.pdf +# +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | A number between 1 and 96 3 (884,736) representing a unique cell label sequence | +# | | (CB = 0 when no cell label sequence is detected) | +# ----------------------------------------------------------------------------------------- +# | MR | Raw molecular identifier sequence | +# ----------------------------------------------------------------------------------------- +# | MA | RSEC-adjusted molecular identifier sequence. If not a true cell, the raw UMI is | +# | | repeated in this tag. | +# ----------------------------------------------------------------------------------------- +# | PT | T if a poly(T) tail was found in the expected position on R1, or F if poly(T) | +# | | was not found | +# ----------------------------------------------------------------------------------------- +# | CN | Indicates if a sequence is derived from a putative cell, as determined by the | +# | | cell label filtering algorithm (T: putative cell; x: invalid cell label or noise | +# | | cell) Note: You can distinguish between an invalid cell label and a noise cell | +# | | with the CB tag (invalid cell labels are 0). | +# ----------------------------------------------------------------------------------------- +# | ST | The value is 1-12, indicating the Sample Tag of the called putative cell, or M | +# | | for multiplet, or x for undetermined. | +# ========================================================================================= + + +# SAM tags added by 10X +# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | Chromium cellular barcode sequence that is error-corrected and confirmed against | +# | | a list of known-good barcode sequences. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. | +# ----------------------------------------------------------------------------------------- +# | CR | Chromium cellular barcode sequence as reported by the sequencer. For multiplex | +# | | Fixed RNA Profiling, the cellular barcode is a combination of the 10x GEM | +# | | Barcode and Probe Barcode sequences. | +# ----------------------------------------------------------------------------------------- +# | CY | Chromium cellular barcode read quality. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | UB | Chromium molecular barcode sequence that is error-corrected among other | +# | | molecular barcodes with the same cellular barcode and gene alignment. | +# ----------------------------------------------------------------------------------------- +# | UR | Chromium molecular barcode sequence as reported by the sequencer. | +# ----------------------------------------------------------------------------------------- +# | UY | Chromium molecular barcode read quality. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | TR | Trimmed sequence. For the Single Cell 3' v1 chemistry, this is trailing sequence | +# | | following the UMI on Read 2. For the Single Cell 3' v2 chemistry, this is | +# | | trailing sequence following the cell and molecular barcodes on Read 1. | +# ========================================================================================= + +extra_params=() + +if [ "\$par_bam" == "true" ]; then + extra_params+=("--bam") +fi + +cat \\ + <(samtools view -SH "\$par_input") \\ + <(samtools view "\$par_input" | grep "MA:Z:*" | sed "s/MA:Z:/UB:Z:/" ) | \\ +samtools view -Sh "\${extra_params[@]}" -@"\$par_threads" - > "\$par_output" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_bdrhap_to_h5mu/.config.vsh.yaml b/target/executable/convert/from_bdrhap_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..188a8d83 --- /dev/null +++ b/target/executable/convert/from_bdrhap_to_h5mu/.config.vsh.yaml @@ -0,0 +1,266 @@ +name: "from_bdrhap_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "A sample ID." + info: null + example: + - "my_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The output h5mu of a BD Rhapsody workflow." + info: null + example: + - "sample.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "sample.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_bdrhap_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_bdrhap_to_h5mu" + executable: "target/executable/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu b/target/executable/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu new file mode 100755 index 00000000..810a1c17 --- /dev/null +++ b/target/executable/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu @@ -0,0 +1,1225 @@ +#!/usr/bin/env bash + +# from_bdrhap_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author, maintainer) +# * Robrecht Cannoodt (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_bdrhap_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="from_bdrhap_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_bdrhap_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_bdrhap_to_h5mu main" + echo "" + echo "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file." + echo "" + echo "Arguments:" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Inputs:" + echo " --id" + echo " type: string, required parameter" + echo " example: my_id" + echo " A sample ID." + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: sample.h5mu" + echo " The output h5mu of a BD Rhapsody workflow." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_bdrhap_to_h5mu main" + exit + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --id) + [ -n "$VIASH_PAR_ID" ] && ViashError Bad arguments for option \'--id\': \'$VIASH_PAR_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id=*) + [ -n "$VIASH_PAR_ID" ] && ViashError Bad arguments for option \'--id=*\': \'$VIASH_PAR_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_bdrhap_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_ID+x} ]; then + ViashError '--id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_bdrhap_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'id': $( if [ ! -z ${VIASH_PAR_ID+x} ]; then echo "r'${VIASH_PAR_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print(">> Reading input file", flush=True) +mdata = mu.read_h5mu(par["input"]) + +# Check if modalities are present +modalities = list(mdata.mod.keys()) +assert len(modalities) > 0, "No modalities found in input data" + + +def process_modality_inline(adata, modality): + adata.obs["library_id"] = " & ".join(adata.uns["Pipeline_Inputs"]["Libraries"]) + adata.obs["cell_id"] = adata.obs.index + adata.obs["run_id"] = par["id"] + + adata.obs.rename( + columns={"Sample_Tag": "sample_tag", "Sample_Name": "sample_id"}, inplace=True + ) + + adata.var["gene_ids"] = adata.var.index + adata.var["gene_name"] = adata.var.index + + if modality == "rna": + adata.var["feature_type"] = "Gene Expression" + adata.var["reference_file"] = adata.uns["Pipeline_Inputs"]["Reference_Archive"] + + elif modality == "prot": + adata.var["feature_type"] = "Antibody Capture" + adata.var["reference_file"] = " & ".join( + adata.uns["Pipeline_Inputs"]["AbSeq_Reference"] + ) + + # TODO: add other modalities + + +for key, value in mdata.mod.items(): + print(">> Processing modality:", key, flush=True) + process_modality_inline(value, key) + +print(">> Writing output file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_bdrhap_to_h5mu/nextflow_labels.config b/target/executable/convert/from_bdrhap_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_bdrhap_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml b/target/executable/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..a67dcc13 --- /dev/null +++ b/target/executable/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml @@ -0,0 +1,284 @@ +name: "from_cellranger_multi_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input folder. Must contain the output from a cellranger multi run." + info: null + example: + - "input_dir_containing_modalities" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Locations for the output files. Must contain a wildcard (*) character,\n\ + which will be replaced with the sample name.\n" + info: null + example: + - "*.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sample_csv" + description: "CSV file describing the sample name per output file" + info: null + example: + - "samples.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_cellranger" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts the output from cellranger multi to a single .h5mu file.\n\ + By default, will map the following library type names to modality names:\n - Gene\ + \ Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n \ + \ - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing\ + \ Capture: hashing\n \nOther library types have their whitepace removed and dashes\ + \ replaced by\nunderscores to generate the modality name.\n\nCurrently does not\ + \ allow parsing the output from cell barcode demultiplexing.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "10x_5k_anticmv" +- type: "file" + path: "10x_5k_lung_crispr" +- type: "file" + path: "10x_5k_beam" +- type: "file" + path: "10x_5k_fixed" +- type: "file" + path: "10x_4plex_dtc" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scirpy~=0.12.0" + - "pandas~=2.2.3" + - "pytest" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_cellranger_multi_to_h5mu" + executable: "target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu b/target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu new file mode 100755 index 00000000..9d40036a --- /dev/null +++ b/target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu @@ -0,0 +1,1680 @@ +#!/usr/bin/env bash + +# from_cellranger_multi_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_cellranger_multi_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="from_cellranger_multi_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "scirpy~=0.12.0" "pandas~=2.2.3" "pytest" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component convert from_cellranger_multi_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_cellranger_multi_to_h5mu main" + echo "" + echo "Converts the output from cellranger multi to a single .h5mu file." + echo "By default, will map the following library type names to modality names:" + echo " - Gene Expression: rna" + echo " - Peaks: atac" + echo " - Antibody Capture: prot" + echo " - VDJ: vdj" + echo " - VDJ-T: vdj_t" + echo " - VDJ-B: vdj_b" + echo " - CRISPR Guide Capture: crispr" + echo " - Multiplexing Capture: hashing" + echo "Other library types have their whitepace removed and dashes replaced by" + echo "underscores to generate the modality name." + echo "" + echo "Currently does not allow parsing the output from cell barcode demultiplexing." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input_dir_containing_modalities" + echo " Input folder. Must contain the output from a cellranger multi run." + echo "" + echo " -o, --output" + echo " type: file, multiple values allowed, output, file must exist" + echo " example: *.h5mu" + echo " Locations for the output files. Must contain a wildcard (*) character," + echo " which will be replaced with the sample name." + echo "" + echo " --sample_csv" + echo " type: file, output, file must exist" + echo " example: samples.csv" + echo " CSV file describing the sample name per output file" + echo "" + echo " --uns_metrics" + echo " type: string" + echo " default: metrics_cellranger" + echo " Name of the .uns slot under which to QC metrics (if any)." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_cellranger_multi_to_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_csv) + [ -n "$VIASH_PAR_SAMPLE_CSV" ] && ViashError Bad arguments for option \'--sample_csv\': \'$VIASH_PAR_SAMPLE_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_CSV="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_csv. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_csv=*) + [ -n "$VIASH_PAR_SAMPLE_CSV" ] && ViashError Bad arguments for option \'--sample_csv=*\': \'$VIASH_PAR_SAMPLE_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_CSV=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_metrics) + [ -n "$VIASH_PAR_UNS_METRICS" ] && ViashError Bad arguments for option \'--uns_metrics\': \'$VIASH_PAR_UNS_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_METRICS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_metrics=*) + [ -n "$VIASH_PAR_UNS_METRICS" ] && ViashError Bad arguments for option \'--uns_metrics=*\': \'$VIASH_PAR_UNS_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_METRICS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_cellranger_multi_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_UNS_METRICS+x} ]; then + VIASH_PAR_UNS_METRICS="metrics_cellranger" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_OUTPUT" ]]; then + if ! [[ "$VIASH_PAR_OUTPUT" =~ \* ]]; then + ViashError '--output' has to be a path containing a wildcard, e.g. 'output_*.txt'. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_SAMPLE_CSV" ] && [ ! -d "$(dirname "$VIASH_PAR_SAMPLE_CSV")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_SAMPLE_CSV")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_TEST_OUTPUT=() + IFS=';' + for var in $VIASH_PAR_OUTPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_OUTPUT+=( "$var" ) + VIASH_CHOWN_VARS+=( "$var" ) + done + VIASH_PAR_OUTPUT=$(IFS=';' ; echo "${VIASH_TEST_OUTPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_CSV" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_CSV")" ) + VIASH_PAR_SAMPLE_CSV=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_CSV") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_SAMPLE_CSV" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_cellranger_multi_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from pathlib import Path +import sys +import scanpy +import pandas as pd +import mudata +import numpy as np +from scirpy.io import read_10x_vdj +from collections import defaultdict +from functools import partial +import json +import csv +import tempfile + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sample_csv': $( if [ ! -z ${VIASH_PAR_SAMPLE_CSV+x} ]; then echo "r'${VIASH_PAR_SAMPLE_CSV//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_metrics': $( if [ ! -z ${VIASH_PAR_UNS_METRICS+x} ]; then echo "r'${VIASH_PAR_UNS_METRICS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +POSSIBLE_LIBRARY_TYPES = ( + "vdj_t", + "vdj_b", + "vdj_t_gd", + "count", + "antigen_analysis", + "multiplexing_analysis", +) + +FEATURE_TYPES_NAMES = { + "Gene Expression": "rna", + "Peaks": "atac", + "Antibody Capture": "prot", + "VDJ": "vdj", + "VDJ-T": "vdj_t", + "VDJ-B": "vdj_b", + "CRISPR Guide Capture": "gdo", + "Multiplexing Capture": "hto", + "Antigen Capture": "antigen", + "Custom": "custom", +} + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. \`IntegerArray\` and \`BooleanArray\` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def gather_input_data(dir: Path): + # / + # +-- multi + # | +-- count (raw output) + # | | +-- feature_reference.csv + # | | +-- raw_feature_bc_matrix.h5 + # | +-- vdj_t + # | | +-- all_contig_annotations.json + # | +-- vdj_b + # | | +-- all_contig_annotations.json + # | +-- vdj_t_gd + # | | +-- all_contig_annotations.json + # | +-- multiplexing_analysis + # | +-- cells_per_tag.json + # +-- per_sample_outs (filtered outputs) + # +-- example_1 + # +-- antigen_analysis + # | +-- per_barcode.csv + # | +-- antigen_specificity_scores.csv + # +-- count + # | +-- antibody_analysis + # | +-- crispr_analysis + # | +-- perturbation_efficiencies_by_feature.csv + # | +-- perturbation_efficiencies_by_target.csv + # +-- vdj_t (unused) + # +-- vdj_b (unused) + # +-- vdj_t_gd (unused) + # +-- metrics_summary.csv + + if not dir.is_dir(): + raise ValueError("Specified input is not a directory.") + folder_contents = list(dir.iterdir()) + config = dir / "config.csv" + if config not in folder_contents: + logger.warning( + "Config.csv not found in input directory, this folder might not be a valid cellranger multi output." + ) + + required_subfolders = [ + dir / subfolder_name for subfolder_name in ("multi", "per_sample_outs") + ] + found_input = {key_: {} for key_ in POSSIBLE_LIBRARY_TYPES} + for required_subfolder in required_subfolders: + if required_subfolder not in folder_contents: + raise ValueError( + f"Input folder must contain the subfolder {required_subfolder} please make " + "sure that the specified input folder is a valid cellranger multi output." + ) + + multi_dir = dir / "multi" + for library_type in multi_dir.iterdir(): + if not library_type.is_dir(): + logger.warning( + "%s is not a directory. Contents of the multi folder " + "must be directories to be recognized as valid input data", + library_type, + ) + continue + if library_type.name not in POSSIBLE_LIBRARY_TYPES: + raise ValueError( + f"Contents of the 'multi' folder must be found one of the following: {','.join(POSSIBLE_LIBRARY_TYPES)}." + ) + + found_input[library_type.name] = library_type + + per_sample_outs_dir = dir / "per_sample_outs" + samples_dirs = [ + samplepath + for samplepath in per_sample_outs_dir.iterdir() + if samplepath.is_dir() + ] + for samples_dir in samples_dirs: + for file_part in ( + "metrics_summary.csv", + "count/feature_reference.csv", + "count/crispr_analysis/perturbation_efficiencies_by_feature.csv", + "count/crispr_analysis/perturbation_efficiencies_by_target.csv", + "antigen_analysis", + ): + found_file = samples_dir / file_part + if found_file.exists(): + file_name = found_file.name.removesuffix(".csv") + found_input.setdefault(file_name, {})[samples_dir.name] = found_file + + return found_input + + +def proces_perturbation( + key_name: str, mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample_name, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample_name] + assert "gdo" in mudata_obj.mod + eff_df = pd.read_csv( + efficiency_file, + index_col="Perturbation", + sep=",", + decimal=".", + quotechar='"', + ) + mudata_obj.mod["gdo"].uns[key_name] = eff_df + return mudatas + + +def process_feature_reference( + mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample] + df = pd.read_csv( + efficiency_file, index_col="id", sep=",", decimal=".", quotechar='"' + ) + assert "feature_type" in df.columns, ( + "Columns 'feature_type' should be present in features_reference file." + ) + feature_types = df["feature_type"] + missing_features = set(feature_types) - set(FEATURE_TYPES_NAMES) + if missing_features: + raise ValueError( + "Not all feature types present in the features_reference file are supported by this component.\\n" + f"Missing support for features: {','.join(missing_features)}." + ) + for feature_type in feature_types: + modality = FEATURE_TYPES_NAMES[feature_type] + subset_df = df.loc[df["feature_type"] == feature_type] + mudata_obj.mod[modality].uns["feature_reference"] = subset_df + return mudatas + + +def process_counts(counts_folder: Path, multiplexing_info, metrics_files): + counts_matrix_file = counts_folder / "raw_feature_bc_matrix.h5" + logger.info("Reading %s.", counts_matrix_file) + adata = scanpy.read_10x_h5(counts_matrix_file, gex_only=False) + + # set the gene ids as var_names + logger.info("Renaming var columns") + adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + + # generate output + logger.info("Convert to mudata") + + def modality_name_factory(library_type): + return ("".join(library_type.replace("-", "_").split())).lower() + + feature_types = defaultdict(modality_name_factory, FEATURE_TYPES_NAMES) + mudata_all_samples = mudata.MuData(adata, feature_types_names=feature_types) + if multiplexing_info: + # Get the mapping between the barcode and the sample ID from one of the metrics files + metrics_file = pd.read_csv( + list(metrics_files.values())[0], decimal=".", quotechar='"', thousands="," + ) + sample_ids = metrics_file[metrics_file["Metric Name"] == "Sample ID"] + barcode_sample_mapping = ( + sample_ids.loc[:, ["Group Name", "Metric Value"]] + .set_index("Group Name") + .squeeze() + .to_dict() + ) + # When probe barcodes are combined with multiplexing barcodes; the probe barcode (BC) + # is paired with other sample with other sample information using the syntax "{BC}+{antibody}" + # (or even "{BC}+{antibody}+{crispr guide}"). The cells_per_tag + # file only encodes the BCs; so we need to strip the other information. + barcode_sample_mapping = { + key_.partition("+")[0]: value_ + for key_, value_ in barcode_sample_mapping.items() + } + return split_samples( + mudata_all_samples, multiplexing_info, barcode_sample_mapping + ) + return {"run": mudata_all_samples} + + +def split_samples(mudata_obj, multiplexing_analysis_folder, barcode_sample_mapping): + result = {} + cells_per_tag_file = multiplexing_analysis_folder / "cells_per_tag.json" + with cells_per_tag_file.open("r") as open_json: + sample_cell_mapping = json.load(open_json) + + for barcode, indices in sample_cell_mapping.items(): + if indices: + sample_mudata = mudata_obj[indices] + sample = barcode_sample_mapping[barcode] + result[sample] = sample_mudata.copy() + return result + + +def process_metrics_summary( + mudatas: dict[str, mudata.MuData], metrics_files: dict[str, Path] +): + def read_percentage(val): + try: + if str(val).endswith("%"): + return float(val.strip("%")) / 100 + else: + return val + except (AttributeError, ValueError): + return val + + for sample, mudata_obj in mudatas.items(): + metrics_file = metrics_files[sample] + metrics_summary = pd.read_csv( + metrics_file, decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + mudata_obj.uns[par["uns_metrics"]] = metrics_summary + for colname, coldata in metrics_summary.items(): + try: + new_column = coldata.astype(str, copy=True).astype( + {colname: "category"} + ) + metrics_summary[colname] = new_column + except (ValueError, TypeError): + logger.warning(f"Could not store column {colname} from metrics.") + pass + return mudatas + + +def process_antigen_analysis( + mudatas: dict[str, mudata.MuData], antigen_analysis_folder_paths: dict[str, Path] +): + for sample_id, mudata_obj in mudatas.items(): + antigen_analysis_folder_path = antigen_analysis_folder_paths[sample_id] + assert "antigen" in mudata_obj.mod + per_barcodes_file = antigen_analysis_folder_path / "per_barcode.csv" + assert per_barcodes_file.is_file(), ( + "Expected a per_barcode.csv file to be present." + ) + per_barcodes_df = pd.read_csv( + per_barcodes_file, index_col="barcode", sep=",", decimal=".", quotechar='"' + ) + is_gex_cell = per_barcodes_df["is_gex_cell"] + assert len(set(is_gex_cell.unique().tolist()) - set([False, True])) == 0, ( + "Expected 'is_gex_cell' column to be boolean. Please report this as a bug." + ) + barcodes_in_gex = per_barcodes_df[is_gex_cell] + # All of the barcodes listed in the per_barcode.csv with is_gex_cell set to 'True' + # must be in the 'rna' (an thus also 'antigen') modality + assert barcodes_in_gex.index.difference(mudata_obj["rna"].obs_names).empty + orig_obs_names = mudata_obj["antigen"].obs_names.copy() + mudata_obj["antigen"].obs = cast_to_writeable_dtype( + pd.concat( + [mudata_obj["antigen"].obs, barcodes_in_gex], + axis="columns", + join="outer", + verify_integrity=True, + sort=False, + ) + ) + assert orig_obs_names.equals(mudata_obj["antigen"].obs_names) + del orig_obs_names + + # The antigen_specificity_scores.csv file is only present when cellranger + # multi was run with a [antigen-specificity] section in config + specificity_file = ( + antigen_analysis_folder_path / "antigen_specificity_scores.csv" + ) + if specificity_file.is_file(): + antigen_scores_df = pd.read_csv( + specificity_file, + index_col=["barcode", "antigen"], + sep=",", + decimal=".", + quotechar='"', + ) + score = antigen_scores_df.unstack() + assert score.index.difference(mudata_obj["rna"].obs_names).empty + antigens = score.columns.unique(level="antigen") + for antigen in antigens: + score_antigen = score.loc[:, (slice(None), antigen)].droplevel( + "antigen", axis=1 + ) + score_antigen = score_antigen.reindex(mudata_obj["rna"].obs_names) + mudata_obj["antigen"].obsm[f"antigen_specificity_scores_{antigen}"] = ( + cast_to_writeable_dtype(score_antigen) + ) + return mudatas + + +def process_vdj(mudatas: dict[str, mudata.MuData], vdj_folder_path: str): + # https://scverse.org/scirpy/latest/generated/scirpy.io.read_10x_vdj.html#scirpy-io-read-10x-vdj + # According to docs, using the json is preferred as this file includes intron info. + all_config_json_file = vdj_folder_path / "all_contig_annotations.json" + vdj_type = vdj_folder_path.name + with all_config_json_file.open("r") as open_json: + json_obj = json.load(open_json) + for _, mudata_obj in mudatas.items(): + json_for_sample = [ + entry for entry in json_obj if entry["barcode"] in mudata_obj.obs_names + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as tfile: + json.dump(json_for_sample, tfile, indent=4) + tfile.flush() + vdj_anndata = read_10x_vdj(tfile.name) + mudata_obj.mod[vdj_type] = vdj_anndata + return mudatas + + +def get_modalities(input_data): + dispatcher = { + "multiplexing_analysis": split_samples, + "vdj_t": process_vdj, + "vdj_b": process_vdj, + "vdj_t_gd": process_vdj, + "metrics_summary": process_metrics_summary, + "feature_reference": process_feature_reference, + "perturbation_efficiencies_by_feature": partial( + proces_perturbation, "perturbation_efficiencies_by_feature" + ), + "perturbation_efficiencies_by_target": partial( + proces_perturbation, "perturbation_efficiencies_by_target" + ), + "antigen_analysis": process_antigen_analysis, + } + mudata_per_sample = process_counts( + input_data["count"], + input_data["multiplexing_analysis"], + input_data["metrics_summary"], + ) + for modality_name, modality_data_path in input_data.items(): + if ( + modality_name in ("count", "multiplexing_analysis") + or not modality_data_path + ): + continue + try: + parser_function = dispatcher[modality_name] + except KeyError as e: + raise ValueError( + "This component does not support the " + f"parsing of the '{modality_name}' yet." + ) from e + mudata_per_sample = parser_function(mudata_per_sample, modality_data_path) + return mudata_per_sample + + +def main(): + cellranger_multi_dir = Path(par["input"]) + # TODO: remove when issue https://github.com/viash-io/viash/issues/706 is resolved. + if isinstance(par["output"], (list, set, tuple)): + assert len(par["output"]) == 1, ( + "A single output file template should have been provided." + ) + par["output"] = par["output"][0] + assert par["output"].count("*") == 1, ( + f"Expected exactly one wildcard character (*) in output " + f"files template ({par['output']}). Found {par['output'].count('*')}" + ) + input_data = gather_input_data(cellranger_multi_dir) + result = get_modalities(input_data) + output_files = { + par["output"].replace("*", sample_name) for sample_name in result.keys() + } + assert len(output_files) == len(result.keys()), ( + "Replacing the wildcard in the output files " + "template did not produce unique file paths." + ) + logger.info( + "Writing output for samples: '%s' to '%s'", + "".join(result.keys()), + par["output"], + ) + with Path(par["sample_csv"]).open("w", newline="") as open_csv: + csvwriter = csv.DictWriter(open_csv, fieldnames=["sample_name", "file"]) + csvwriter.writeheader() + for sample_name, mudata_obj in result.items(): + output_file = Path(par["output"].replace("*", sample_name)) + mudata_obj.write_h5mu(output_file, compression=par["output_compression"]) + csvwriter.writerow({"sample_name": sample_name, "file": output_file.name}) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_CSV" ]; then + VIASH_PAR_SAMPLE_CSV=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_CSV") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && ! compgen -G "$VIASH_PAR_OUTPUT" > /dev/null; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_CSV" ] && [ ! -e "$VIASH_PAR_SAMPLE_CSV" ]; then + ViashError "Output file '$VIASH_PAR_SAMPLE_CSV' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config b/target/executable/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_cellranger_multi_to_h5mu/setup_logger.py b/target/executable/convert/from_cellranger_multi_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_cellranger_multi_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_h5ad_to_h5mu/.config.vsh.yaml b/target/executable/convert/from_h5ad_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..9ec01892 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_h5mu/.config.vsh.yaml @@ -0,0 +1,261 @@ +name: "from_h5ad_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad files" + info: null + default: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of names to use for the modalities. Will be used as the keys\ + \ in the .mod attribute in the output MuData object\nThe number of items provided\ + \ for this argument equal the number of input files (--input) and their order\ + \ should match.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output MuData file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a single layer h5ad file into a single MuData object\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5ad_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5ad_to_h5mu" + executable: "target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu b/target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu new file mode 100755 index 00000000..1325b6e2 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu @@ -0,0 +1,1254 @@ +#!/usr/bin/env bash + +# from_h5ad_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5ad_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="from_h5ad_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5ad_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:03Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5ad_to_h5mu main" + echo "" + echo "Converts a single layer h5ad file into a single MuData object" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " default: input.h5ad" + echo " Input h5ad files" + echo "" + echo " --modality" + echo " type: string, multiple values allowed" + echo " default: rna" + echo " List of names to use for the modalities. Will be used as the keys in the" + echo " .mod attribute in the output MuData object" + echo " The number of items provided for this argument equal the number of input" + echo " files (--input) and their order should match." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " default: output.h5mu" + echo " Output MuData file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5ad_to_h5mu main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY="$2" + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5ad_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="output.h5mu" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5ad_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import anndata +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +assert len(par["input"]) == len(par["modality"]), ( + "Number of input files should be the same length as the number of modalities" +) + +logger.info("Reading input files") +data = { + key: anndata.read_h5ad(path) for key, path in zip(par["modality"], par["input"]) +} + +logger.info("Converting to mudata") +mudata = mu.MuData(data) + +try: + mudata.var_names_make_unique() +except (TypeError, ValueError): + pass + +logger.info("Writing to %s.", par["output"]) +mudata.write_h5mu(par["output"], compression=par["output_compression"]) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5ad_to_h5mu/nextflow_labels.config b/target/executable/convert/from_h5ad_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_h5ad_to_h5mu/setup_logger.py b/target/executable/convert/from_h5ad_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_h5ad_to_seurat/.config.vsh.yaml b/target/executable/convert/from_h5ad_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..2cb22e49 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_seurat/.config.vsh.yaml @@ -0,0 +1,242 @@ +name: "from_h5ad_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad file" + info: null + example: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--assay" + description: "Name of the assay to be created." + info: null + default: + - "RNA" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5ad file into a Seurat file.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + interactive: false + - type: "r" + cran: + - "hdf5r" + - "Seurat" + - "SeuratObject" + github: + - "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5ad_to_seurat/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5ad_to_seurat" + executable: "target/executable/convert/from_h5ad_to_seurat/from_h5ad_to_seurat" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5ad_to_seurat/from_h5ad_to_seurat b/target/executable/convert/from_h5ad_to_seurat/from_h5ad_to_seurat new file mode 100755 index 00000000..6f916cb1 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_seurat/from_h5ad_to_seurat @@ -0,0 +1,1169 @@ +#!/usr/bin/env bash + +# from_h5ad_to_seurat main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5ad_to_seurat" +VIASH_META_FUNCTIONALITY_NAME="from_h5ad_to_seurat" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:24.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev libgeos-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("hdf5r", "Seurat", "SeuratObject"), repos = "https://cran.rstudio.com")' && \ + Rscript -e 'options(warn = 2); remotes::install_github(c("scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5ad_to_seurat" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5ad_to_seurat main" + echo "" + echo "Converts an h5ad file into a Seurat file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5ad" + echo " Input h5ad file" + echo "" + echo " --assay" + echo " type: string" + echo " default: RNA" + echo " Name of the assay to be created." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.rds" + echo " Output Seurat file" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5ad_to_seurat main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --assay) + [ -n "$VIASH_PAR_ASSAY" ] && ViashError Bad arguments for option \'--assay\': \'$VIASH_PAR_ASSAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASSAY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --assay. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --assay=*) + [ -n "$VIASH_PAR_ASSAY" ] && ViashError Bad arguments for option \'--assay=*\': \'$VIASH_PAR_ASSAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASSAY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5ad_to_seurat:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_ASSAY+x} ]; then + VIASH_PAR_ASSAY="RNA" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5ad_to_seurat-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(anndataR) + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "assay" = $( if [ ! -z ${VIASH_PAR_ASSAY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ASSAY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + + +seurat_obj <- read_h5ad( + par\$input, + mode = "r", + as = "Seurat", + assay_name = par\$assay +) + +saveRDS(seurat_obj, file = par\$output) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5ad_to_seurat/nextflow_labels.config b/target/executable/convert/from_h5ad_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5ad_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..1b04ffe6 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml @@ -0,0 +1,251 @@ +name: "from_h5mu_or_h5ad_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad or h5mu file" + info: null + example: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Modality to be converted if the input file is an h5mu file." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--assay" + description: "Name of the assay to be created." + info: null + default: + - "RNA" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5ad file or a single modality of an h5mu file into a Seurat\ + \ file.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + - "hdf5-tools" + interactive: false + - type: "r" + cran: + - "anndata" + - "hdf5r" + - "Seurat" + - "SeuratObject" + github: + - "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5mu_or_h5ad_to_seurat" + executable: "target/executable/convert/from_h5mu_or_h5ad_to_seurat/from_h5mu_or_h5ad_to_seurat" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_seurat/from_h5mu_or_h5ad_to_seurat b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/from_h5mu_or_h5ad_to_seurat new file mode 100755 index 00000000..4437b852 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/from_h5mu_or_h5ad_to_seurat @@ -0,0 +1,1250 @@ +#!/usr/bin/env bash + +# from_h5mu_or_h5ad_to_seurat main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5mu_or_h5ad_to_seurat" +VIASH_META_FUNCTIONALITY_NAME="from_h5mu_or_h5ad_to_seurat" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:22.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev libgeos-dev hdf5-tools && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("anndata", "hdf5r", "Seurat", "SeuratObject"), repos = "https://cran.rstudio.com")' && \ + Rscript -e 'options(warn = 2); remotes::install_github(c("scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5mu_or_h5ad_to_seurat" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5mu_or_h5ad_to_seurat main" + echo "" + echo "Converts an h5ad file or a single modality of an h5mu file into a Seurat file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5ad" + echo " Input h5ad or h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Modality to be converted if the input file is an h5mu file." + echo "" + echo " --assay" + echo " type: string" + echo " default: RNA" + echo " Name of the assay to be created." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.rds" + echo " Output Seurat file" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5mu_or_h5ad_to_seurat main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --assay) + [ -n "$VIASH_PAR_ASSAY" ] && ViashError Bad arguments for option \'--assay\': \'$VIASH_PAR_ASSAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASSAY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --assay. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --assay=*) + [ -n "$VIASH_PAR_ASSAY" ] && ViashError Bad arguments for option \'--assay=*\': \'$VIASH_PAR_ASSAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASSAY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5mu_or_h5ad_to_seurat:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_ASSAY+x} ]; then + VIASH_PAR_ASSAY="RNA" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5mu_or_h5ad_to_seurat-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(anndataR) +library(hdf5r) + + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "assay" = $( if [ ! -z ${VIASH_PAR_ASSAY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ASSAY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + +is_mudata_file <- function(file_path) { + con <- file(file_path, "rb") + tryCatch({ + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + header <- readBin(con, "raw", n = 6) + identical(header, charToRaw("MuData")) + }, finally = { + close(con) + }) +} + +h5mu_to_h5ad <- function(h5mu_path, modality_name) { + tmp_path <- tempfile(fileext = ".h5ad") + mod_location <- paste("mod", modality_name, sep = "/") + h5src <- hdf5r::H5File\$new(h5mu_path, "r") + h5dest <- hdf5r::H5File\$new(tmp_path, "w") + # Copy over the child objects and the child attributes from root + # Root cannot be copied directly because it always exists and + # copying does not allow overwriting. + children <- hdf5r::list.objects(h5src, + path = mod_location, + full.names = FALSE, recursive = FALSE + ) + for (child in children) { + h5dest\$obj_copy_from( + h5src, paste(mod_location, child, sep = "/"), + paste0("/", child) + ) + } + # Also copy the root attributes + root_attrs <- hdf5r::h5attr_names(x = h5src) + for (attr in root_attrs) { + h5a <- h5src\$attr_open(attr_name = attr) + robj <- h5a\$read() + h5dest\$create_attr_by_name( + attr_name = attr, + obj_name = ".", + robj = robj, + space = h5a\$get_space(), + dtype = h5a\$get_type() + ) + } + h5src\$close() + h5dest\$close() + + tmp_path +} + +# If the input is a MuData file, use a single modality as input instead +if (is_mudata_file(par\$input)) { + if (is.null(par\$modality) || par\$modality == "") { + stop("'modality' argument must be set if the input is a MuData file.") + } + h5ad_path <- h5mu_to_h5ad(par\$input, par\$modality) +} else { + h5ad_path <- par\$input +} + +seurat_obj <- read_h5ad( + h5ad_path, + mode = "r", + as = "Seurat", + assay_name = par\$assay +) + +saveRDS(seurat_obj, file = par\$output) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml new file mode 100644 index 00000000..3c76f836 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "from_h5mu_or_h5ad_to_tiledb" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Input AnnData or MuData file. When an AnnData file is provided,\ + \ it is automatically assumed to \ncontain transcriptome counts.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA modality" + arguments: + - type: "string" + name: "--rna_modality" + description: "The name used for the RNA modality. Used when input file is a MuData\ + \ object.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_raw_layer_input" + description: "Location of the layer containing the raw transcriptome counts. Layers\ + \ are looked for in .layers,\nexcept when using the value 'X'; in which case\ + \ .X is used.\n" + info: null + example: + - "X" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_normalized_layer_input" + description: "Location of the layer containing the normalized counts. Layers are\ + \ looked for in .layers,\nexcept when using the value 'X'; in which case .X\ + \ is used.\n" + info: null + example: + - "log_normalized" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_gene_names_input" + description: "Column in .var that provides the gene names. If not specified, the\ + \ index from the input is used.\n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Protein modality" + arguments: + - type: "string" + name: "--prot_modality" + description: "The name used for the protein modality. Used when input file is\ + \ a MuData object.\nWhen not specified, the protein modality will not be processed.\n" + info: null + example: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_raw_layer_input" + description: "Location of the layer containing the raw protein counts. Layers\ + \ are looked for in .layers,\nexcept when using the value 'X'; in which case\ + \ .X is used.\n" + info: null + example: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_normalized_layer_input" + description: "Location of the layer containing the normalized counts. Layers are\ + \ looked for in .layers,\nexcept when using the value 'X'; in which case .X\ + \ is used.\n" + info: null + example: + - "clr" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output slots" + arguments: + - type: "string" + name: "--rna_modality_output" + description: "TileDB Measurement name where the RNA modality will be stored.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_modality_output" + description: "Name of the TileDB Measurement where the protein modality will be\ + \ stored.\n" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_index_name_output" + description: "Name of the index that is used to describe the cells (observations).\n" + info: null + default: + - "cell_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_index_name_output" + description: "Output name of the index that is used to describe the genes.\n" + info: null + default: + - "rna_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_raw_layer_output" + description: "Output location for the raw transcriptomics counts.\n" + info: null + default: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_normalized_layer_output" + description: "Output location for the normalized RNA counts.\n" + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_gene_names_output" + description: "Name of the .var column that specifies the gene games.\n" + info: null + default: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_var_index_name_output" + description: "Output name of the index that is used to describe the proteins.\ + \ \n" + info: null + default: + - "prot_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_raw_layer_output" + description: "Output location for the raw protein counts.\n" + info: null + default: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_normalized_layer_output" + description: "Output location for the normalized protein counts.\n" + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--tiledb_dir" + description: "Directory where the TileDB output will be written to.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert a MuData or AnnData object to tiledb. Currently, transcriptome\ + \ and protein modalities are supported.\n\nNOTE: The functionality provided by this\ + \ component is experimental and may be subject to change. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "tiledbsoma" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5mu_or_h5ad_to_tiledb" + executable: "target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb new file mode 100755 index 00000000..43f6b314 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb @@ -0,0 +1,2139 @@ +#!/usr/bin/env bash + +# from_h5mu_or_h5ad_to_tiledb main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5mu_or_h5ad_to_tiledb" +VIASH_META_FUNCTIONALITY_NAME="from_h5mu_or_h5ad_to_tiledb" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "tiledbsoma" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5mu_or_h5ad_to_tiledb" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5mu_or_h5ad_to_tiledb main" + echo "" + echo "Convert a MuData or AnnData object to tiledb. Currently, transcriptome and" + echo "protein modalities are supported." + echo "" + echo "NOTE: The functionality provided by this component is experimental and may be" + echo "subject to change." + echo "" + echo "Input:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input AnnData or MuData file. When an AnnData file is provided, it is" + echo " automatically assumed to" + echo " contain transcriptome counts." + echo "" + echo "RNA modality:" + echo " --rna_modality" + echo " type: string" + echo " default: rna" + echo " The name used for the RNA modality. Used when input file is a MuData" + echo " object." + echo "" + echo " --rna_raw_layer_input" + echo " type: string, required parameter" + echo " example: X" + echo " Location of the layer containing the raw transcriptome counts. Layers" + echo " are looked for in .layers," + echo " except when using the value 'X'; in which case .X is used." + echo "" + echo " --rna_normalized_layer_input" + echo " type: string, required parameter" + echo " example: log_normalized" + echo " Location of the layer containing the normalized counts. Layers are" + echo " looked for in .layers," + echo " except when using the value 'X'; in which case .X is used." + echo "" + echo " --rna_var_gene_names_input" + echo " type: string" + echo " example: gene_symbol" + echo " Column in .var that provides the gene names. If not specified, the index" + echo " from the input is used." + echo "" + echo "Protein modality:" + echo " --prot_modality" + echo " type: string" + echo " example: prot" + echo " The name used for the protein modality. Used when input file is a MuData" + echo " object." + echo " When not specified, the protein modality will not be processed." + echo "" + echo " --prot_raw_layer_input" + echo " type: string" + echo " example: X" + echo " Location of the layer containing the raw protein counts. Layers are" + echo " looked for in .layers," + echo " except when using the value 'X'; in which case .X is used." + echo "" + echo " --prot_normalized_layer_input" + echo " type: string" + echo " example: clr" + echo " Location of the layer containing the normalized counts. Layers are" + echo " looked for in .layers," + echo " except when using the value 'X'; in which case .X is used." + echo "" + echo "Output slots:" + echo " --rna_modality_output" + echo " type: string" + echo " default: rna" + echo " TileDB Measurement name where the RNA modality will be stored." + echo "" + echo " --prot_modality_output" + echo " type: string" + echo " default: prot" + echo " Name of the TileDB Measurement where the protein modality will be" + echo " stored." + echo "" + echo " --obs_index_name_output" + echo " type: string" + echo " default: cell_id" + echo " Name of the index that is used to describe the cells (observations)." + echo "" + echo " --rna_var_index_name_output" + echo " type: string" + echo " default: rna_index" + echo " Output name of the index that is used to describe the genes." + echo "" + echo " --rna_raw_layer_output" + echo " type: string" + echo " default: X" + echo " Output location for the raw transcriptomics counts." + echo "" + echo " --rna_normalized_layer_output" + echo " type: string" + echo " default: log_normalized" + echo " Output location for the normalized RNA counts." + echo "" + echo " --rna_var_gene_names_output" + echo " type: string" + echo " default: gene_symbol" + echo " Name of the .var column that specifies the gene games." + echo "" + echo " --prot_var_index_name_output" + echo " type: string" + echo " default: prot_index" + echo " Output name of the index that is used to describe the proteins." + echo "" + echo " --prot_raw_layer_output" + echo " type: string" + echo " default: X" + echo " Output location for the raw protein counts." + echo "" + echo " --prot_normalized_layer_output" + echo " type: string" + echo " default: log_normalized" + echo " Output location for the normalized protein counts." + echo "" + echo "Output arguments:" + echo " --tiledb_dir" + echo " type: file, output, file must exist" + echo " Directory where the TileDB output will be written to." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5mu_or_h5ad_to_tiledb main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_modality) + [ -n "$VIASH_PAR_RNA_MODALITY" ] && ViashError Bad arguments for option \'--rna_modality\': \'$VIASH_PAR_RNA_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_modality=*) + [ -n "$VIASH_PAR_RNA_MODALITY" ] && ViashError Bad arguments for option \'--rna_modality=*\': \'$VIASH_PAR_RNA_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_raw_layer_input) + [ -n "$VIASH_PAR_RNA_RAW_LAYER_INPUT" ] && ViashError Bad arguments for option \'--rna_raw_layer_input\': \'$VIASH_PAR_RNA_RAW_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_RAW_LAYER_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_raw_layer_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_raw_layer_input=*) + [ -n "$VIASH_PAR_RNA_RAW_LAYER_INPUT" ] && ViashError Bad arguments for option \'--rna_raw_layer_input=*\': \'$VIASH_PAR_RNA_RAW_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_RAW_LAYER_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_normalized_layer_input) + [ -n "$VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT" ] && ViashError Bad arguments for option \'--rna_normalized_layer_input\': \'$VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_normalized_layer_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_normalized_layer_input=*) + [ -n "$VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT" ] && ViashError Bad arguments for option \'--rna_normalized_layer_input=*\': \'$VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_var_gene_names_input) + [ -n "$VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT" ] && ViashError Bad arguments for option \'--rna_var_gene_names_input\': \'$VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_var_gene_names_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_var_gene_names_input=*) + [ -n "$VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT" ] && ViashError Bad arguments for option \'--rna_var_gene_names_input=*\': \'$VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_modality) + [ -n "$VIASH_PAR_PROT_MODALITY" ] && ViashError Bad arguments for option \'--prot_modality\': \'$VIASH_PAR_PROT_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_modality=*) + [ -n "$VIASH_PAR_PROT_MODALITY" ] && ViashError Bad arguments for option \'--prot_modality=*\': \'$VIASH_PAR_PROT_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_raw_layer_input) + [ -n "$VIASH_PAR_PROT_RAW_LAYER_INPUT" ] && ViashError Bad arguments for option \'--prot_raw_layer_input\': \'$VIASH_PAR_PROT_RAW_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_RAW_LAYER_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_raw_layer_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_raw_layer_input=*) + [ -n "$VIASH_PAR_PROT_RAW_LAYER_INPUT" ] && ViashError Bad arguments for option \'--prot_raw_layer_input=*\': \'$VIASH_PAR_PROT_RAW_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_RAW_LAYER_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_normalized_layer_input) + [ -n "$VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT" ] && ViashError Bad arguments for option \'--prot_normalized_layer_input\': \'$VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_normalized_layer_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_normalized_layer_input=*) + [ -n "$VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT" ] && ViashError Bad arguments for option \'--prot_normalized_layer_input=*\': \'$VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_modality_output) + [ -n "$VIASH_PAR_RNA_MODALITY_OUTPUT" ] && ViashError Bad arguments for option \'--rna_modality_output\': \'$VIASH_PAR_RNA_MODALITY_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_MODALITY_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_modality_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_modality_output=*) + [ -n "$VIASH_PAR_RNA_MODALITY_OUTPUT" ] && ViashError Bad arguments for option \'--rna_modality_output=*\': \'$VIASH_PAR_RNA_MODALITY_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_MODALITY_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_modality_output) + [ -n "$VIASH_PAR_PROT_MODALITY_OUTPUT" ] && ViashError Bad arguments for option \'--prot_modality_output\': \'$VIASH_PAR_PROT_MODALITY_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_MODALITY_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_modality_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_modality_output=*) + [ -n "$VIASH_PAR_PROT_MODALITY_OUTPUT" ] && ViashError Bad arguments for option \'--prot_modality_output=*\': \'$VIASH_PAR_PROT_MODALITY_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_MODALITY_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_index_name_output) + [ -n "$VIASH_PAR_OBS_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--obs_index_name_output\': \'$VIASH_PAR_OBS_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_INDEX_NAME_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_index_name_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_index_name_output=*) + [ -n "$VIASH_PAR_OBS_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--obs_index_name_output=*\': \'$VIASH_PAR_OBS_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_INDEX_NAME_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_var_index_name_output) + [ -n "$VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--rna_var_index_name_output\': \'$VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_var_index_name_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_var_index_name_output=*) + [ -n "$VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--rna_var_index_name_output=*\': \'$VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_raw_layer_output) + [ -n "$VIASH_PAR_RNA_RAW_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--rna_raw_layer_output\': \'$VIASH_PAR_RNA_RAW_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_RAW_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_raw_layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_raw_layer_output=*) + [ -n "$VIASH_PAR_RNA_RAW_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--rna_raw_layer_output=*\': \'$VIASH_PAR_RNA_RAW_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_RAW_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_normalized_layer_output) + [ -n "$VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--rna_normalized_layer_output\': \'$VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_normalized_layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_normalized_layer_output=*) + [ -n "$VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--rna_normalized_layer_output=*\': \'$VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rna_var_gene_names_output) + [ -n "$VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT" ] && ViashError Bad arguments for option \'--rna_var_gene_names_output\': \'$VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rna_var_gene_names_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rna_var_gene_names_output=*) + [ -n "$VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT" ] && ViashError Bad arguments for option \'--rna_var_gene_names_output=*\': \'$VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_var_index_name_output) + [ -n "$VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--prot_var_index_name_output\': \'$VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_var_index_name_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_var_index_name_output=*) + [ -n "$VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT" ] && ViashError Bad arguments for option \'--prot_var_index_name_output=*\': \'$VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_raw_layer_output) + [ -n "$VIASH_PAR_PROT_RAW_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--prot_raw_layer_output\': \'$VIASH_PAR_PROT_RAW_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_RAW_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_raw_layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_raw_layer_output=*) + [ -n "$VIASH_PAR_PROT_RAW_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--prot_raw_layer_output=*\': \'$VIASH_PAR_PROT_RAW_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_RAW_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prot_normalized_layer_output) + [ -n "$VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--prot_normalized_layer_output\': \'$VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prot_normalized_layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prot_normalized_layer_output=*) + [ -n "$VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--prot_normalized_layer_output=*\': \'$VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tiledb_dir) + [ -n "$VIASH_PAR_TILEDB_DIR" ] && ViashError Bad arguments for option \'--tiledb_dir\': \'$VIASH_PAR_TILEDB_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TILEDB_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tiledb_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tiledb_dir=*) + [ -n "$VIASH_PAR_TILEDB_DIR" ] && ViashError Bad arguments for option \'--tiledb_dir=*\': \'$VIASH_PAR_TILEDB_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TILEDB_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5mu_or_h5ad_to_tiledb:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_RNA_RAW_LAYER_INPUT+x} ]; then + ViashError '--rna_raw_layer_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT+x} ]; then + ViashError '--rna_normalized_layer_input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_RNA_MODALITY+x} ]; then + VIASH_PAR_RNA_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_RNA_MODALITY_OUTPUT+x} ]; then + VIASH_PAR_RNA_MODALITY_OUTPUT="rna" +fi +if [ -z ${VIASH_PAR_PROT_MODALITY_OUTPUT+x} ]; then + VIASH_PAR_PROT_MODALITY_OUTPUT="prot" +fi +if [ -z ${VIASH_PAR_OBS_INDEX_NAME_OUTPUT+x} ]; then + VIASH_PAR_OBS_INDEX_NAME_OUTPUT="cell_id" +fi +if [ -z ${VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT+x} ]; then + VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT="rna_index" +fi +if [ -z ${VIASH_PAR_RNA_RAW_LAYER_OUTPUT+x} ]; then + VIASH_PAR_RNA_RAW_LAYER_OUTPUT="X" +fi +if [ -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT+x} ]; then + VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT="log_normalized" +fi +if [ -z ${VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT+x} ]; then + VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT="gene_symbol" +fi +if [ -z ${VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT+x} ]; then + VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT="prot_index" +fi +if [ -z ${VIASH_PAR_PROT_RAW_LAYER_OUTPUT+x} ]; then + VIASH_PAR_PROT_RAW_LAYER_OUTPUT="X" +fi +if [ -z ${VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT+x} ]; then + VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT="log_normalized" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_TILEDB_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_TILEDB_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_TILEDB_DIR")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_TILEDB_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TILEDB_DIR")" ) + VIASH_PAR_TILEDB_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_TILEDB_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_TILEDB_DIR" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5mu_or_h5ad_to_tiledb-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import anndata +import numpy as np +import tiledbsoma + +# This import seems redundant, but it is required! +import tiledbsoma.io +import pandas as pd +from functools import singledispatch +from collections.abc import Mapping +import h5py +from shutil import copy2, rmtree +from collections.abc import Callable +from functools import partial +from pathlib import Path +from tempfile import NamedTemporaryFile +import pyarrow as pa +from collections import defaultdict +import json + +anndata.settings.allow_write_nullable_strings = True + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_modality': $( if [ ! -z ${VIASH_PAR_RNA_MODALITY+x} ]; then echo "r'${VIASH_PAR_RNA_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_raw_layer_input': $( if [ ! -z ${VIASH_PAR_RNA_RAW_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_RAW_LAYER_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_normalized_layer_input': $( if [ ! -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_var_gene_names_input': $( if [ ! -z ${VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_modality': $( if [ ! -z ${VIASH_PAR_PROT_MODALITY+x} ]; then echo "r'${VIASH_PAR_PROT_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_raw_layer_input': $( if [ ! -z ${VIASH_PAR_PROT_RAW_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_PROT_RAW_LAYER_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_normalized_layer_input': $( if [ ! -z ${VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_modality_output': $( if [ ! -z ${VIASH_PAR_RNA_MODALITY_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_MODALITY_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_modality_output': $( if [ ! -z ${VIASH_PAR_PROT_MODALITY_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_MODALITY_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_index_name_output': $( if [ ! -z ${VIASH_PAR_OBS_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBS_INDEX_NAME_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_var_index_name_output': $( if [ ! -z ${VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_raw_layer_output': $( if [ ! -z ${VIASH_PAR_RNA_RAW_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_RAW_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_normalized_layer_output': $( if [ ! -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rna_var_gene_names_output': $( if [ ! -z ${VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_var_index_name_output': $( if [ ! -z ${VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_raw_layer_output': $( if [ ! -z ${VIASH_PAR_PROT_RAW_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_RAW_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'prot_normalized_layer_output': $( if [ ! -z ${VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'tiledb_dir': $( if [ ! -z ${VIASH_PAR_TILEDB_DIR+x} ]; then echo "r'${VIASH_PAR_TILEDB_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +LAYER_ARGUMENTS = ( + "rna_raw_layer_input", + "rna_raw_layer_output", + "rna_normalized_layer_input", + "rna_normalized_layer_output", + "prot_raw_layer_input", + "prot_raw_layer_output", + "prot_normalized_layer_input", + "prot_normalized_layer_output", +) + + +multidim_column_indices = {} + + +def _convert_to_array(mod_name, _, group): + """ + tiledb SOMA does not support pandas dataframes in varm and obsm + Convert those to structured arrays and store column index as metadata + """ + entries = group.keys() + for entry in entries: + encoding = group[entry].attrs["encoding-type"] + if encoding in ("array", "csr_matrix", "csc_matrix"): + return + if encoding in { + "dataframe", + "nullable-integer", + "nullable-boolean", + "nullable-string-array", + }: + df = anndata.io.read_elem(group[entry]) + data = df.to_numpy() + anndata.io.write_elem(group, entry, data) + index_to_write = df.columns + # No need to store write the index to tileDB in the case its just the integer index + if not isinstance(index_to_write, pd.RangeIndex): + multidim_column_indices.setdefault(mod_name, {}).setdefault( + group.name.strip("/"), {} + ).update({entry: index_to_write.to_list()}) + return + logger.info( + "Deleting %s because it is of encoding %s, which is not supported.", + f"{group.name}/{entry}", + encoding, + ) + del group[entry] + + +def _read_obs(anndata_file): + with h5py.File(anndata_file, "r") as open_rna: + return _to_pandas_nullable(anndata.io.read_elem(open_rna["obs"])) + + +def _write_obs(anndata_file, obs_df): + with h5py.File(anndata_file, "r+") as open_h5: + anndata.io.write_elem(open_h5, "obs", obs_df) + + +def _log_arguments(function_obj, arg_dict): + """ + Format a dictionairy of arguments into a string that is put into the script logs. + """ + args_str = [f"\\t{param}: {param_val}\\n" for param, param_val in arg_dict.items()] + logger.info( + "Calling %s with arguments:\\n%s", + function_obj.__name__, + "".join(args_str).rstrip(), + ) + + +def _to_pandas_nullable(table_or_df: pa.Table | pd.DataFrame) -> pd.DataFrame: + """ + Convert pyarrow Table or pandas dataframe to pandas dataframe that uses nullable data types. + + This function takes the following into account: + 1. AnnData supports writing BooleanDtype, IntegerDtype and StringDtype (which uses pd.NA) + 2. AnnData does *not* support FloatingDtype + 3. AnnData does support numpy-stype floating and integer dtypes (which use np.nan). + 4. TileDB SOMA's \`update_obs\` does not allow casting floats to integers (potential information loss). + 5. When pandas's \`convert_dtypes\` convert_integer argument is set to True, + floats will be converted to integer if they can be 'faithfully' casted. + + Giving the previous constraints, it is *not* possible to only convert integers using \`convert_types\` because + it might cause some float columns to be converted to integers (which is incompatible with tiledbsoma). + Therefore, this function will leave integers and floats as-is. + + Based on the conversion provided by this function, a suitable na representation (pd.NA or np.NA) for a column can + be chosen using \`pd.api.types.is_extension_array_dtype(column_dtype)\` + """ + try: + df = table_or_df.to_pandas( + types_mapper=defaultdict(None, {pa.large_string(): pd.StringDtype()}).get + ) + except AttributeError: + df = table_or_df.convert_dtypes( + infer_objects=False, + convert_string=True, + convert_integer=False, + convert_boolean=False, + convert_floating=False, + ) + return df.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_integer=False, + convert_boolean=True, + convert_floating=False, + ) + + +def _get_temp_h5ad(): + return Path(NamedTemporaryFile(suffix=".h5ad", dir=meta["temp_dir"]).name) + + +def _h5mu_to_h5ad(h5mu_path, modality_name): + """ + Create an anndata file from a modality in a mudata file by + copying over the relevant h5 objects. + """ + result = _get_temp_h5ad() + with h5py.File(h5mu_path, "r") as open_anndata: + with h5py.File(result, "w") as file_to_write: + for h5_key in open_anndata["mod"][modality_name].keys(): + open_anndata.copy( + source=f"/mod/{modality_name}/{h5_key}", dest=file_to_write + ) + return result + + +def _handle_layer_location(layer_name): + """ + Handle the special case where 'X' is specified as layer + this layer is not present in \`/layers/X\` but in \`/X\`. + """ + prefix = "layers/" if layer_name != "X" else "" + return f"{prefix}{layer_name}" + + +def _check_input_args(par, input, is_mudata): + """ + Check the input arguments: + * Arguments for protein modalities are not allowed when the input is not a MuData file. + * Arguments for input layers may not point to the same layer + * Output locations for layers may not be the same + * Input modalities must not be the same + * Input modalities must exist + """ + if not is_mudata: + for par_name in ( + "prot_modality", + "prot_raw_layer_input", + "prot_normalized_layer_input", + ): + if par[par_name]: + raise ValueError( + f"'{par_name}' can not be used when the input is not a MuData file." + ) + + if par["prot_modality"]: + for prot_arg in ("prot_raw_layer_input", "prot_raw_layer_output"): + if not par[prot_arg]: + ValueError( + f"When providing 'prot_modality', '{prot_arg}' must also be set." + ) + + NOT_SAME_VALUE = ( + ("rna_raw_layer_input", "rna_normalized_layer_input"), + ("prot_raw_layer_input", "prot_normalized_layer_input"), + ("rna_raw_layer_output", "rna_normalized_layer_output"), + ("prot_raw_layer_output", "prot_normalized_layer_output"), + ("rna_modality", "prot_modality"), + ) + for layer1_arg, layer2_arg in NOT_SAME_VALUE: + arg1_val, arg2_val = par[layer1_arg], par[layer2_arg] + if (arg1_val == arg2_val) and arg1_val is not None: + raise ValueError( + f"The value for argument '{layer1_arg}' ({arg1_val}) must not be the same as for argument '{layer2_arg}' ({arg2_val})" + ) + + with h5py.File(input, "r") as open_h5: + if "mod" in open_h5: + available_modalities = tuple(open_h5["mod"].keys()) + for mod in (par["rna_modality"], par["prot_modality"]): + if mod is not None and f"/mod/{mod}" not in open_h5: + raise ValueError( + f"Modality '{mod}' was not found in object, available modalities: {available_modalities}" + ) + + +def _copy_layer(source: str, dest: str, input_h5ad: Path, mod_obj: h5py.Group): + """ + Copy a h5py object from one location (source) to a new location (dest). + The source location must be located in the input AnnData file, the destination + will be written to a h5py Group object. + """ + logger.info("Copying layer '%s' from '%s' to '%s'", source, input_h5ad, dest) + if source == f"{mod_obj.name}/{dest}": + logger.info("Layer is already at the correct location. Nothing to do.") + return + if not input_h5ad.is_file(): + raise FileNotFoundError(f"Could not link to {input_h5ad}: file does not exist.") + with h5py.File(input_h5ad) as open_input: + if source not in open_input: + raise ValueError(f"Layer {source} was not found in {input_h5ad}.") + if dest in mod_obj: + logger.info(f"{dest} is already present. Removing before creating link.") + del mod_obj[dest] + # ExternalLink could have been used instead of 'copy', but it does not seem to work with tileDB + with h5py.File(str(input_h5ad), "r") as open_input: + open_input.copy( + source=open_input[source], dest=mod_obj, name=dest, expand_external=True + ) + logger.info("Copying complete.") + + +def _copy_column(src, dest, _, df_h5_obj): + """ + Duplicate a column in a dataframe and give a new name. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Copying column '%s' to '%s' for '%s'.", src, dest, df_h5_obj.name) + columns = df_h5_obj.attrs["column-order"] + if src == dest: + logger.info("Source and destination for the column are the same, not copying.") + return None + if dest in columns: + raise ValueError(f"Column {dest} already exists in {df_h5_obj.name}.") + df_h5_obj.attrs["column-order"] = np.append(columns, dest) + if src is None: + # Use the index as source. + src = df_h5_obj.attrs["_index"] + # Both the index and columns are written as keys to df_h5_obj + # An index name may be allowed to also be column name _only_ when + # the column and the index have the same contents (this is an anndata requirement). + # Here, this is always the case. + if src in df_h5_obj.attrs["column-order"]: + return None + df_h5_obj.copy(src, dest, expand_refs=True, expand_soft=True) + + +def _set_index_name(name, _, df_h5_obj): + """ + Set the index for a dataframe to an existing column. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Setting index name of '%s' to '%s'.", df_h5_obj.name, name) + original_index_name = df_h5_obj.attrs["_index"] + # An index and a column may share a key in df_h5_obj + # In this case we must not remove the original key from the object + operator = ( + df_h5_obj.move + if original_index_name not in df_h5_obj.attrs["column-order"] + else df_h5_obj.copy + ) + operator(original_index_name, name) + df_h5_obj.attrs["_index"] = name + logger.info("Done setting index name") + + +def _delete_all_keys(_, group_obj, exception=None): + """ + Delete all keys from a h5py.Group object; which optional exceptions. + """ + if exception is None: + exception = () + logger.info( + "Deleting all keys from '%s'%s.", + group_obj.name, + f" except {', '.join(exception)}" if exception else "", + ) + keys_to_delete = set(group_obj.keys()) - set(exception) + if not keys_to_delete: + logger.info("No keys to delete.") + return + for key in keys_to_delete: + del group_obj[key] + logger.info("Done deleting keys %s.", ", ".join(keys_to_delete)) + + +def _set_index_to_string_dtype(_, df_obj): + """ + Change the index dataype for a dataframe to string when it is encoded as categorical. + The dataframe must be provided as a h5py Group object that has been encoded by AnnData. + """ + logger.info( + "Checking if index of object '%s' is encoded as a categorical.", df_obj.name + ) + index_col_name = df_obj.attrs["_index"] + index_col = df_obj[index_col_name] + logger.info("Column '%s' is being used as the index.", index_col_name) + index_col_dtype = index_col.attrs["encoding-type"] + if index_col_dtype == "categorical": + logger.info( + "Column '%s' is encoded as categorical, converting to string.", + index_col_name, + ) + is_ordered = bool(index_col.attrs.get("ordered", False)) + codes, categories = index_col["codes"], index_col["categories"] + column = pd.Categorical.from_codes(codes, categories, is_ordered) + column_str = column.astype(str).astype(object) + del df_obj[index_col_name] + df_obj.create_dataset(index_col_name, data=column_str) + index_col.attrs["encoding-type"] = "string-array" + logger.info("Conversion to string dtype complete.") + return + logger.info( + "Column '%s' not encoded as categorical. Leaving as is.", index_col_name + ) + + +def _get_rna_conversion_specification(par): + rna_spec = [ + partial(_copy_layer, par["rna_raw_layer_input"], par["rna_raw_layer_output"]), + partial( + _copy_layer, + par["rna_normalized_layer_input"], + par["rna_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "rna"), + "obsm": partial(_convert_to_array, "rna"), + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + "var": [ + partial( + _copy_column, + par["rna_var_gene_names_input"], + par["rna_var_gene_names_output"], + ), + partial(_set_index_name, par["rna_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "obs": partial(_set_index_name, par["obs_index_name_output"]), + "layers": partial( + _delete_all_keys, + exception=( + par["rna_raw_layer_output"].removeprefix("layers/"), + par["rna_normalized_layer_output"].removeprefix("layers/"), + ), + ), + }, + ] + return rna_spec + + +def _get_prot_conversion_specification(par): + prot_spec = [ + partial(_copy_layer, par["prot_raw_layer_input"], par["prot_raw_layer_output"]), + partial( + _copy_layer, + par["prot_normalized_layer_input"], + par["prot_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "prot"), + "obsm": partial(_convert_to_array, "prot"), + "var": [ + partial(_set_index_name, par["prot_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + # "varm": _delete_all_keys, + "layers": partial( + _delete_all_keys, + exception=( + par["prot_raw_layer_output"].removeprefix("layers/"), + par["prot_normalized_layer_output"].removeprefix("layers/"), + ), + ), + "obs": partial(_set_index_name, par["obs_index_name_output"]), + }, + ] + + return prot_spec + + +def _copy_anndata_and_convert_inplace(anndata_path, conversion_specification): + converted_anndata = _get_temp_h5ad() + copy2(anndata_path, converted_anndata) + with h5py.File(converted_anndata, "r+") as open_prot_h5: + apply_conversion(conversion_specification, anndata_path, open_prot_h5) + return converted_anndata + + +@singledispatch +def apply_conversion(conversion_specification, input_h5ad, open_h5): + raise NotImplementedError( + f"Error: function 'apply_conversion' is not implemented for type '{type(conversion_specification)}'" + ) + + +@apply_conversion.register +def _(conversion_specification: list, input_h5ad, open_h5): + for item in conversion_specification: + apply_conversion(item, input_h5ad, open_h5) + + +@apply_conversion.register +def _(conversion_specifications: Mapping, input_h5ad, open_h5): + for h5_key, conversion_spec in conversion_specifications.items(): + if hasattr(open_h5, h5_key): + h5_obj = getattr(open_h5, h5_key) + else: + h5_obj = open_h5[h5_key] + apply_conversion(conversion_spec, input_h5ad, h5_obj) + + +@apply_conversion.register +def _(conversion_specifications: Callable, input_h5ad, open_h5): + conversion_specifications(input_h5ad, open_h5) + + +def _write_varm_obsm_indices(output_dir, mod_name, multidim_column_indices): + # TileDB SOMA does not support column indices for .varm and obsm matrices + # As a workaround, we will store these in the metadata slot + for multidim_key in ("varm", "obsm"): + collection_name = f"{str(output_dir)}/ms/{mod_name}/{multidim_key}" + if tiledbsoma.Collection.exists(collection_name): + with tiledbsoma.Collection.open( + collection_name, "w" + ) as multidim_collection: + for item in multidim_collection: + index_to_write = ( + multidim_column_indices.get(mod_name, {}) + .get(multidim_key, {}) + .get(item, None) + ) + if index_to_write: + multidim_collection[item].metadata["column_index"] = json.dumps( + index_to_write + ) + + +def _remove_directory(dir_path): + """ + Check if a directory exists and remove it does. + """ + if dir_path.exists(): + try: + # Delete the directory and all its contents + for item in dir_path.iterdir(): + if item.is_file() or item.is_symlink(): + item.unlink() # Removes files or symbolic links + elif item.is_dir(): + rmtree(item) # Removes directories recursively + logger.info("Directory '%s' has been deleted successfully.", dir_path) + except FileNotFoundError: + logger.info("Directory '%s' not found.", dir_path) + except PermissionError as e: + raise ValueError( + f"Permission denied: Unable to delete '{dir_path}'." + ) from e + + +def _align_obs_columns(experiment_df, anndata_df): + def _create_na_columns(df_to_write, columns, dtype_df): + """ + Create new columns in a dataframe with NA's as content while + making sure that the datatype of the newly created columns match + with those from another dataframe. + """ + for col in columns: + dtype = dtype_df[col].dtype + na = pd.NA if pd.api.types.is_extension_array_dtype(dtype) else np.nan + df_to_write.loc[:, col] = pd.Series( + na, index=df_to_write.index, dtype=dtype + ) + return df_to_write + + logger.info("Making sure obs columns are aligned.") + contents_columns = set(experiment_df.columns) + logger.info("Already ingested .obs columns: %s", ", ".join(contents_columns)) + logger.info("Retreiving obs columns to be added.") + + obs_to_add_columns = set(anndata_df.columns) + + logger.info("Checking dtype of common columns") + common_columns = obs_to_add_columns.intersection(contents_columns) + common_dtypes = {} + for column_name in common_columns: + # Combine the two columns, but only keep the calculated dtype. + # We'll update the data types in the two original frames and let tiledb do the joining. + new_series = experiment_df[column_name].combine_first(anndata_df[column_name]) + result_dtype = new_series.dtype + # Nullable floats are not supported by anndata... + if pd.api.types.is_float_dtype( + result_dtype + ) and pd.api.types.is_extension_array_dtype(result_dtype): + result_dtype = result_dtype.numpy_dtype + common_dtypes[column_name] = result_dtype + + logger.info( + "Updating the dtypes to comply with the following schema: %s", common_dtypes + ) + experiment_df, anndata_df = ( + experiment_df.astype(common_dtypes), + anndata_df.astype(common_dtypes), + ) + + logger.info("Will add the following columns: %s", ", ".join(obs_to_add_columns)) + missing_columns_in_old = obs_to_add_columns - contents_columns + logger.info( + "Adding the following columns to the schema: %s.", + ", ".join(missing_columns_in_old), + ) + experiment_df = _create_na_columns( + experiment_df, missing_columns_in_old, anndata_df + ) + + logger.info("Adjusting .obs succeeded!") + missing_columns_in_new = ( + contents_columns + - obs_to_add_columns + - {"soma_joinid", par["obs_index_name_output"]} + ) + logger.info( + "Adding the following columns to modality to be ingested: %s", + ", ".join(missing_columns_in_new), + ) + anndata_df = _create_na_columns(anndata_df, missing_columns_in_new, experiment_df) + return experiment_df, anndata_df + + +def main(par): + logger.info(f"Component {meta['name']} started.") + par["input"], par["tiledb_dir"] = Path(par["input"]), Path(par["tiledb_dir"]) + + # Handle case where input is a mudat file + with open(par["input"], "rb") as open_input_file: + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + is_mudata = open_input_file.read(6) == b"MuData" + + _check_input_args(par, par["input"], is_mudata) + + # Reference layers other than "X" refer as "layer/" + for arg_name in LAYER_ARGUMENTS: + par[arg_name] = _handle_layer_location(par[arg_name]) + + rna_input = par["input"] + # Create a temporary file that holds the converted RNA h5ad + if is_mudata: + if not par["rna_modality"]: + raise ValueError( + "'rna_modality' argument must be set if the input is a MuData file." + ) + # If the input is a MuData file, take the RNA modality and use that as input instead + rna_input = _h5mu_to_h5ad(par["input"], par["rna_modality"]) + + # Put a copy of the RNA input in the output file and convert it in-place + rna_converted = _copy_anndata_and_convert_inplace( + rna_input, _get_rna_conversion_specification(par) + ) + + if par["prot_modality"]: + prot_input = _h5mu_to_h5ad(par["input"], par["prot_modality"]) + prot_converted = _copy_anndata_and_convert_inplace( + prot_input, _get_prot_conversion_specification(par) + ) + + logger.info( + "Making sure that obs columns are shared between the two modalities." + ) + contents, obs_to_add = _read_obs(rna_converted), _read_obs(prot_converted) + contents, obs_to_add = _align_obs_columns(contents, obs_to_add) + _write_obs(prot_converted, obs_to_add) + _write_obs(rna_converted, contents) + + tiledbsoma.logging.info() + _remove_directory(par["tiledb_dir"]) + + logger.info("Ingesting RNA modality") + rna_from_h5ad_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": rna_converted, + "measurement_name": par["rna_modality_output"], + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, rna_from_h5ad_args) + func_to_call(**rna_from_h5ad_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "rna", multidim_column_indices) + + logger.info("Done ingesting RNA modality") + + if par["prot_modality"]: + logger.info("Ingesting protein modality") + + logger.info("Initalizing prot modality schema.") + # Note the 'schema_only' + prot_from_h5ad_schema_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "uns_keys": [], + "ingest_mode": "schema_only", + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_from_h5ad_schema_args) + func_to_call(**prot_from_h5ad_schema_args) + + logger.info("Registering prot modality anndata") + # Register the second anndata object in the protein measurement + register_prot_args = { + "experiment_uri": str(par["tiledb_dir"]), + "h5ad_file_names": [prot_converted], + "measurement_name": par["prot_modality_output"], + "obs_field_name": par["obs_index_name_output"], + "var_field_name": par["prot_var_index_name_output"], + "append_obsm_varm": True, + } + func_to_call = tiledbsoma.io.register_h5ads + _log_arguments(func_to_call, register_prot_args) + rd = func_to_call(**register_prot_args) + + rd.prepare_experiment(str(par["tiledb_dir"])) + + # Ingest the second anndata object into the protein measurement + prot_write_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "registration_mapping": rd, + "ingest_mode": "write", + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_write_args) + func_to_call(**prot_write_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "prot", multidim_column_indices) + + logger.info("Finished!") + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_TILEDB_DIR" ]; then + VIASH_PAR_TILEDB_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_TILEDB_DIR") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_TILEDB_DIR" ] && [ ! -e "$VIASH_PAR_TILEDB_DIR" ]; then + ViashError "Output file '$VIASH_PAR_TILEDB_DIR' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_h5mu_to_h5ad/.config.vsh.yaml b/target/executable/convert/from_h5mu_to_h5ad/.config.vsh.yaml new file mode 100644 index 00000000..8bb6738c --- /dev/null +++ b/target/executable/convert/from_h5mu_to_h5ad/.config.vsh.yaml @@ -0,0 +1,262 @@ +name: "from_h5mu_to_h5ad" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input MuData file" + info: null + default: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output AnnData file." + info: null + default: + - "output.h5ad" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a h5mu file into a h5ad file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_to_h5ad/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5mu_to_h5ad" + executable: "target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad b/target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad new file mode 100755 index 00000000..7bac46d3 --- /dev/null +++ b/target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad @@ -0,0 +1,1198 @@ +#!/usr/bin/env bash + +# from_h5mu_to_h5ad main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5mu_to_h5ad" +VIASH_META_FUNCTIONALITY_NAME="from_h5mu_to_h5ad" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5mu_to_h5ad" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5mu_to_h5ad main" + echo "" + echo "Converts a h5mu file into a h5ad file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " default: input.h5mu" + echo " Input MuData file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " default: output.h5ad" + echo " Output AnnData file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5mu_to_h5ad main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5mu_to_h5ad:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="output.h5ad" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5mu_to_h5ad-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# TODO: Merge modalities into one layer + +logger.info("Reading input h5mu file %s, modality %s", par["input"], par["modality"]) +adat = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Writing to %s.", par["output"]) +adat.write_h5ad(par["output"], compression=par["output_compression"]) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5mu_to_h5ad/nextflow_labels.config b/target/executable/convert/from_h5mu_to_h5ad/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5mu_to_h5ad/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/from_h5mu_to_h5ad/setup_logger.py b/target/executable/convert/from_h5mu_to_h5ad/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/convert/from_h5mu_to_h5ad/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/convert/from_h5mu_to_seurat/.config.vsh.yaml b/target/executable/convert/from_h5mu_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..2799a0bb --- /dev/null +++ b/target/executable/convert/from_h5mu_to_seurat/.config.vsh.yaml @@ -0,0 +1,237 @@ +name: "from_h5mu_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only\ + \ the intersection of cells is currently loaded into the Seurat object due to the\ + \ object structure limitation.\n - Multimodal embeddings (global .obsm slot) are\ + \ loaded with the assay.used field set to the default assay.\n - Embeddings names\ + \ are changed in order to comply with R & Seurat requirements and conventions.\n\ + \ - Feature names with underscores ('_') are automatically replaced with dashes\ + \ ('-')\n - Seurat does not support global variables metadata /var.\n" +test_resources: +- type: "r_script" + path: "run_test.R" + is_executable: true +- type: "file" + path: "10x_5k_anticmv" +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + interactive: false + - type: "r" + cran: + - "anndata" + - "hdf5r" + - "testthat" + - "SeuratObject" + - "Seurat" + bioc_force_install: false + warnings_as_errors: true + - type: "r" + github: + - "pmbio/MuDataSeurat@empty-tables-and-nullable" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_to_seurat/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/from_h5mu_to_seurat" + executable: "target/executable/convert/from_h5mu_to_seurat/from_h5mu_to_seurat" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/from_h5mu_to_seurat/from_h5mu_to_seurat b/target/executable/convert/from_h5mu_to_seurat/from_h5mu_to_seurat new file mode 100755 index 00000000..b45b4286 --- /dev/null +++ b/target/executable/convert/from_h5mu_to_seurat/from_h5mu_to_seurat @@ -0,0 +1,1219 @@ +#!/usr/bin/env bash + +# from_h5mu_to_seurat main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="from_h5mu_to_seurat" +VIASH_META_FUNCTIONALITY_NAME="from_h5mu_to_seurat" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:24.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev libgeos-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("anndata", "hdf5r", "testthat", "SeuratObject", "Seurat"), repos = "https://cran.rstudio.com")' + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_github(c("pmbio/MuDataSeurat@empty-tables-and-nullable"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component convert from_h5mu_to_seurat" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "from_h5mu_to_seurat main" + echo "" + echo "Converts an h5mu file into a Seurat file." + echo "" + echo "Restrictions:" + echo " - Only the intersection of cells is currently loaded into the Seurat object" + echo "due to the object structure limitation." + echo " - Multimodal embeddings (global .obsm slot) are loaded with the assay.used" + echo "field set to the default assay." + echo " - Embeddings names are changed in order to comply with R & Seurat requirements" + echo "and conventions." + echo " - Feature names with underscores ('_') are automatically replaced with dashes" + echo "('-')" + echo " - Seurat does not support global variables metadata /var." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.rds" + echo " Output Seurat file" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "from_h5mu_to_seurat main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/from_h5mu_to_seurat:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-from_h5mu_to_seurat-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(MuDataSeurat) +library(hdf5r) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + + +temp_h5mu <- tempfile(fileext = ".h5mu") +file.copy(par\$input, temp_h5mu) + +delete_modality <- function(open_h5, modality_path) { + open_h5\$link_delete(modality_path) + mod_name <- sub("/mod/", "", modality_path) + if ("mod-order" %in% names(hdf5r::h5attributes(open_h5[["mod"]]))) { + current_attributes <- hdf5r::h5attributes(open_h5[["mod"]])\$\`mod-order\` + current_attributes <- current_attributes[current_attributes != mod_name] + hdf5r::h5attr(open_h5[["mod"]], "mod-order") <- current_attributes + } + for (obj_prefix in c("obsm/", "varm/", "varmap/", "obsmap/")) { + obj_path <- paste0(obj_prefix, mod_name) + if (hdf5r::existsGroup(open_h5, obj_path)) { + open_h5\$link_delete(obj_path) + } + } +} + +open_file <- H5File\$new(temp_h5mu, mode = "r+") +modalities <- list.groups(open_file[["mod"]], + full.names = TRUE, recursive = FALSE +) +to_delete <- c() + +determine_matrix_dims <- function(dataset, indexpointers, indices, rowwise) { + if ("shape" %in% hdf5r::h5attr_names(dataset)) { + x_dims <- hdf5r::h5attr(dataset, "shape") + } else { + x_dims <- c(length(indexpointers) - 1, max(indices) + 1) + if (rowwise) { + x_dims <- rev(x_dims) + } + } + x_dims +} +for (modality_path in modalities) { + dataset <- open_file[[modality_path]][["X"]] + dataset_names <- names(dataset) + if ( + "data" %in% dataset_names && + "indices" %in% dataset_names && + "indptr" %in% dataset_names + ) { + indexpointers <- dataset[["indptr"]]\$read() + indices <- dataset[["indices"]]\$read() + rowwise <- FALSE + if ("encoding-type" %in% h5attr_names(dataset)) { + rowwise <- h5attr(dataset, "encoding-type") == "csr_matrix" + } + x_dims <- determine_matrix_dims(dataset, indexpointers, indices, rowwise) + if (x_dims[2] < 1) { + delete_modality(open_file, modality_path) + } + } else if (dataset\$dims[1] < 1) { + delete_modality(open_file, modality_path) + } +} + +open_file\$close_all() + +cat("Reading input file\\n") +obj <- ReadH5MU(temp_h5mu) + +cat("Writing output file\\n") +saveRDS(obj, file = par\$output, compress = TRUE) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/convert/from_h5mu_to_seurat/nextflow_labels.config b/target/executable/convert/from_h5mu_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/from_h5mu_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/velocyto_to_h5mu/.config.vsh.yaml b/target/executable/convert/velocyto_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..965d0fd8 --- /dev/null +++ b/target/executable/convert/velocyto_to_h5mu/.config.vsh.yaml @@ -0,0 +1,322 @@ +name: "velocyto_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input_loom" + description: "Path to the input loom file." + info: null + example: + - "input.loom" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_h5mu" + description: "If a MuData file is provided," + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The name of the modality to operate on." + info: null + default: + - "rna_velocity" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Path to the output MuData file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_spliced" + description: "Output layer for the spliced reads." + info: null + default: + - "velo_spliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_unspliced" + description: "Output layer for the unspliced reads." + info: null + default: + - "velo_unspliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_ambiguous" + description: "Output layer for the ambiguous reads." + info: null + default: + - "velo_ambiguous" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file\ + \ is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "loompy" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/velocyto_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/convert/velocyto_to_h5mu" + executable: "target/executable/convert/velocyto_to_h5mu/velocyto_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/convert/velocyto_to_h5mu/nextflow_labels.config b/target/executable/convert/velocyto_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/convert/velocyto_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/convert/velocyto_to_h5mu/velocyto_to_h5mu b/target/executable/convert/velocyto_to_h5mu/velocyto_to_h5mu new file mode 100755 index 00000000..858cdc32 --- /dev/null +++ b/target/executable/convert/velocyto_to_h5mu/velocyto_to_h5mu @@ -0,0 +1,1302 @@ +#!/usr/bin/env bash + +# velocyto_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer, author) +# * Robrecht Cannoodt (author) +# * Angela Oliveira Pisco (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="velocyto_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="velocyto_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "loompy" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Robrecht Cannoodt, Angela Oliveira Pisco" +LABEL org.opencontainers.image.description="Companion container for running component convert velocyto_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "velocyto_to_h5mu main" + echo "" + echo "Convert a velocyto loom file to a h5mu file." + echo "" + echo "If an input h5mu file is also provided, the velocity" + echo "h5ad object will get added to that h5mu instead." + echo "" + echo "Inputs:" + echo " --input_loom" + echo " type: file, required parameter, file must exist" + echo " example: input.loom" + echo " Path to the input loom file." + echo "" + echo " --input_h5mu" + echo " type: file, file must exist" + echo " example: input.h5mu" + echo " If a MuData file is provided," + echo "" + echo " --modality" + echo " type: string" + echo " default: rna_velocity" + echo " The name of the modality to operate on." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Path to the output MuData file." + echo "" + echo " --layer_spliced" + echo " type: string" + echo " default: velo_spliced" + echo " Output layer for the spliced reads." + echo "" + echo " --layer_unspliced" + echo " type: string" + echo " default: velo_unspliced" + echo " Output layer for the unspliced reads." + echo "" + echo " --layer_ambiguous" + echo " type: string" + echo " default: velo_ambiguous" + echo " Output layer for the ambiguous reads." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "velocyto_to_h5mu main" + exit + ;; + --input_loom) + [ -n "$VIASH_PAR_INPUT_LOOM" ] && ViashError Bad arguments for option \'--input_loom\': \'$VIASH_PAR_INPUT_LOOM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LOOM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_loom. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_loom=*) + [ -n "$VIASH_PAR_INPUT_LOOM" ] && ViashError Bad arguments for option \'--input_loom=*\': \'$VIASH_PAR_INPUT_LOOM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LOOM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_h5mu) + [ -n "$VIASH_PAR_INPUT_H5MU" ] && ViashError Bad arguments for option \'--input_h5mu\': \'$VIASH_PAR_INPUT_H5MU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_H5MU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_h5mu. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_h5mu=*) + [ -n "$VIASH_PAR_INPUT_H5MU" ] && ViashError Bad arguments for option \'--input_h5mu=*\': \'$VIASH_PAR_INPUT_H5MU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_H5MU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_spliced) + [ -n "$VIASH_PAR_LAYER_SPLICED" ] && ViashError Bad arguments for option \'--layer_spliced\': \'$VIASH_PAR_LAYER_SPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_SPLICED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_spliced. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_spliced=*) + [ -n "$VIASH_PAR_LAYER_SPLICED" ] && ViashError Bad arguments for option \'--layer_spliced=*\': \'$VIASH_PAR_LAYER_SPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_SPLICED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_unspliced) + [ -n "$VIASH_PAR_LAYER_UNSPLICED" ] && ViashError Bad arguments for option \'--layer_unspliced\': \'$VIASH_PAR_LAYER_UNSPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_UNSPLICED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_unspliced. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_unspliced=*) + [ -n "$VIASH_PAR_LAYER_UNSPLICED" ] && ViashError Bad arguments for option \'--layer_unspliced=*\': \'$VIASH_PAR_LAYER_UNSPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_UNSPLICED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_ambiguous) + [ -n "$VIASH_PAR_LAYER_AMBIGUOUS" ] && ViashError Bad arguments for option \'--layer_ambiguous\': \'$VIASH_PAR_LAYER_AMBIGUOUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_AMBIGUOUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_ambiguous. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_ambiguous=*) + [ -n "$VIASH_PAR_LAYER_AMBIGUOUS" ] && ViashError Bad arguments for option \'--layer_ambiguous=*\': \'$VIASH_PAR_LAYER_AMBIGUOUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_AMBIGUOUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/convert/velocyto_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_LOOM+x} ]; then + ViashError '--input_loom' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna_velocity" +fi +if [ -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then + VIASH_PAR_LAYER_SPLICED="velo_spliced" +fi +if [ -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then + VIASH_PAR_LAYER_UNSPLICED="velo_unspliced" +fi +if [ -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then + VIASH_PAR_LAYER_AMBIGUOUS="velo_ambiguous" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_LOOM" ] && [ ! -e "$VIASH_PAR_INPUT_LOOM" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_LOOM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_H5MU" ] && [ ! -e "$VIASH_PAR_INPUT_H5MU" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_H5MU' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_LOOM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_LOOM")" ) + VIASH_PAR_INPUT_LOOM=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_LOOM") +fi +if [ ! -z "$VIASH_PAR_INPUT_H5MU" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_H5MU")" ) + VIASH_PAR_INPUT_H5MU=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_H5MU") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-velocyto_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import anndata as ad +import mudata as mu +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.string_ = np.bytes_ +numpy_module.unicode_ = np.str_ +sys.modules["numpy"] = numpy_module + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_loom': $( if [ ! -z ${VIASH_PAR_INPUT_LOOM+x} ]; then echo "r'${VIASH_PAR_INPUT_LOOM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_h5mu': $( if [ ! -z ${VIASH_PAR_INPUT_H5MU+x} ]; then echo "r'${VIASH_PAR_INPUT_H5MU//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_spliced': $( if [ ! -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_SPLICED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_unspliced': $( if [ ! -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_UNSPLICED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_ambiguous': $( if [ ! -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then echo "r'${VIASH_PAR_LAYER_AMBIGUOUS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Parameters:", par, flush=True) + +print("Reading AnnData from loom", flush=True) +adata_in = ad.read_loom(par["input_loom"]) +adata_in.var_names = adata_in.var["Accession"] + +print("Creating clean AnnData", flush=True) +adata = ad.AnnData( + X=adata_in.X, + obs=adata_in.obs[[]], + var=adata_in.var[[]], + layers={ + par["layer_spliced"]: adata_in.layers["spliced"], + par["layer_unspliced"]: adata_in.layers["unspliced"], + par["layer_ambiguous"]: adata_in.layers["ambiguous"], + }, +) + +if par["input_h5mu"]: + print("Received input h5mu to read", flush=True) + mdata = mu.read_h5mu(par["input_h5mu"]) + + print(f"Storing AnnData in modality {par['modality']}", flush=True) + mdata.mod[par["modality"]] = adata +else: + print("Creating h5mu from scratch", flush=True) + mdata = mu.MuData({par["modality"]: adata}) + +print("Resulting mudata:", mdata, flush=True) + +print("Writing h5mu to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_LOOM" ]; then + VIASH_PAR_INPUT_LOOM=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_LOOM") + fi + if [ ! -z "$VIASH_PAR_INPUT_H5MU" ]; then + VIASH_PAR_INPUT_H5MU=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_H5MU") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/correction/cellbender_remove_background/.config.vsh.yaml b/target/executable/correction/cellbender_remove_background/.config.vsh.yaml new file mode 100644 index 00000000..2d2a0f79 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background/.config.vsh.yaml @@ -0,0 +1,660 @@ +name: "cellbender_remove_background" +namespace: "correction" +version: "main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered:\ + \ it should include empty droplets." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Full count matrix as an h5mu file, with background RNA removed.\ + \ This file contains all the original droplet barcodes." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_output" + description: "Output layer" + info: null + default: + - "cellbender_corrected" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_background_fraction" + info: null + default: + - "cellbender_background_fraction" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_probability" + info: null + default: + - "cellbender_cell_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_size" + info: null + default: + - "cellbender_cell_size" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_droplet_efficiency" + description: "Name of the column in the .obs dataframe to store the droplet efficiencies\ + \ in.\n" + info: null + default: + - "cellbender_droplet_efficiency" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_scale" + info: null + default: + - "cellbender_latent_scale" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_ambient_expression" + info: null + default: + - "cellbender_ambient_expression" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_expression_encoding" + info: null + default: + - "cellbender_gene_expression_encoding" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean" + name: "--expected_cells_from_qc" + description: "Will use the Cell Ranger QC to determine the estimated number of\ + \ cells" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--expected_cells" + description: "Number of cells expected in the dataset (a rough estimate within\ + \ a factor of 2 is sufficient)." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--total_droplets_included" + description: "The number of droplets from the rank-ordered UMI plot\nthat will\ + \ have their cell probabilities inferred as an\noutput. Include the droplets\ + \ which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should\ + \ be\n'surely empty' droplets.\n" + info: null + example: + - 25000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cell_umi_prior" + description: "Ignore CellBender's heuristic prior estimation, and use this prior\ + \ for UMI counts in cells." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_empty_umi_prior" + description: "Ignore CellBender's heuristic prior estimation, and use this prior\ + \ for UMI counts in empty droplets." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model" + description: "Which model is being used for count data.\n\n* 'naive' subtracts\ + \ the estimated ambient profile.\n* 'simple' does not model either ambient RNA\ + \ or random barcode swapping (for debugging purposes -- not recommended).\n\ + * 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping'\ + \ assumes background RNA comes from random barcode swapping (via PCR chimeras).\n\ + * 'full' uses a combined ambient and swapping model.\n" + info: null + default: + - "full" + required: false + choices: + - "naive" + - "simple" + - "ambient" + - "swapping" + - "full" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of epochs to train." + info: null + default: + - 150 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--low_count_threshold" + description: "Droplets with UMI counts below this number are completely \nexcluded\ + \ from the analysis. This can help identify the correct \nprior for empty droplet\ + \ counts in the rare case where empty \ncounts are extremely high (over 200).\n" + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_dim" + description: "Dimension of latent variable z.\n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_layers" + description: "Dimension of hidden layers in the encoder for z.\n" + info: null + default: + - 512 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--training_fraction" + description: "Training detail: the fraction of the data used for training.\nThe\ + \ rest is never seen by the inference algorithm. Speeds up learning.\n" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--empty_drop_training_fraction" + description: "Training detail: the fraction of the training data each epoch that\ + \ \nis drawn (randomly sampled) from surely empty droplets.\n" + info: null + default: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ignore_features" + description: "Integer indices of features to ignore entirely. In the output\n\ + count matrix, the counts for these features will be unchanged.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--fpr" + description: "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\n\ + of samples which will be jointly analyzed for differential expression.\nA false\ + \ positive is a true signal count that is erroneously removed.\nMore background\ + \ removal is accompanied by more signal removal at\nhigh values of FPR. You\ + \ can specify multiple values, which will\ncreate multiple output files.\n" + info: null + default: + - 0.01 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--exclude_feature_types" + description: "Feature types to ignore during the analysis. These features will\n\ + be left unchanged in the output file.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--projected_ambient_count_threshold" + description: "Controls how many features are included in the analysis, which\n\ + can lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD\ + \ counts total in all cells\n(summed), then that gene is excluded, and it will\ + \ be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD\ + \ = 0 will include all features\nwhich have even a single count in any empty\ + \ droplet.\n" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate" + description: "Training detail: lower learning rate for inference.\nA OneCycle\ + \ learning rate schedule is used, where the\nupper learning rate is ten times\ + \ this value. (For this\nvalue, probably do not exceed 1e-3).\n" + info: null + default: + - 1.0E-4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--final_elbo_fail_fraction" + description: "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO\ + \ - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically\ + \ re-run if --num-training-tries > 1.\nBy default, will not fail training based\ + \ on final_training_ELBO.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--epoch_elbo_fail_fraction" + description: "Training is considered to have failed if \n(previous_epoch_test_ELBO\ + \ - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)\ + \ > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries\ + \ > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_training_tries" + description: "Number of times to attempt to train the model. At each subsequent\ + \ attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate_retry_mult" + description: "Learning rate is multiplied by this amount each time a new training\n\ + attempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION\ + \ or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n" + info: null + default: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--posterior_batch_size" + description: "Training detail: size of batches when creating the posterior.\n\ + Reduce this to avoid running out of GPU memory creating the posterior\n(will\ + \ be slower).\n" + info: null + default: + - 128 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--posterior_regulation" + description: "Posterior regularization method. (For experts: not required for\ + \ normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n\ + * PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n\ + * PRmu_gene is approximate mean-targeting per gene.\n" + info: null + required: false + choices: + - "PRq" + - "PRmu" + - "PRmu_gene" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "Tunable parameter alpha for the PRq posterior regularization method\n\ + (not normally used: see documentation).\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--q" + description: "Tunable parameter q for the CDF threshold estimation method (not\n\ + normally used: see documentation).\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--estimator" + description: "Output denoised count estimation method. (For experts: not required\n\ + for normal usage, see documentation).\n" + info: null + default: + - "mckp" + required: false + choices: + - "map" + - "mean" + - "cdf" + - "sample" + - "mckp" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--estimator_multiple_cpu" + description: "Including the flag --estimator-multiple-cpu will use more than one\n\ + CPU to compute the MCKP output count estimator in parallel (does nothing\nfor\ + \ other estimators).\n" + info: null + direction: "input" + - type: "boolean" + name: "--constant_learning_rate" + description: "Including the flag --constant-learning-rate will use the ClippedAdam\n\ + optimizer instead of the OneCycleLR learning rate schedule, which is\nthe default.\ + \ Learning is faster with the OneCycleLR schedule.\nHowever, training can easily\ + \ be continued from a checkpoint for more\nepochs than the initial command specified\ + \ when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule\ + \ with 150 epochs\nspecified, it is not possible to pick up from that final\ + \ checkpoint\nand continue training until 250 epochs.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--debug" + description: "Including the flag --debug will log extra messages useful for debugging.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--cuda" + description: "Including the flag --cuda will run the inference on a\nGPU.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Eliminating technical artifacts from high-throughput single-cell RNA\ + \ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\ + \ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\ + \ moment, only the count matrices produced by the CellRanger count pipeline is supported.\ + \ Support for additional tools and protocols \nwill be added in the future. A quick\ + \ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential\ + \ libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates\ + \ curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev\ + \ liblzma-dev mecab-ipadic-utf8 git \\\n&& curl https://pyenv.run | bash \\\n\ + && pyenv update \\\n&& pyenv install $PYTHON_VERSION \\\n&& pyenv global $PYTHON_VERSION\ + \ \\\n&& apt-get clean\n" + env: + - "PYENV_ROOT=\"/root/.pyenv\"" + - "PATH=\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\"" + - "PYTHON_VERSION=3.7.16" + - type: "python" + user: false + packages: + - "lxml~=4.8.0" + - "mudata~=0.2.1" + - "cellbender~=0.3.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/correction/cellbender_remove_background/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/correction/cellbender_remove_background" + executable: "target/executable/correction/cellbender_remove_background/cellbender_remove_background" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/correction/cellbender_remove_background/cellbender_remove_background b/target/executable/correction/cellbender_remove_background/cellbender_remove_background new file mode 100755 index 00000000..5d3dc607 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background/cellbender_remove_background @@ -0,0 +1,2390 @@ +#!/usr/bin/env bash + +# cellbender_remove_background main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellbender_remove_background" +VIASH_META_FUNCTIONALITY_NAME="cellbender_remove_background" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04 +ENTRYPOINT [] +ENV PYENV_ROOT="/root/.pyenv" +ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH" +ENV PYTHON_VERSION=3.7.16 +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 git \ +&& curl https://pyenv.run | bash \ +&& pyenv update \ +&& pyenv install $PYTHON_VERSION \ +&& pyenv global $PYTHON_VERSION \ +&& apt-get clean + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "lxml~=4.8.0" "mudata~=0.2.1" "cellbender~=0.3.0" + +LABEL org.opencontainers.image.description="Companion container for running component correction cellbender_remove_background" +LABEL org.opencontainers.image.created="2025-08-22T07:23:09Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellbender_remove_background main" + echo "" + echo "Eliminating technical artifacts from high-throughput single-cell RNA sequencing" + echo "data." + echo "" + echo "This module removes counts due to ambient RNA molecules and random barcode" + echo "swapping from (raw) UMI-based scRNA-seq count matrices." + echo "At the moment, only the count matrices produced by the CellRanger count pipeline" + echo "is supported. Support for additional tools and protocols" + echo "will be added in the future. A quick start tutorial can be found here." + echo "" + echo "Fleming et al. 2022, bioRxiv." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file. Data file on which to run tool. Data must be" + echo " un-filtered: it should include empty droplets." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " List of modalities to process." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Full count matrix as an h5mu file, with background RNA removed. This" + echo " file contains all the original droplet barcodes." + echo "" + echo " --layer_output" + echo " type: string" + echo " default: cellbender_corrected" + echo " Output layer" + echo "" + echo " --obs_background_fraction" + echo " type: string" + echo " default: cellbender_background_fraction" + echo "" + echo " --obs_cell_probability" + echo " type: string" + echo " default: cellbender_cell_probability" + echo "" + echo " --obs_cell_size" + echo " type: string" + echo " default: cellbender_cell_size" + echo "" + echo " --obs_droplet_efficiency" + echo " type: string" + echo " default: cellbender_droplet_efficiency" + echo " Name of the column in the .obs dataframe to store the droplet" + echo " efficiencies in." + echo "" + echo " --obs_latent_scale" + echo " type: string" + echo " default: cellbender_latent_scale" + echo "" + echo " --var_ambient_expression" + echo " type: string" + echo " default: cellbender_ambient_expression" + echo "" + echo " --obsm_gene_expression_encoding" + echo " type: string" + echo " default: cellbender_gene_expression_encoding" + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --expected_cells_from_qc" + echo " type: boolean" + echo " default: false" + echo " Will use the Cell Ranger QC to determine the estimated number of cells" + echo "" + echo " --expected_cells" + echo " type: integer" + echo " example: 1000" + echo " Number of cells expected in the dataset (a rough estimate within a" + echo " factor of 2 is sufficient)." + echo "" + echo " --total_droplets_included" + echo " type: integer" + echo " example: 25000" + echo " The number of droplets from the rank-ordered UMI plot" + echo " that will have their cell probabilities inferred as an" + echo " output. Include the droplets which might contain cells." + echo " Droplets beyond TOTAL_DROPLETS_INCLUDED should be" + echo " 'surely empty' droplets." + echo "" + echo " --force_cell_umi_prior" + echo " type: integer" + echo " Ignore CellBender's heuristic prior estimation, and use this prior for" + echo " UMI counts in cells." + echo "" + echo " --force_empty_umi_prior" + echo " type: integer" + echo " Ignore CellBender's heuristic prior estimation, and use this prior for" + echo " UMI counts in empty droplets." + echo "" + echo " --model" + echo " type: string" + echo " default: full" + echo " choices: [ naive, simple, ambient, swapping, full ]" + echo " Which model is being used for count data." + echo " * 'naive' subtracts the estimated ambient profile." + echo " * 'simple' does not model either ambient RNA or random barcode swapping" + echo " (for debugging purposes -- not recommended)." + echo " * 'ambient' assumes background RNA is incorporated into droplets." + echo " * 'swapping' assumes background RNA comes from random barcode swapping" + echo " (via PCR chimeras)." + echo " * 'full' uses a combined ambient and swapping model." + echo "" + echo " --epochs" + echo " type: integer" + echo " default: 150" + echo " Number of epochs to train." + echo "" + echo " --low_count_threshold" + echo " type: integer" + echo " default: 5" + echo " Droplets with UMI counts below this number are completely" + echo " excluded from the analysis. This can help identify the correct" + echo " prior for empty droplet counts in the rare case where empty" + echo " counts are extremely high (over 200)." + echo "" + echo " --z_dim" + echo " type: integer" + echo " default: 64" + echo " Dimension of latent variable z." + echo "" + echo " --z_layers" + echo " type: integer, multiple values allowed" + echo " default: 512" + echo " Dimension of hidden layers in the encoder for z." + echo "" + echo " --training_fraction" + echo " type: double" + echo " default: 0.9" + echo " Training detail: the fraction of the data used for training." + echo " The rest is never seen by the inference algorithm. Speeds up learning." + echo "" + echo " --empty_drop_training_fraction" + echo " type: double" + echo " default: 0.2" + echo " Training detail: the fraction of the training data each epoch that" + echo " is drawn (randomly sampled) from surely empty droplets." + echo "" + echo " --ignore_features" + echo " type: integer, multiple values allowed" + echo " Integer indices of features to ignore entirely. In the output" + echo " count matrix, the counts for these features will be unchanged." + echo "" + echo " --fpr" + echo " type: double, multiple values allowed" + echo " default: 0.01" + echo " Target 'delta' false positive rate in [0, 1). Use 0 for a cohort" + echo " of samples which will be jointly analyzed for differential expression." + echo " A false positive is a true signal count that is erroneously removed." + echo " More background removal is accompanied by more signal removal at" + echo " high values of FPR. You can specify multiple values, which will" + echo " create multiple output files." + echo "" + echo " --exclude_feature_types" + echo " type: string, multiple values allowed" + echo " Feature types to ignore during the analysis. These features will" + echo " be left unchanged in the output file." + echo "" + echo " --projected_ambient_count_threshold" + echo " type: double" + echo " default: 0.1" + echo " Controls how many features are included in the analysis, which" + echo " can lead to a large speedup. If a feature is expected to have less" + echo " than PROJECTED_AMBIENT_COUNT_THRESHOLD counts total in all cells" + echo " (summed), then that gene is excluded, and it will be unchanged" + echo " in the output count matrix. For example," + echo " PROJECTED_AMBIENT_COUNT_THRESHOLD = 0 will include all features" + echo " which have even a single count in any empty droplet." + echo "" + echo " --learning_rate" + echo " type: double" + echo " default: 1.0E-4" + echo " Training detail: lower learning rate for inference." + echo " A OneCycle learning rate schedule is used, where the" + echo " upper learning rate is ten times this value. (For this" + echo " value, probably do not exceed 1e-3)." + echo "" + echo " --final_elbo_fail_fraction" + echo " type: double" + echo " Training is considered to have failed if" + echo " (best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO)" + echo " > FINAL_ELBO_FAIL_FRACTION." + echo " Training will automatically re-run if --num-training-tries > 1." + echo " By default, will not fail training based on final_training_ELBO." + echo "" + echo " --epoch_elbo_fail_fraction" + echo " type: double" + echo " Training is considered to have failed if" + echo " (previous_epoch_test_ELBO -" + echo " current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)" + echo " > EPOCH_ELBO_FAIL_FRACTION." + echo " Training will automatically re-run if --num-training-tries > 1." + echo " By default, will not fail training based on epoch_training_ELBO." + echo "" + echo " --num_training_tries" + echo " type: integer" + echo " default: 1" + echo " Number of times to attempt to train the model. At each subsequent" + echo " attempt," + echo " the learning rate is multiplied by LEARNING_RATE_RETRY_MULT." + echo "" + echo " --learning_rate_retry_mult" + echo " type: double" + echo " default: 0.2" + echo " Learning rate is multiplied by this amount each time a new training" + echo " attempt is made. (This parameter is only used if training fails based" + echo " on EPOCH_ELBO_FAIL_FRACTION or FINAL_ELBO_FAIL_FRACTION and" + echo " NUM_TRAINING_TRIES is > 1.)" + echo "" + echo " --posterior_batch_size" + echo " type: integer" + echo " default: 128" + echo " Training detail: size of batches when creating the posterior." + echo " Reduce this to avoid running out of GPU memory creating the posterior" + echo " (will be slower)." + echo "" + echo " --posterior_regulation" + echo " type: string" + echo " choices: [ PRq, PRmu, PRmu_gene ]" + echo " Posterior regularization method. (For experts: not required for normal" + echo " usage," + echo " see documentation)." + echo " * PRq is approximate quantile-targeting." + echo " * PRmu is approximate mean-targeting aggregated over genes (behavior of" + echo " v0.2.0)." + echo " * PRmu_gene is approximate mean-targeting per gene." + echo "" + echo " --alpha" + echo " type: double" + echo " Tunable parameter alpha for the PRq posterior regularization method" + echo " (not normally used: see documentation)." + echo "" + echo " --q" + echo " type: double" + echo " Tunable parameter q for the CDF threshold estimation method (not" + echo " normally used: see documentation)." + echo "" + echo " --estimator" + echo " type: string" + echo " default: mckp" + echo " choices: [ map, mean, cdf, sample, mckp ]" + echo " Output denoised count estimation method. (For experts: not required" + echo " for normal usage, see documentation)." + echo "" + echo " --estimator_multiple_cpu" + echo " type: boolean_true" + echo " Including the flag --estimator-multiple-cpu will use more than one" + echo " CPU to compute the MCKP output count estimator in parallel (does nothing" + echo " for other estimators)." + echo "" + echo " --constant_learning_rate" + echo " type: boolean" + echo " Including the flag --constant-learning-rate will use the ClippedAdam" + echo " optimizer instead of the OneCycleLR learning rate schedule, which is" + echo " the default. Learning is faster with the OneCycleLR schedule." + echo " However, training can easily be continued from a checkpoint for more" + echo " epochs than the initial command specified when using ClippedAdam. On" + echo " the other hand, if using the OneCycleLR schedule with 150 epochs" + echo " specified, it is not possible to pick up from that final checkpoint" + echo " and continue training until 250 epochs." + echo "" + echo " --debug" + echo " type: boolean_true" + echo " Including the flag --debug will log extra messages useful for debugging." + echo "" + echo " --cuda" + echo " type: boolean_true" + echo " Including the flag --cuda will run the inference on a" + echo " GPU." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellbender_remove_background main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_output) + [ -n "$VIASH_PAR_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--layer_output\': \'$VIASH_PAR_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_output=*) + [ -n "$VIASH_PAR_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--layer_output=*\': \'$VIASH_PAR_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_background_fraction) + [ -n "$VIASH_PAR_OBS_BACKGROUND_FRACTION" ] && ViashError Bad arguments for option \'--obs_background_fraction\': \'$VIASH_PAR_OBS_BACKGROUND_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BACKGROUND_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_background_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_background_fraction=*) + [ -n "$VIASH_PAR_OBS_BACKGROUND_FRACTION" ] && ViashError Bad arguments for option \'--obs_background_fraction=*\': \'$VIASH_PAR_OBS_BACKGROUND_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BACKGROUND_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_cell_probability) + [ -n "$VIASH_PAR_OBS_CELL_PROBABILITY" ] && ViashError Bad arguments for option \'--obs_cell_probability\': \'$VIASH_PAR_OBS_CELL_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_cell_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_cell_probability=*) + [ -n "$VIASH_PAR_OBS_CELL_PROBABILITY" ] && ViashError Bad arguments for option \'--obs_cell_probability=*\': \'$VIASH_PAR_OBS_CELL_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_cell_size) + [ -n "$VIASH_PAR_OBS_CELL_SIZE" ] && ViashError Bad arguments for option \'--obs_cell_size\': \'$VIASH_PAR_OBS_CELL_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_cell_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_cell_size=*) + [ -n "$VIASH_PAR_OBS_CELL_SIZE" ] && ViashError Bad arguments for option \'--obs_cell_size=*\': \'$VIASH_PAR_OBS_CELL_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_droplet_efficiency) + [ -n "$VIASH_PAR_OBS_DROPLET_EFFICIENCY" ] && ViashError Bad arguments for option \'--obs_droplet_efficiency\': \'$VIASH_PAR_OBS_DROPLET_EFFICIENCY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_DROPLET_EFFICIENCY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_droplet_efficiency. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_droplet_efficiency=*) + [ -n "$VIASH_PAR_OBS_DROPLET_EFFICIENCY" ] && ViashError Bad arguments for option \'--obs_droplet_efficiency=*\': \'$VIASH_PAR_OBS_DROPLET_EFFICIENCY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_DROPLET_EFFICIENCY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_latent_scale) + [ -n "$VIASH_PAR_OBS_LATENT_SCALE" ] && ViashError Bad arguments for option \'--obs_latent_scale\': \'$VIASH_PAR_OBS_LATENT_SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_SCALE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_latent_scale. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_latent_scale=*) + [ -n "$VIASH_PAR_OBS_LATENT_SCALE" ] && ViashError Bad arguments for option \'--obs_latent_scale=*\': \'$VIASH_PAR_OBS_LATENT_SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_SCALE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_ambient_expression) + [ -n "$VIASH_PAR_VAR_AMBIENT_EXPRESSION" ] && ViashError Bad arguments for option \'--var_ambient_expression\': \'$VIASH_PAR_VAR_AMBIENT_EXPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_AMBIENT_EXPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_ambient_expression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_ambient_expression=*) + [ -n "$VIASH_PAR_VAR_AMBIENT_EXPRESSION" ] && ViashError Bad arguments for option \'--var_ambient_expression=*\': \'$VIASH_PAR_VAR_AMBIENT_EXPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_AMBIENT_EXPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_gene_expression_encoding) + [ -n "$VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING" ] && ViashError Bad arguments for option \'--obsm_gene_expression_encoding\': \'$VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_gene_expression_encoding. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_gene_expression_encoding=*) + [ -n "$VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING" ] && ViashError Bad arguments for option \'--obsm_gene_expression_encoding=*\': \'$VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_cells_from_qc) + [ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ] && ViashError Bad arguments for option \'--expected_cells_from_qc\': \'$VIASH_PAR_EXPECTED_CELLS_FROM_QC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS_FROM_QC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_cells_from_qc. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_cells_from_qc=*) + [ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ] && ViashError Bad arguments for option \'--expected_cells_from_qc=*\': \'$VIASH_PAR_EXPECTED_CELLS_FROM_QC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS_FROM_QC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_cells) + [ -n "$VIASH_PAR_EXPECTED_CELLS" ] && ViashError Bad arguments for option \'--expected_cells\': \'$VIASH_PAR_EXPECTED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_cells=*) + [ -n "$VIASH_PAR_EXPECTED_CELLS" ] && ViashError Bad arguments for option \'--expected_cells=*\': \'$VIASH_PAR_EXPECTED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --total_droplets_included) + [ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ] && ViashError Bad arguments for option \'--total_droplets_included\': \'$VIASH_PAR_TOTAL_DROPLETS_INCLUDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TOTAL_DROPLETS_INCLUDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --total_droplets_included. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --total_droplets_included=*) + [ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ] && ViashError Bad arguments for option \'--total_droplets_included=*\': \'$VIASH_PAR_TOTAL_DROPLETS_INCLUDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TOTAL_DROPLETS_INCLUDED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_cell_umi_prior) + [ -n "$VIASH_PAR_FORCE_CELL_UMI_PRIOR" ] && ViashError Bad arguments for option \'--force_cell_umi_prior\': \'$VIASH_PAR_FORCE_CELL_UMI_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELL_UMI_PRIOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --force_cell_umi_prior. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_cell_umi_prior=*) + [ -n "$VIASH_PAR_FORCE_CELL_UMI_PRIOR" ] && ViashError Bad arguments for option \'--force_cell_umi_prior=*\': \'$VIASH_PAR_FORCE_CELL_UMI_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELL_UMI_PRIOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_empty_umi_prior) + [ -n "$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR" ] && ViashError Bad arguments for option \'--force_empty_umi_prior\': \'$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_EMPTY_UMI_PRIOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --force_empty_umi_prior. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_empty_umi_prior=*) + [ -n "$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR" ] && ViashError Bad arguments for option \'--force_empty_umi_prior=*\': \'$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_EMPTY_UMI_PRIOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --epochs) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --epochs=*) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs=*\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --low_count_threshold) + [ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--low_count_threshold\': \'$VIASH_PAR_LOW_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOW_COUNT_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --low_count_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --low_count_threshold=*) + [ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--low_count_threshold=*\': \'$VIASH_PAR_LOW_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOW_COUNT_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --z_dim) + [ -n "$VIASH_PAR_Z_DIM" ] && ViashError Bad arguments for option \'--z_dim\': \'$VIASH_PAR_Z_DIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Z_DIM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --z_dim. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --z_dim=*) + [ -n "$VIASH_PAR_Z_DIM" ] && ViashError Bad arguments for option \'--z_dim=*\': \'$VIASH_PAR_Z_DIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Z_DIM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --z_layers) + if [ -z "$VIASH_PAR_Z_LAYERS" ]; then + VIASH_PAR_Z_LAYERS="$2" + else + VIASH_PAR_Z_LAYERS="$VIASH_PAR_Z_LAYERS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --z_layers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --z_layers=*) + if [ -z "$VIASH_PAR_Z_LAYERS" ]; then + VIASH_PAR_Z_LAYERS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_Z_LAYERS="$VIASH_PAR_Z_LAYERS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --training_fraction) + [ -n "$VIASH_PAR_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--training_fraction\': \'$VIASH_PAR_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAINING_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --training_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --training_fraction=*) + [ -n "$VIASH_PAR_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--training_fraction=*\': \'$VIASH_PAR_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAINING_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --empty_drop_training_fraction) + [ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--empty_drop_training_fraction\': \'$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --empty_drop_training_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --empty_drop_training_fraction=*) + [ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--empty_drop_training_fraction=*\': \'$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ignore_features) + if [ -z "$VIASH_PAR_IGNORE_FEATURES" ]; then + VIASH_PAR_IGNORE_FEATURES="$2" + else + VIASH_PAR_IGNORE_FEATURES="$VIASH_PAR_IGNORE_FEATURES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ignore_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ignore_features=*) + if [ -z "$VIASH_PAR_IGNORE_FEATURES" ]; then + VIASH_PAR_IGNORE_FEATURES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_IGNORE_FEATURES="$VIASH_PAR_IGNORE_FEATURES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --fpr) + if [ -z "$VIASH_PAR_FPR" ]; then + VIASH_PAR_FPR="$2" + else + VIASH_PAR_FPR="$VIASH_PAR_FPR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fpr. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fpr=*) + if [ -z "$VIASH_PAR_FPR" ]; then + VIASH_PAR_FPR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_FPR="$VIASH_PAR_FPR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --exclude_feature_types) + if [ -z "$VIASH_PAR_EXCLUDE_FEATURE_TYPES" ]; then + VIASH_PAR_EXCLUDE_FEATURE_TYPES="$2" + else + VIASH_PAR_EXCLUDE_FEATURE_TYPES="$VIASH_PAR_EXCLUDE_FEATURE_TYPES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude_feature_types. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude_feature_types=*) + if [ -z "$VIASH_PAR_EXCLUDE_FEATURE_TYPES" ]; then + VIASH_PAR_EXCLUDE_FEATURE_TYPES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_EXCLUDE_FEATURE_TYPES="$VIASH_PAR_EXCLUDE_FEATURE_TYPES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --projected_ambient_count_threshold) + [ -n "$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--projected_ambient_count_threshold\': \'$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --projected_ambient_count_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --projected_ambient_count_threshold=*) + [ -n "$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--projected_ambient_count_threshold=*\': \'$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --learning_rate) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --learning_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --learning_rate=*) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate=*\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --final_elbo_fail_fraction) + [ -n "$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION" ] && ViashError Bad arguments for option \'--final_elbo_fail_fraction\': \'$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINAL_ELBO_FAIL_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --final_elbo_fail_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --final_elbo_fail_fraction=*) + [ -n "$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION" ] && ViashError Bad arguments for option \'--final_elbo_fail_fraction=*\': \'$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINAL_ELBO_FAIL_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --epoch_elbo_fail_fraction) + [ -n "$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION" ] && ViashError Bad arguments for option \'--epoch_elbo_fail_fraction\': \'$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --epoch_elbo_fail_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --epoch_elbo_fail_fraction=*) + [ -n "$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION" ] && ViashError Bad arguments for option \'--epoch_elbo_fail_fraction=*\': \'$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_training_tries) + [ -n "$VIASH_PAR_NUM_TRAINING_TRIES" ] && ViashError Bad arguments for option \'--num_training_tries\': \'$VIASH_PAR_NUM_TRAINING_TRIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_TRAINING_TRIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_training_tries. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_training_tries=*) + [ -n "$VIASH_PAR_NUM_TRAINING_TRIES" ] && ViashError Bad arguments for option \'--num_training_tries=*\': \'$VIASH_PAR_NUM_TRAINING_TRIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_TRAINING_TRIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --learning_rate_retry_mult) + [ -n "$VIASH_PAR_LEARNING_RATE_RETRY_MULT" ] && ViashError Bad arguments for option \'--learning_rate_retry_mult\': \'$VIASH_PAR_LEARNING_RATE_RETRY_MULT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE_RETRY_MULT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --learning_rate_retry_mult. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --learning_rate_retry_mult=*) + [ -n "$VIASH_PAR_LEARNING_RATE_RETRY_MULT" ] && ViashError Bad arguments for option \'--learning_rate_retry_mult=*\': \'$VIASH_PAR_LEARNING_RATE_RETRY_MULT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE_RETRY_MULT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --posterior_batch_size) + [ -n "$VIASH_PAR_POSTERIOR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--posterior_batch_size\': \'$VIASH_PAR_POSTERIOR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_BATCH_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --posterior_batch_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --posterior_batch_size=*) + [ -n "$VIASH_PAR_POSTERIOR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--posterior_batch_size=*\': \'$VIASH_PAR_POSTERIOR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_BATCH_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --posterior_regulation) + [ -n "$VIASH_PAR_POSTERIOR_REGULATION" ] && ViashError Bad arguments for option \'--posterior_regulation\': \'$VIASH_PAR_POSTERIOR_REGULATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_REGULATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --posterior_regulation. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --posterior_regulation=*) + [ -n "$VIASH_PAR_POSTERIOR_REGULATION" ] && ViashError Bad arguments for option \'--posterior_regulation=*\': \'$VIASH_PAR_POSTERIOR_REGULATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_REGULATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alpha=*) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha=*\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --q) + [ -n "$VIASH_PAR_Q" ] && ViashError Bad arguments for option \'--q\': \'$VIASH_PAR_Q\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Q="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --q. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --q=*) + [ -n "$VIASH_PAR_Q" ] && ViashError Bad arguments for option \'--q=*\': \'$VIASH_PAR_Q\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Q=$(ViashRemoveFlags "$1") + shift 1 + ;; + --estimator) + [ -n "$VIASH_PAR_ESTIMATOR" ] && ViashError Bad arguments for option \'--estimator\': \'$VIASH_PAR_ESTIMATOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESTIMATOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --estimator. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --estimator=*) + [ -n "$VIASH_PAR_ESTIMATOR" ] && ViashError Bad arguments for option \'--estimator=*\': \'$VIASH_PAR_ESTIMATOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESTIMATOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --estimator_multiple_cpu) + [ -n "$VIASH_PAR_ESTIMATOR_MULTIPLE_CPU" ] && ViashError Bad arguments for option \'--estimator_multiple_cpu\': \'$VIASH_PAR_ESTIMATOR_MULTIPLE_CPU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESTIMATOR_MULTIPLE_CPU=true + shift 1 + ;; + --constant_learning_rate) + [ -n "$VIASH_PAR_CONSTANT_LEARNING_RATE" ] && ViashError Bad arguments for option \'--constant_learning_rate\': \'$VIASH_PAR_CONSTANT_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CONSTANT_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --constant_learning_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --constant_learning_rate=*) + [ -n "$VIASH_PAR_CONSTANT_LEARNING_RATE" ] && ViashError Bad arguments for option \'--constant_learning_rate=*\': \'$VIASH_PAR_CONSTANT_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CONSTANT_LEARNING_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --debug) + [ -n "$VIASH_PAR_DEBUG" ] && ViashError Bad arguments for option \'--debug\': \'$VIASH_PAR_DEBUG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DEBUG=true + shift 1 + ;; + --cuda) + [ -n "$VIASH_PAR_CUDA" ] && ViashError Bad arguments for option \'--cuda\': \'$VIASH_PAR_CUDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUDA=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/correction/cellbender_remove_background:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then + VIASH_PAR_LAYER_OUTPUT="cellbender_corrected" +fi +if [ -z ${VIASH_PAR_OBS_BACKGROUND_FRACTION+x} ]; then + VIASH_PAR_OBS_BACKGROUND_FRACTION="cellbender_background_fraction" +fi +if [ -z ${VIASH_PAR_OBS_CELL_PROBABILITY+x} ]; then + VIASH_PAR_OBS_CELL_PROBABILITY="cellbender_cell_probability" +fi +if [ -z ${VIASH_PAR_OBS_CELL_SIZE+x} ]; then + VIASH_PAR_OBS_CELL_SIZE="cellbender_cell_size" +fi +if [ -z ${VIASH_PAR_OBS_DROPLET_EFFICIENCY+x} ]; then + VIASH_PAR_OBS_DROPLET_EFFICIENCY="cellbender_droplet_efficiency" +fi +if [ -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then + VIASH_PAR_OBS_LATENT_SCALE="cellbender_latent_scale" +fi +if [ -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then + VIASH_PAR_VAR_AMBIENT_EXPRESSION="cellbender_ambient_expression" +fi +if [ -z ${VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING+x} ]; then + VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING="cellbender_gene_expression_encoding" +fi +if [ -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then + VIASH_PAR_EXPECTED_CELLS_FROM_QC="false" +fi +if [ -z ${VIASH_PAR_MODEL+x} ]; then + VIASH_PAR_MODEL="full" +fi +if [ -z ${VIASH_PAR_EPOCHS+x} ]; then + VIASH_PAR_EPOCHS="150" +fi +if [ -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then + VIASH_PAR_LOW_COUNT_THRESHOLD="5" +fi +if [ -z ${VIASH_PAR_Z_DIM+x} ]; then + VIASH_PAR_Z_DIM="64" +fi +if [ -z ${VIASH_PAR_Z_LAYERS+x} ]; then + VIASH_PAR_Z_LAYERS="512" +fi +if [ -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then + VIASH_PAR_TRAINING_FRACTION="0.9" +fi +if [ -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION="0.2" +fi +if [ -z ${VIASH_PAR_FPR+x} ]; then + VIASH_PAR_FPR="0.01" +fi +if [ -z ${VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD+x} ]; then + VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD="0.1" +fi +if [ -z ${VIASH_PAR_LEARNING_RATE+x} ]; then + VIASH_PAR_LEARNING_RATE="1.0E-4" +fi +if [ -z ${VIASH_PAR_NUM_TRAINING_TRIES+x} ]; then + VIASH_PAR_NUM_TRAINING_TRIES="1" +fi +if [ -z ${VIASH_PAR_LEARNING_RATE_RETRY_MULT+x} ]; then + VIASH_PAR_LEARNING_RATE_RETRY_MULT="0.2" +fi +if [ -z ${VIASH_PAR_POSTERIOR_BATCH_SIZE+x} ]; then + VIASH_PAR_POSTERIOR_BATCH_SIZE="128" +fi +if [ -z ${VIASH_PAR_ESTIMATOR+x} ]; then + VIASH_PAR_ESTIMATOR="mckp" +fi +if [ -z ${VIASH_PAR_ESTIMATOR_MULTIPLE_CPU+x} ]; then + VIASH_PAR_ESTIMATOR_MULTIPLE_CPU="false" +fi +if [ -z ${VIASH_PAR_DEBUG+x} ]; then + VIASH_PAR_DEBUG="false" +fi +if [ -z ${VIASH_PAR_CUDA+x} ]; then + VIASH_PAR_CUDA="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--expected_cells_from_qc' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXPECTED_CELLS" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--expected_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ]]; then + if ! [[ "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--total_droplets_included' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FORCE_CELL_UMI_PRIOR" ]]; then + if ! [[ "$VIASH_PAR_FORCE_CELL_UMI_PRIOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--force_cell_umi_prior' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR" ]]; then + if ! [[ "$VIASH_PAR_FORCE_EMPTY_UMI_PRIOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--force_empty_umi_prior' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_LOW_COUNT_THRESHOLD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--low_count_threshold' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_Z_DIM" ]]; then + if ! [[ "$VIASH_PAR_Z_DIM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--z_dim' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_Z_LAYERS" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_Z_LAYERS; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--z_layers' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_TRAINING_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_TRAINING_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--training_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--empty_drop_training_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_IGNORE_FEATURES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_IGNORE_FEATURES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--ignore_features' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_FPR" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_FPR; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--fpr' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--projected_ambient_count_threshold' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEARNING_RATE" ]]; then + if ! [[ "$VIASH_PAR_LEARNING_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--learning_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_FINAL_ELBO_FAIL_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--final_elbo_fail_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--epoch_elbo_fail_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NUM_TRAINING_TRIES" ]]; then + if ! [[ "$VIASH_PAR_NUM_TRAINING_TRIES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_training_tries' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEARNING_RATE_RETRY_MULT" ]]; then + if ! [[ "$VIASH_PAR_LEARNING_RATE_RETRY_MULT" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--learning_rate_retry_mult' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_POSTERIOR_BATCH_SIZE" ]]; then + if ! [[ "$VIASH_PAR_POSTERIOR_BATCH_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--posterior_batch_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALPHA" ]]; then + if ! [[ "$VIASH_PAR_ALPHA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alpha' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_Q" ]]; then + if ! [[ "$VIASH_PAR_Q" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--q' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ESTIMATOR_MULTIPLE_CPU" ]]; then + if ! [[ "$VIASH_PAR_ESTIMATOR_MULTIPLE_CPU" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--estimator_multiple_cpu' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CONSTANT_LEARNING_RATE" ]]; then + if ! [[ "$VIASH_PAR_CONSTANT_LEARNING_RATE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--constant_learning_rate' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DEBUG" ]]; then + if ! [[ "$VIASH_PAR_DEBUG" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--debug' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CUDA" ]]; then + if ! [[ "$VIASH_PAR_CUDA" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--cuda' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL_CHOICES=("naive;simple;ambient;swapping;full") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODEL_CHOICES[*]};" =~ ";$VIASH_PAR_MODEL;" ]]; then + ViashError '--model' specified value of \'$VIASH_PAR_MODEL\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_POSTERIOR_REGULATION" ]; then + VIASH_PAR_POSTERIOR_REGULATION_CHOICES=("PRq;PRmu;PRmu_gene") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_POSTERIOR_REGULATION_CHOICES[*]};" =~ ";$VIASH_PAR_POSTERIOR_REGULATION;" ]]; then + ViashError '--posterior_regulation' specified value of \'$VIASH_PAR_POSTERIOR_REGULATION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_ESTIMATOR" ]; then + VIASH_PAR_ESTIMATOR_CHOICES=("map;mean;cdf;sample;mckp") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_ESTIMATOR_CHOICES[*]};" =~ ";$VIASH_PAR_ESTIMATOR;" ]]; then + ViashError '--estimator' specified value of \'$VIASH_PAR_ESTIMATOR\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellbender_remove_background-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix +from cellbender.remove_background.downstream import anndata_from_h5 + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_output': $( if [ ! -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_background_fraction': $( if [ ! -z ${VIASH_PAR_OBS_BACKGROUND_FRACTION+x} ]; then echo "r'${VIASH_PAR_OBS_BACKGROUND_FRACTION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_cell_probability': $( if [ ! -z ${VIASH_PAR_OBS_CELL_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_cell_size': $( if [ ! -z ${VIASH_PAR_OBS_CELL_SIZE+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_SIZE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_droplet_efficiency': $( if [ ! -z ${VIASH_PAR_OBS_DROPLET_EFFICIENCY+x} ]; then echo "r'${VIASH_PAR_OBS_DROPLET_EFFICIENCY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_latent_scale': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_SCALE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_ambient_expression': $( if [ ! -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then echo "r'${VIASH_PAR_VAR_AMBIENT_EXPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_gene_expression_encoding': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'expected_cells_from_qc': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then echo "r'${VIASH_PAR_EXPECTED_CELLS_FROM_QC//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'expected_cells': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'total_droplets_included': $( if [ ! -z ${VIASH_PAR_TOTAL_DROPLETS_INCLUDED+x} ]; then echo "int(r'${VIASH_PAR_TOTAL_DROPLETS_INCLUDED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'force_cell_umi_prior': $( if [ ! -z ${VIASH_PAR_FORCE_CELL_UMI_PRIOR+x} ]; then echo "int(r'${VIASH_PAR_FORCE_CELL_UMI_PRIOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'force_empty_umi_prior': $( if [ ! -z ${VIASH_PAR_FORCE_EMPTY_UMI_PRIOR+x} ]; then echo "int(r'${VIASH_PAR_FORCE_EMPTY_UMI_PRIOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'low_count_threshold': $( if [ ! -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then echo "int(r'${VIASH_PAR_LOW_COUNT_THRESHOLD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'z_dim': $( if [ ! -z ${VIASH_PAR_Z_DIM+x} ]; then echo "int(r'${VIASH_PAR_Z_DIM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'z_layers': $( if [ ! -z ${VIASH_PAR_Z_LAYERS+x} ]; then echo "list(map(int, r'${VIASH_PAR_Z_LAYERS//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'training_fraction': $( if [ ! -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_TRAINING_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'empty_drop_training_fraction': $( if [ ! -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'ignore_features': $( if [ ! -z ${VIASH_PAR_IGNORE_FEATURES+x} ]; then echo "list(map(int, r'${VIASH_PAR_IGNORE_FEATURES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'fpr': $( if [ ! -z ${VIASH_PAR_FPR+x} ]; then echo "list(map(float, r'${VIASH_PAR_FPR//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'exclude_feature_types': $( if [ ! -z ${VIASH_PAR_EXCLUDE_FEATURE_TYPES+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_FEATURE_TYPES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'projected_ambient_count_threshold': $( if [ ! -z ${VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'final_elbo_fail_fraction': $( if [ ! -z ${VIASH_PAR_FINAL_ELBO_FAIL_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_FINAL_ELBO_FAIL_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'epoch_elbo_fail_fraction': $( if [ ! -z ${VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'num_training_tries': $( if [ ! -z ${VIASH_PAR_NUM_TRAINING_TRIES+x} ]; then echo "int(r'${VIASH_PAR_NUM_TRAINING_TRIES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'learning_rate_retry_mult': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE_RETRY_MULT+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE_RETRY_MULT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'posterior_batch_size': $( if [ ! -z ${VIASH_PAR_POSTERIOR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_POSTERIOR_BATCH_SIZE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'posterior_regulation': $( if [ ! -z ${VIASH_PAR_POSTERIOR_REGULATION+x} ]; then echo "r'${VIASH_PAR_POSTERIOR_REGULATION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'q': $( if [ ! -z ${VIASH_PAR_Q+x} ]; then echo "float(r'${VIASH_PAR_Q//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'estimator': $( if [ ! -z ${VIASH_PAR_ESTIMATOR+x} ]; then echo "r'${VIASH_PAR_ESTIMATOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'estimator_multiple_cpu': $( if [ ! -z ${VIASH_PAR_ESTIMATOR_MULTIPLE_CPU+x} ]; then echo "r'${VIASH_PAR_ESTIMATOR_MULTIPLE_CPU//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'constant_learning_rate': $( if [ ! -z ${VIASH_PAR_CONSTANT_LEARNING_RATE+x} ]; then echo "r'${VIASH_PAR_CONSTANT_LEARNING_RATE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'debug': $( if [ ! -z ${VIASH_PAR_DEBUG+x} ]; then echo "r'${VIASH_PAR_DEBUG//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'cuda': $( if [ ! -z ${VIASH_PAR_CUDA+x} ]; then echo "r'${VIASH_PAR_CUDA//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# import pathlib +# with pathlib.Path(os.path.dirname(par["output"])) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + # don't create checkpoints because they're not used / returned anyways + "--checkpoint-mins", + "99999999", + ] + + if meta.get("cpus") is not None: + cmd_pars += ["--cpu-threads", str(meta["cpus"])] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--force-cell-umi-prior", "force_cell_umi_prior", True), + ("--force-empty-umi-prior", "force_empty_umi_prior", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ("--ignore-features", "ignore_features", True), + ("--fpr", "fpr", True), + ("--exclude-feature-types", "exclude_feature_types", True), + ( + "--projected-ambient-count-threshold", + "projected_ambient_count_threshold", + True, + ), + ("--learning-rate", "learning_rate", True), + ("--final-elbo-fail-fraction", "final_elbo_fail_fraction", True), + ("--epoch-elbo-fail-fraction", "epoch_elbo_fail_fraction", True), + ("--num-training-tries", "num_training_tries", True), + ("--learning-rate-retry-mult", "learning_rate_retry_mult", True), + ("--posterior-batch-size", "posterior_batch_size", True), + ("--posterior-regulation", "posterior_regulation", True), + ("--alpha", "alpha", True), + ("--q", "q", True), + ("--estimator", "estimator", True), + ("--estimator-multiple-cpu", "estimator_multiple_cpu", False), + ("--constant-learning-rate", "constant_learning_rate", False), + ("--debug", "debug", False), + ("--cuda", "cuda", False), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("CellBender output format:", adata_out) + + # AnnData object with n_obs x n_vars = 6794880 x 33538 + # obs: 'cellbender_analyzed' + # var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed' + # uns: 'background_fraction', 'barcode_indices_for_latents', 'cell_probability', 'cell_size', 'droplet_efficiency', 'gene_expression_encoding', + # 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', + # 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', + # 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', + # 'target_false_positive_rate' + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_background_fraction": "background_fraction", + "obs_cell_probability": "cell_probability", + "obs_cell_size": "cell_size", + "obs_droplet_efficiency": "droplet_efficiency", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_gene_expression_encoding output to MuData") + obsm_store = {"obsm_gene_expression_encoding": "gene_expression_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/correction/cellbender_remove_background/nextflow_labels.config b/target/executable/correction/cellbender_remove_background/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/correction/cellbender_remove_background/setup_logger.py b/target/executable/correction/cellbender_remove_background/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/correction/cellbender_remove_background_v0_2/.config.vsh.yaml b/target/executable/correction/cellbender_remove_background_v0_2/.config.vsh.yaml new file mode 100644 index 00000000..57653995 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background_v0_2/.config.vsh.yaml @@ -0,0 +1,455 @@ +name: "cellbender_remove_background_v0_2" +namespace: "correction" +version: "main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Full count matrix as an h5mu file, with background RNA removed.\ + \ This file contains all the original droplet barcodes." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_output" + description: "Output layer" + info: null + default: + - "corrected" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_rt_efficiency" + info: null + default: + - "latent_rt_efficiency" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_cell_probability" + info: null + default: + - "latent_cell_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_scale" + info: null + default: + - "latent_scale" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_ambient_expression" + info: null + default: + - "ambient_expression" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_latent_gene_encoding" + info: null + default: + - "cellbender_latent_gene_encoding" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--expected_cells" + description: "Number of cells expected in the dataset (a rough estimate within\ + \ a factor of 2 is sufficient)." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--total_droplets_included" + description: "The number of droplets from the rank-ordered UMI plot\nthat will\ + \ be analyzed. The largest 'total_droplets'\ndroplets will have their cell probabilities\ + \ inferred\nas an output.\n" + info: null + example: + - 25000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--expected_cells_from_qc" + description: "Will use the Cell Ranger QC to determine the estimated number of\ + \ cells" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model" + description: "Which model is being used for count data. 'simple'\ndoes not model\ + \ either ambient RNA or random barcode\nswapping (for debugging purposes --\ + \ not recommended).\n'ambient' assumes background RNA is incorporated into\n\ + droplets. 'swapping' assumes background RNA comes from\nrandom barcode swapping.\ + \ 'full' uses a combined\nambient and swapping model.\n" + info: null + default: + - "full" + required: false + choices: + - "simple" + - "ambient" + - "swapping" + - "full" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of epochs to train." + info: null + default: + - 150 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--low_count_threshold" + description: "Droplets with UMI counts below this number are completely \nexcluded\ + \ from the analysis. This can help identify the correct \nprior for empty droplet\ + \ counts in the rare case where empty \ncounts are extremely high (over 200).\n" + info: null + default: + - 15 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_dim" + description: "Dimension of latent variable z.\n" + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_layers" + description: "Dimension of hidden layers in the encoder for z.\n" + info: null + default: + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--training_fraction" + description: "Training detail: the fraction of the data used for training.\nThe\ + \ rest is never seen by the inference algorithm. Speeds up learning.\n" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--empty_drop_training_fraction" + description: "Training detail: the fraction of the training data each epoch that\ + \ \nis drawn (randomly sampled) from surely empty droplets.\n" + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fpr" + description: "Target false positive rate in (0, 1). A false positive\nis a true\ + \ signal count that is erroneously removed.\nMore background removal is accompanied\ + \ by more signal\nremoval at high values of FPR. You can specify\nmultiple values,\ + \ which will create multiple output\nfiles.\n" + info: null + default: + - 0.01 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_antibody_capture" + description: "Including the flag --exclude-antibody-capture will\ncause remove-background\ + \ to operate on gene counts\nonly, ignoring other features.\n" + info: null + direction: "input" + - type: "double" + name: "--learning_rate" + description: "Training detail: lower learning rate for inference. A\nOneCycle\ + \ learning rate schedule is used, where the\nupper learning rate is ten times\ + \ this value. (For this\nvalue, probably do not exceed 1e-3).\n" + info: null + example: + - 1.0E-4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--cuda" + description: "Including the flag --cuda will run the inference on a\nGPU.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Eliminating technical artifacts from high-throughput single-cell RNA\ + \ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\ + \ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\ + \ moment, only the count matrices produced by the CellRanger count pipeline is supported.\ + \ Support for additional tools and protocols \nwill be added in the future. A quick\ + \ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.12-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "muon==0.1.5" + - "cellbender==0.2.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "muon~=0.1.4" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/correction/cellbender_remove_background_v0_2/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/correction/cellbender_remove_background_v0_2" + executable: "target/executable/correction/cellbender_remove_background_v0_2/cellbender_remove_background_v0_2" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/correction/cellbender_remove_background_v0_2/cellbender_remove_background_v0_2 b/target/executable/correction/cellbender_remove_background_v0_2/cellbender_remove_background_v0_2 new file mode 100755 index 00000000..5bff7c62 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background_v0_2/cellbender_remove_background_v0_2 @@ -0,0 +1,1859 @@ +#!/usr/bin/env bash + +# cellbender_remove_background_v0_2 main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellbender_remove_background_v0_2" +VIASH_META_FUNCTIONALITY_NAME="cellbender_remove_background_v0_2" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:23.12-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "muon==0.1.5" "cellbender==0.2.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.description="Companion container for running component correction cellbender_remove_background_v0_2" +LABEL org.opencontainers.image.created="2025-08-22T07:23:09Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellbender_remove_background_v0_2 main" + echo "" + echo "Eliminating technical artifacts from high-throughput single-cell RNA sequencing" + echo "data." + echo "" + echo "This module removes counts due to ambient RNA molecules and random barcode" + echo "swapping from (raw) UMI-based scRNA-seq count matrices." + echo "At the moment, only the count matrices produced by the CellRanger count pipeline" + echo "is supported. Support for additional tools and protocols" + echo "will be added in the future. A quick start tutorial can be found here." + echo "" + echo "Fleming et al. 2022, bioRxiv." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " List of modalities to process." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Full count matrix as an h5mu file, with background RNA removed. This" + echo " file contains all the original droplet barcodes." + echo "" + echo " --layer_output" + echo " type: string" + echo " default: corrected" + echo " Output layer" + echo "" + echo " --obs_latent_rt_efficiency" + echo " type: string" + echo " default: latent_rt_efficiency" + echo "" + echo " --obs_latent_cell_probability" + echo " type: string" + echo " default: latent_cell_probability" + echo "" + echo " --obs_latent_scale" + echo " type: string" + echo " default: latent_scale" + echo "" + echo " --var_ambient_expression" + echo " type: string" + echo " default: ambient_expression" + echo "" + echo " --obsm_latent_gene_encoding" + echo " type: string" + echo " default: cellbender_latent_gene_encoding" + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --expected_cells" + echo " type: integer" + echo " example: 1000" + echo " Number of cells expected in the dataset (a rough estimate within a" + echo " factor of 2 is sufficient)." + echo "" + echo " --total_droplets_included" + echo " type: integer" + echo " example: 25000" + echo " The number of droplets from the rank-ordered UMI plot" + echo " that will be analyzed. The largest 'total_droplets'" + echo " droplets will have their cell probabilities inferred" + echo " as an output." + echo "" + echo " --expected_cells_from_qc" + echo " type: boolean" + echo " default: true" + echo " Will use the Cell Ranger QC to determine the estimated number of cells" + echo "" + echo " --model" + echo " type: string" + echo " default: full" + echo " choices: [ simple, ambient, swapping, full ]" + echo " Which model is being used for count data. 'simple'" + echo " does not model either ambient RNA or random barcode" + echo " swapping (for debugging purposes -- not recommended)." + echo " 'ambient' assumes background RNA is incorporated into" + echo " droplets. 'swapping' assumes background RNA comes from" + echo " random barcode swapping. 'full' uses a combined" + echo " ambient and swapping model." + echo "" + echo " --epochs" + echo " type: integer" + echo " default: 150" + echo " Number of epochs to train." + echo "" + echo " --low_count_threshold" + echo " type: integer" + echo " default: 15" + echo " Droplets with UMI counts below this number are completely" + echo " excluded from the analysis. This can help identify the correct" + echo " prior for empty droplet counts in the rare case where empty" + echo " counts are extremely high (over 200)." + echo "" + echo " --z_dim" + echo " type: integer" + echo " default: 100" + echo " Dimension of latent variable z." + echo "" + echo " --z_layers" + echo " type: integer, multiple values allowed" + echo " default: 500" + echo " Dimension of hidden layers in the encoder for z." + echo "" + echo " --training_fraction" + echo " type: double" + echo " default: 0.9" + echo " Training detail: the fraction of the data used for training." + echo " The rest is never seen by the inference algorithm. Speeds up learning." + echo "" + echo " --empty_drop_training_fraction" + echo " type: double" + echo " default: 0.5" + echo " Training detail: the fraction of the training data each epoch that" + echo " is drawn (randomly sampled) from surely empty droplets." + echo "" + echo " --fpr" + echo " type: double, multiple values allowed" + echo " default: 0.01" + echo " Target false positive rate in (0, 1). A false positive" + echo " is a true signal count that is erroneously removed." + echo " More background removal is accompanied by more signal" + echo " removal at high values of FPR. You can specify" + echo " multiple values, which will create multiple output" + echo " files." + echo "" + echo " --exclude_antibody_capture" + echo " type: boolean_true" + echo " Including the flag --exclude-antibody-capture will" + echo " cause remove-background to operate on gene counts" + echo " only, ignoring other features." + echo "" + echo " --learning_rate" + echo " type: double" + echo " example: 1.0E-4" + echo " Training detail: lower learning rate for inference. A" + echo " OneCycle learning rate schedule is used, where the" + echo " upper learning rate is ten times this value. (For this" + echo " value, probably do not exceed 1e-3)." + echo "" + echo " --cuda" + echo " type: boolean_true" + echo " Including the flag --cuda will run the inference on a" + echo " GPU." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellbender_remove_background_v0_2 main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_output) + [ -n "$VIASH_PAR_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--layer_output\': \'$VIASH_PAR_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_output=*) + [ -n "$VIASH_PAR_LAYER_OUTPUT" ] && ViashError Bad arguments for option \'--layer_output=*\': \'$VIASH_PAR_LAYER_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_latent_rt_efficiency) + [ -n "$VIASH_PAR_OBS_LATENT_RT_EFFICIENCY" ] && ViashError Bad arguments for option \'--obs_latent_rt_efficiency\': \'$VIASH_PAR_OBS_LATENT_RT_EFFICIENCY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_RT_EFFICIENCY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_latent_rt_efficiency. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_latent_rt_efficiency=*) + [ -n "$VIASH_PAR_OBS_LATENT_RT_EFFICIENCY" ] && ViashError Bad arguments for option \'--obs_latent_rt_efficiency=*\': \'$VIASH_PAR_OBS_LATENT_RT_EFFICIENCY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_RT_EFFICIENCY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_latent_cell_probability) + [ -n "$VIASH_PAR_OBS_LATENT_CELL_PROBABILITY" ] && ViashError Bad arguments for option \'--obs_latent_cell_probability\': \'$VIASH_PAR_OBS_LATENT_CELL_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_CELL_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_latent_cell_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_latent_cell_probability=*) + [ -n "$VIASH_PAR_OBS_LATENT_CELL_PROBABILITY" ] && ViashError Bad arguments for option \'--obs_latent_cell_probability=*\': \'$VIASH_PAR_OBS_LATENT_CELL_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_CELL_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_latent_scale) + [ -n "$VIASH_PAR_OBS_LATENT_SCALE" ] && ViashError Bad arguments for option \'--obs_latent_scale\': \'$VIASH_PAR_OBS_LATENT_SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_SCALE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_latent_scale. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_latent_scale=*) + [ -n "$VIASH_PAR_OBS_LATENT_SCALE" ] && ViashError Bad arguments for option \'--obs_latent_scale=*\': \'$VIASH_PAR_OBS_LATENT_SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LATENT_SCALE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_ambient_expression) + [ -n "$VIASH_PAR_VAR_AMBIENT_EXPRESSION" ] && ViashError Bad arguments for option \'--var_ambient_expression\': \'$VIASH_PAR_VAR_AMBIENT_EXPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_AMBIENT_EXPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_ambient_expression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_ambient_expression=*) + [ -n "$VIASH_PAR_VAR_AMBIENT_EXPRESSION" ] && ViashError Bad arguments for option \'--var_ambient_expression=*\': \'$VIASH_PAR_VAR_AMBIENT_EXPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_AMBIENT_EXPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_latent_gene_encoding) + [ -n "$VIASH_PAR_OBSM_LATENT_GENE_ENCODING" ] && ViashError Bad arguments for option \'--obsm_latent_gene_encoding\': \'$VIASH_PAR_OBSM_LATENT_GENE_ENCODING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_LATENT_GENE_ENCODING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_latent_gene_encoding. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_latent_gene_encoding=*) + [ -n "$VIASH_PAR_OBSM_LATENT_GENE_ENCODING" ] && ViashError Bad arguments for option \'--obsm_latent_gene_encoding=*\': \'$VIASH_PAR_OBSM_LATENT_GENE_ENCODING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_LATENT_GENE_ENCODING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_cells) + [ -n "$VIASH_PAR_EXPECTED_CELLS" ] && ViashError Bad arguments for option \'--expected_cells\': \'$VIASH_PAR_EXPECTED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_cells=*) + [ -n "$VIASH_PAR_EXPECTED_CELLS" ] && ViashError Bad arguments for option \'--expected_cells=*\': \'$VIASH_PAR_EXPECTED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --total_droplets_included) + [ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ] && ViashError Bad arguments for option \'--total_droplets_included\': \'$VIASH_PAR_TOTAL_DROPLETS_INCLUDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TOTAL_DROPLETS_INCLUDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --total_droplets_included. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --total_droplets_included=*) + [ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ] && ViashError Bad arguments for option \'--total_droplets_included=*\': \'$VIASH_PAR_TOTAL_DROPLETS_INCLUDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TOTAL_DROPLETS_INCLUDED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_cells_from_qc) + [ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ] && ViashError Bad arguments for option \'--expected_cells_from_qc\': \'$VIASH_PAR_EXPECTED_CELLS_FROM_QC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS_FROM_QC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_cells_from_qc. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_cells_from_qc=*) + [ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ] && ViashError Bad arguments for option \'--expected_cells_from_qc=*\': \'$VIASH_PAR_EXPECTED_CELLS_FROM_QC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELLS_FROM_QC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --epochs) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --epochs=*) + [ -n "$VIASH_PAR_EPOCHS" ] && ViashError Bad arguments for option \'--epochs=*\': \'$VIASH_PAR_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --low_count_threshold) + [ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--low_count_threshold\': \'$VIASH_PAR_LOW_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOW_COUNT_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --low_count_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --low_count_threshold=*) + [ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ] && ViashError Bad arguments for option \'--low_count_threshold=*\': \'$VIASH_PAR_LOW_COUNT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOW_COUNT_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --z_dim) + [ -n "$VIASH_PAR_Z_DIM" ] && ViashError Bad arguments for option \'--z_dim\': \'$VIASH_PAR_Z_DIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Z_DIM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --z_dim. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --z_dim=*) + [ -n "$VIASH_PAR_Z_DIM" ] && ViashError Bad arguments for option \'--z_dim=*\': \'$VIASH_PAR_Z_DIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_Z_DIM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --z_layers) + if [ -z "$VIASH_PAR_Z_LAYERS" ]; then + VIASH_PAR_Z_LAYERS="$2" + else + VIASH_PAR_Z_LAYERS="$VIASH_PAR_Z_LAYERS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --z_layers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --z_layers=*) + if [ -z "$VIASH_PAR_Z_LAYERS" ]; then + VIASH_PAR_Z_LAYERS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_Z_LAYERS="$VIASH_PAR_Z_LAYERS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --training_fraction) + [ -n "$VIASH_PAR_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--training_fraction\': \'$VIASH_PAR_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAINING_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --training_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --training_fraction=*) + [ -n "$VIASH_PAR_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--training_fraction=*\': \'$VIASH_PAR_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRAINING_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --empty_drop_training_fraction) + [ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--empty_drop_training_fraction\': \'$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --empty_drop_training_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --empty_drop_training_fraction=*) + [ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ] && ViashError Bad arguments for option \'--empty_drop_training_fraction=*\': \'$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fpr) + if [ -z "$VIASH_PAR_FPR" ]; then + VIASH_PAR_FPR="$2" + else + VIASH_PAR_FPR="$VIASH_PAR_FPR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fpr. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fpr=*) + if [ -z "$VIASH_PAR_FPR" ]; then + VIASH_PAR_FPR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_FPR="$VIASH_PAR_FPR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --exclude_antibody_capture) + [ -n "$VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE" ] && ViashError Bad arguments for option \'--exclude_antibody_capture\': \'$VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE=true + shift 1 + ;; + --learning_rate) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --learning_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --learning_rate=*) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate=*\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cuda) + [ -n "$VIASH_PAR_CUDA" ] && ViashError Bad arguments for option \'--cuda\': \'$VIASH_PAR_CUDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUDA=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/correction/cellbender_remove_background_v0_2:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then + VIASH_PAR_LAYER_OUTPUT="corrected" +fi +if [ -z ${VIASH_PAR_OBS_LATENT_RT_EFFICIENCY+x} ]; then + VIASH_PAR_OBS_LATENT_RT_EFFICIENCY="latent_rt_efficiency" +fi +if [ -z ${VIASH_PAR_OBS_LATENT_CELL_PROBABILITY+x} ]; then + VIASH_PAR_OBS_LATENT_CELL_PROBABILITY="latent_cell_probability" +fi +if [ -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then + VIASH_PAR_OBS_LATENT_SCALE="latent_scale" +fi +if [ -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then + VIASH_PAR_VAR_AMBIENT_EXPRESSION="ambient_expression" +fi +if [ -z ${VIASH_PAR_OBSM_LATENT_GENE_ENCODING+x} ]; then + VIASH_PAR_OBSM_LATENT_GENE_ENCODING="cellbender_latent_gene_encoding" +fi +if [ -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then + VIASH_PAR_EXPECTED_CELLS_FROM_QC="true" +fi +if [ -z ${VIASH_PAR_MODEL+x} ]; then + VIASH_PAR_MODEL="full" +fi +if [ -z ${VIASH_PAR_EPOCHS+x} ]; then + VIASH_PAR_EPOCHS="150" +fi +if [ -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then + VIASH_PAR_LOW_COUNT_THRESHOLD="15" +fi +if [ -z ${VIASH_PAR_Z_DIM+x} ]; then + VIASH_PAR_Z_DIM="100" +fi +if [ -z ${VIASH_PAR_Z_LAYERS+x} ]; then + VIASH_PAR_Z_LAYERS="500" +fi +if [ -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then + VIASH_PAR_TRAINING_FRACTION="0.9" +fi +if [ -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then + VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION="0.5" +fi +if [ -z ${VIASH_PAR_FPR+x} ]; then + VIASH_PAR_FPR="0.01" +fi +if [ -z ${VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE+x} ]; then + VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE="false" +fi +if [ -z ${VIASH_PAR_CUDA+x} ]; then + VIASH_PAR_CUDA="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EXPECTED_CELLS" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--expected_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" ]]; then + if ! [[ "$VIASH_PAR_TOTAL_DROPLETS_INCLUDED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--total_droplets_included' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_CELLS_FROM_QC" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--expected_cells_from_qc' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOW_COUNT_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_LOW_COUNT_THRESHOLD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--low_count_threshold' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_Z_DIM" ]]; then + if ! [[ "$VIASH_PAR_Z_DIM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--z_dim' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_Z_LAYERS" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_Z_LAYERS; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--z_layers' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_TRAINING_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_TRAINING_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--training_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--empty_drop_training_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_FPR" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_FPR; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--fpr' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE" ]]; then + if ! [[ "$VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--exclude_antibody_capture' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEARNING_RATE" ]]; then + if ! [[ "$VIASH_PAR_LEARNING_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--learning_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CUDA" ]]; then + if ! [[ "$VIASH_PAR_CUDA" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--cuda' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL_CHOICES=("simple;ambient;swapping;full") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODEL_CHOICES[*]};" =~ ";$VIASH_PAR_MODEL;" ]]; then + ViashError '--model' specified value of \'$VIASH_PAR_MODEL\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellbender_remove_background_v0_2-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_output': $( if [ ! -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_LAYER_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_latent_rt_efficiency': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_RT_EFFICIENCY+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_RT_EFFICIENCY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_latent_cell_probability': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_CELL_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_CELL_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_latent_scale': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_SCALE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_ambient_expression': $( if [ ! -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then echo "r'${VIASH_PAR_VAR_AMBIENT_EXPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_latent_gene_encoding': $( if [ ! -z ${VIASH_PAR_OBSM_LATENT_GENE_ENCODING+x} ]; then echo "r'${VIASH_PAR_OBSM_LATENT_GENE_ENCODING//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'expected_cells': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'total_droplets_included': $( if [ ! -z ${VIASH_PAR_TOTAL_DROPLETS_INCLUDED+x} ]; then echo "int(r'${VIASH_PAR_TOTAL_DROPLETS_INCLUDED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'expected_cells_from_qc': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then echo "r'${VIASH_PAR_EXPECTED_CELLS_FROM_QC//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'low_count_threshold': $( if [ ! -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then echo "int(r'${VIASH_PAR_LOW_COUNT_THRESHOLD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'z_dim': $( if [ ! -z ${VIASH_PAR_Z_DIM+x} ]; then echo "int(r'${VIASH_PAR_Z_DIM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'z_layers': $( if [ ! -z ${VIASH_PAR_Z_LAYERS+x} ]; then echo "list(map(int, r'${VIASH_PAR_Z_LAYERS//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'training_fraction': $( if [ ! -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_TRAINING_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'empty_drop_training_fraction': $( if [ ! -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'fpr': $( if [ ! -z ${VIASH_PAR_FPR+x} ]; then echo "list(map(float, r'${VIASH_PAR_FPR//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'exclude_antibody_capture': $( if [ ! -z ${VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cuda': $( if [ ! -z ${VIASH_PAR_CUDA+x} ]; then echo "r'${VIASH_PAR_CUDA//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +from helper import anndata_from_h5 + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# with pathlib.Path(meta["temp_dir"]) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + ] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--cuda", "cuda", False), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--exclude-antibody-capture", "exclude_antibody_capture", False), + ("--learning-rate", "learning_rate", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + # have to use custom read_10x_h5 function for now + # will be fixed when https://github.com/scverse/scanpy/pull/2344 is merged + # adata_out = sc.read_10x_h5(output_file, gex_only=False) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_latent_rt_efficiency": "latent_RT_efficiency", + "obs_latent_cell_probability": "latent_cell_probability", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_latent_gene_encoding output to MuData") + obsm_store = {"obsm_latent_gene_encoding": "latent_gene_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/correction/cellbender_remove_background_v0_2/helper.py b/target/executable/correction/cellbender_remove_background_v0_2/helper.py new file mode 100644 index 00000000..55e0cd36 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background_v0_2/helper.py @@ -0,0 +1,166 @@ +# This file is copied from https://github.com/broadinstitute/CellBender/issues/128#issuecomment-1175336065 +# to solve an issue with scanpy not being able to read in the 10x h5 files produced by cellbender. +# +# Note: If something doesn't work in this helper function, it may be interesting to +# take a look at the comments by Dries: https://github.com/openpipelines-bio/openpipeline/pull/115 +# I'm not going to apply them for now -- if it ain't broke, don't fix it. +import tables +import numpy as np +import scipy.sparse as sp +import anndata +from typing import Dict + + +def anndata_from_h5( + file: str, analyzed_barcodes_only: bool = True +) -> "anndata.AnnData": + """Load an output h5 file into an AnnData object for downstream work. + + Args: + file: The h5 file + analyzed_barcodes_only: False to load all barcodes, so that the size of + the AnnData object will match the size of the input raw count matrix. + True to load a limited set of barcodes: only those analyzed by the + algorithm. This allows relevant latent variables to be loaded + properly into adata.obs and adata.obsm, rather than adata.uns. + + Returns: + adata: The anndata object, populated with inferred latent variables + and metadata. + + """ + + d = dict_from_h5(file) + X = ( + sp.csc_matrix( + (d.pop("data"), d.pop("indices"), d.pop("indptr")), shape=d.pop("shape") + ) + .transpose() + .tocsr() + ) + + # check and see if we have barcode index annotations, and if the file is filtered + barcode_key = [k for k in d.keys() if (("barcode" in k) and ("ind" in k))] + if len(barcode_key) > 0: + max_barcode_ind = d[barcode_key[0]].max() + filtered_file = max_barcode_ind >= X.shape[0] + else: + filtered_file = True + + if analyzed_barcodes_only: + if filtered_file: + # filtered file being read, so we don't need to subset + print('Assuming we are loading a "filtered" file that contains only cells.') + pass + elif "barcode_indices_for_latents" in d.keys(): + X = X[d["barcode_indices_for_latents"], :] + d["barcodes"] = d["barcodes"][d["barcode_indices_for_latents"]] + elif "barcodes_analyzed_inds" in d.keys(): + X = X[d["barcodes_analyzed_inds"], :] + d["barcodes"] = d["barcodes"][d["barcodes_analyzed_inds"]] + else: + print( + "Warning: analyzed_barcodes_only=True, but the key " + '"barcodes_analyzed_inds" or "barcode_indices_for_latents" ' + "is missing from the h5 file. " + "Will output all barcodes, and proceed as if " + "analyzed_barcodes_only=False" + ) + + # Construct the anndata object. + adata = anndata.AnnData( + X=X, + obs={"barcode": d.pop("barcodes").astype(str)}, + var={ + "gene_name": ( + d.pop("gene_names") if "gene_names" in d.keys() else d.pop("name") + ).astype(str) + }, + dtype=X.dtype, + ) + adata.obs.set_index("barcode", inplace=True) + adata.var.set_index("gene_name", inplace=True) + + # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this + if "genes" in d.keys(): + d["id"] = d.pop("genes") + + # For purely aesthetic purposes, rename "id" to "gene_id" + if "id" in d.keys(): + d["gene_id"] = d.pop("id") + + # If genomes are empty, try to guess them based on gene_id + if "genome" in d.keys(): + if np.array([s.decode() == "" for s in d["genome"]]).all(): + if "_" in d["gene_id"][0].decode(): + print( + "Genome field blank, so attempting to guess genomes based on gene_id prefixes" + ) + d["genome"] = np.array( + [s.decode().split("_")[0] for s in d["gene_id"]], dtype=str + ) + + # Add other information to the anndata object in the appropriate slot. + _fill_adata_slots_automatically(adata, d) + + # Add a special additional field to .var if it exists. + if "features_analyzed_inds" in adata.uns.keys(): + adata.var["cellbender_analyzed"] = [ + True if (i in adata.uns["features_analyzed_inds"]) else False + for i in range(adata.shape[1]) + ] + + if analyzed_barcodes_only: + for col in adata.obs.columns[ + adata.obs.columns.str.startswith("barcodes_analyzed") + | adata.obs.columns.str.startswith("barcode_indices") + ]: + try: + del adata.obs[col] + except Exception: + pass + else: + # Add a special additional field to .obs if all barcodes are included. + if "barcodes_analyzed_inds" in adata.uns.keys(): + adata.obs["cellbender_analyzed"] = [ + True if (i in adata.uns["barcodes_analyzed_inds"]) else False + for i in range(adata.shape[0]) + ] + + return adata + + +def dict_from_h5(file: str) -> Dict[str, np.ndarray]: + """Read in everything from an h5 file and put into a dictionary.""" + d = {} + with tables.open_file(file) as f: + # read in everything + for array in f.walk_nodes("/", "Array"): + d[array.name] = array.read() + return d + + +def _fill_adata_slots_automatically(adata, d): + """Add other information to the adata object in the appropriate slot.""" + + for key, value in d.items(): + try: + if value is None: + continue + value = np.asarray(value) + if len(value.shape) == 0: + adata.uns[key] = value + elif value.shape[0] == adata.shape[0]: + if (len(value.shape) < 2) or (value.shape[1] < 2): + adata.obs[key] = value + else: + adata.obsm[key] = value + elif value.shape[0] == adata.shape[1]: + if value.dtype.name.startswith("bytes"): + adata.var[key] = value.astype(str) + else: + adata.var[key] = value + else: + adata.uns[key] = value + except Exception: + print("Unable to load data into AnnData: ", key, value, type(value)) diff --git a/target/executable/correction/cellbender_remove_background_v0_2/nextflow_labels.config b/target/executable/correction/cellbender_remove_background_v0_2/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background_v0_2/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/correction/cellbender_remove_background_v0_2/setup_logger.py b/target/executable/correction/cellbender_remove_background_v0_2/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/correction/cellbender_remove_background_v0_2/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/create_pseudobulk/differential_expression/.config.vsh.yaml b/target/executable/create_pseudobulk/differential_expression/.config.vsh.yaml new file mode 100644 index 00000000..cb7ddf54 --- /dev/null +++ b/target/executable/create_pseudobulk/differential_expression/.config.vsh.yaml @@ -0,0 +1,365 @@ +name: "differential_expression" +namespace: "create_pseudobulk" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used. This layer must contain\ + \ raw counts." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_label" + description: ".obs field containing the variable to group on. Typically this field\ + \ contains cell groups such as annotated cell types or clusters." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_groups" + description: ".obs fields containing the experimental condition(s) to create pseudobulk\ + \ samples for." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--aggregation_method" + description: "Method to aggregate the raw counts for pseudoreplicates. Either\ + \ sum or mean." + info: null + default: + - "sum" + required: false + choices: + - "sum" + - "mean" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_obs_per_sample" + description: "Minimum number of cells per pseudobulk sample." + info: null + default: + - 30 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed for sampling.\n" + info: null + default: + - 0 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_count" + description: "The .obs field to store the number of observations per pseudobulk\ + \ sample." + info: null + default: + - "n_cells" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file containing the aggregated pseudobulk samples." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generation of pseudobulk samples from single-cell transcriptomics data,\n\ + by aggregating raw gene expression counts from individual cells to create \nbulk-like\ + \ expression profiles suitable for differential expression analysis\nwith methods\ + \ designed for bulk differential expression analysis.\nNote that this componentonly\ + \ considers factors as explanatory variables, \nand excludes covariates from the\ + \ analysis.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/differential_expression/create_pseudobulk/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/create_pseudobulk/differential_expression" + executable: "target/executable/create_pseudobulk/differential_expression/differential_expression" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/create_pseudobulk/differential_expression/differential_expression b/target/executable/create_pseudobulk/differential_expression/differential_expression new file mode 100755 index 00000000..a654c2d2 --- /dev/null +++ b/target/executable/create_pseudobulk/differential_expression/differential_expression @@ -0,0 +1,1448 @@ +#!/usr/bin/env bash + +# differential_expression main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author) +# * Jakub Majercik (author) +# * Dries De Maeyer (contributor) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="differential_expression" +VIASH_META_FUNCTIONALITY_NAME="differential_expression" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen, Jakub Majercik, Dries De Maeyer, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component create_pseudobulk differential_expression" +LABEL org.opencontainers.image.created="2025-08-22T07:23:23Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "differential_expression main" + echo "" + echo "Generation of pseudobulk samples from single-cell transcriptomics data," + echo "by aggregating raw gene expression counts from individual cells to create" + echo "bulk-like expression profiles suitable for differential expression analysis" + echo "with methods designed for bulk differential expression analysis." + echo "Note that this componentonly considers factors as explanatory variables," + echo "and excludes covariates from the analysis." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. If None, X is used. This layer must contain raw" + echo " counts." + echo "" + echo " --obs_label" + echo " type: string, required parameter" + echo " .obs field containing the variable to group on. Typically this field" + echo " contains cell groups such as annotated cell types or clusters." + echo "" + echo " --obs_groups" + echo " type: string, multiple values allowed" + echo " .obs fields containing the experimental condition(s) to create" + echo " pseudobulk samples for." + echo "" + echo "Arguments:" + echo " --aggregation_method" + echo " type: string" + echo " default: sum" + echo " choices: [ sum, mean ]" + echo " Method to aggregate the raw counts for pseudoreplicates. Either sum or" + echo " mean." + echo "" + echo " --min_obs_per_sample" + echo " type: integer" + echo " default: 30" + echo " min: 1" + echo " Minimum number of cells per pseudobulk sample." + echo "" + echo " --random_state" + echo " type: integer" + echo " default: 0" + echo " min: 0" + echo " The random seed for sampling." + echo "" + echo " --obs_cell_count" + echo " type: string" + echo " default: n_cells" + echo " The .obs field to store the number of observations per pseudobulk" + echo " sample." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file containing the aggregated pseudobulk samples." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " The compression format to be used on the output h5mu object." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "differential_expression main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_label) + [ -n "$VIASH_PAR_OBS_LABEL" ] && ViashError Bad arguments for option \'--obs_label\': \'$VIASH_PAR_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_label=*) + [ -n "$VIASH_PAR_OBS_LABEL" ] && ViashError Bad arguments for option \'--obs_label=*\': \'$VIASH_PAR_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_groups) + if [ -z "$VIASH_PAR_OBS_GROUPS" ]; then + VIASH_PAR_OBS_GROUPS="$2" + else + VIASH_PAR_OBS_GROUPS="$VIASH_PAR_OBS_GROUPS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_groups. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_groups=*) + if [ -z "$VIASH_PAR_OBS_GROUPS" ]; then + VIASH_PAR_OBS_GROUPS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_GROUPS="$VIASH_PAR_OBS_GROUPS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --aggregation_method) + [ -n "$VIASH_PAR_AGGREGATION_METHOD" ] && ViashError Bad arguments for option \'--aggregation_method\': \'$VIASH_PAR_AGGREGATION_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AGGREGATION_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --aggregation_method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --aggregation_method=*) + [ -n "$VIASH_PAR_AGGREGATION_METHOD" ] && ViashError Bad arguments for option \'--aggregation_method=*\': \'$VIASH_PAR_AGGREGATION_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AGGREGATION_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_obs_per_sample) + [ -n "$VIASH_PAR_MIN_OBS_PER_SAMPLE" ] && ViashError Bad arguments for option \'--min_obs_per_sample\': \'$VIASH_PAR_MIN_OBS_PER_SAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_OBS_PER_SAMPLE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_obs_per_sample. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_obs_per_sample=*) + [ -n "$VIASH_PAR_MIN_OBS_PER_SAMPLE" ] && ViashError Bad arguments for option \'--min_obs_per_sample=*\': \'$VIASH_PAR_MIN_OBS_PER_SAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_OBS_PER_SAMPLE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --random_state) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --random_state. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --random_state=*) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state=*\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_cell_count) + [ -n "$VIASH_PAR_OBS_CELL_COUNT" ] && ViashError Bad arguments for option \'--obs_cell_count\': \'$VIASH_PAR_OBS_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_cell_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_cell_count=*) + [ -n "$VIASH_PAR_OBS_CELL_COUNT" ] && ViashError Bad arguments for option \'--obs_cell_count=*\': \'$VIASH_PAR_OBS_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_CELL_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/create_pseudobulk/differential_expression:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_LABEL+x} ]; then + ViashError '--obs_label' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_AGGREGATION_METHOD+x} ]; then + VIASH_PAR_AGGREGATION_METHOD="sum" +fi +if [ -z ${VIASH_PAR_MIN_OBS_PER_SAMPLE+x} ]; then + VIASH_PAR_MIN_OBS_PER_SAMPLE="30" +fi +if [ -z ${VIASH_PAR_RANDOM_STATE+x} ]; then + VIASH_PAR_RANDOM_STATE="0" +fi +if [ -z ${VIASH_PAR_OBS_CELL_COUNT+x} ]; then + VIASH_PAR_OBS_CELL_COUNT="n_cells" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_OBS_PER_SAMPLE" ]]; then + if ! [[ "$VIASH_PAR_MIN_OBS_PER_SAMPLE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_obs_per_sample' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MIN_OBS_PER_SAMPLE -lt 1 ]]; then + ViashError '--min_obs_per_sample' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RANDOM_STATE" ]]; then + if ! [[ "$VIASH_PAR_RANDOM_STATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--random_state' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_RANDOM_STATE -lt 0 ]]; then + ViashError '--random_state' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_AGGREGATION_METHOD" ]; then + VIASH_PAR_AGGREGATION_METHOD_CHOICES=("sum;mean") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_AGGREGATION_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_AGGREGATION_METHOD;" ]]; then + ViashError '--aggregation_method' specified value of \'$VIASH_PAR_AGGREGATION_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-differential_expression-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import numpy as np +import mudata as mu +import pandas as pd +import sys +import scanpy as sc +import scipy.sparse as sp + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_label': $( if [ ! -z ${VIASH_PAR_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_groups': $( if [ ! -z ${VIASH_PAR_OBS_GROUPS+x} ]; then echo "r'${VIASH_PAR_OBS_GROUPS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'aggregation_method': $( if [ ! -z ${VIASH_PAR_AGGREGATION_METHOD+x} ]; then echo "r'${VIASH_PAR_AGGREGATION_METHOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_obs_per_sample': $( if [ ! -z ${VIASH_PAR_MIN_OBS_PER_SAMPLE+x} ]; then echo "int(r'${VIASH_PAR_MIN_OBS_PER_SAMPLE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'obs_cell_count': $( if [ ! -z ${VIASH_PAR_OBS_CELL_COUNT+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_COUNT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def is_normalized(layer): + if sp.issparse(layer): + row_sums = np.array(layer.sum(axis=1)).flatten() + else: + row_sums = layer.sum(axis=1) + + return np.allclose(row_sums, 1) + + +def count_obs(adata, pb_adata, obs_cols): + counts = [] + for i in range(pb_adata.n_obs): + values = pb_adata.obs.iloc[i][obs_cols] + + mask = pd.Series([True] * adata.n_obs) + for col in obs_cols: + mask &= (adata.obs[col] == values[col]).values + + count = mask.sum() + counts.append(count) + + return counts + + +def main(): + # Read in data + logger.info(f"Reading input data {par['input']}...") + adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy() + + # Make sure .X contains raw counts + if par["input_layer"]: + adata.X = adata.layers[par["input_layer"]] + if is_normalized(adata.X): + raise ValueError("Input layer must contain raw counts.") + + # Sanitize pseudobulk aggregation fields + pseudobulk_cols = [par["obs_label"]] + if par["obs_groups"]: + pseudobulk_cols += par["obs_groups"] + + for col in pseudobulk_cols: + if col not in adata.obs.columns: + raise ValueError(f"Required column '{col}' not found in .obs.") + adata.obs[col] = ( + adata.obs[col] + .astype(str) + .str.replace(" ", "_") + .str.replace("+", "") + .astype("category") + ) + + # Aggregate pseudobulk data per cell group + logger.info("Creating pseudobulk samples...") + adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0) + adata_pb.X = adata_pb.layers["sum"] + del adata_pb.layers["sum"] + + adata_pb.obs_names = [ + "_".join(values) + for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols]) + ] + + # Filter pseudobulk samples based on minimum observation count + logger.info("Filtering pseudobulk samples based on minimum observation count...") + adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) + adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] + + logger.info( + f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" + ) + + logger.info("Writing output data...") + mdata_pb = mu.MuData({"rna": adata_pb}) + mdata_pb.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/create_pseudobulk/differential_expression/nextflow_labels.config b/target/executable/create_pseudobulk/differential_expression/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/create_pseudobulk/differential_expression/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/create_pseudobulk/differential_expression/setup_logger.py b/target/executable/create_pseudobulk/differential_expression/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/create_pseudobulk/differential_expression/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dataflow/concatenate_h5mu/.config.vsh.yaml b/target/executable/dataflow/concatenate_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..2cf39d32 --- /dev/null +++ b/target/executable/dataflow/concatenate_h5mu/.config.vsh.yaml @@ -0,0 +1,337 @@ +name: "concatenate_h5mu" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Paths to the different samples to be concatenated." + info: null + example: + - "sample_paths" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Only output concatenated objects for the provided modalities. Outputs\ + \ all modalities by default." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "Names of the different samples that have to be concatenated. Must\ + \ be specified when using '--mode move'.\nIn this case, the ids will be used\ + \ for the columns names of the dataframes registring the conflicts.\nIf specified,\ + \ must be of same length as `--input`.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output location for the concatenated MuData object file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_sample_name" + description: "Name of the .obs key under which to add the sample names." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--other_axis_mode" + description: "How to handle the merging of other axis (var, obs, ...).\n\n -\ + \ None: keep no data\n - same: only keep elements of the matrices which are\ + \ the same in each of the samples\n - unique: only keep elements for which\ + \ there is only 1 possible value (1 value that can occur in multiple samples)\n\ + \ - first: keep the annotation from the first sample\n - only: keep elements\ + \ that show up in only one of the objects (1 unique element in only 1 sample)\n\ + \ - move: identical to 'same', but moving the conflicting values to .varm or\ + \ .obsm\n" + info: null + default: + - "move" + required: false + choices: + - "same" + - "unique" + - "first" + - "only" + - "concat" + - "move" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_merge_mode" + description: "How to handle the merging of .uns across modalities\n - None: keep\ + \ no data\n - same: only keep elements of the matrices which are the same in\ + \ each of the samples\n - unique: only keep elements for which there is only\ + \ 1 possible value (1 value that can occur in multiple samples)\n - first:\ + \ keep the annotation from the first sample\n - only: keep elements that show\ + \ up in only one of the objects (1 unique element in only 1 sample)\n - make_unique:\ + \ identical to 'unique', but keys which are not unique are made unique by prefixing\ + \ them with the sample id.\n" + info: null + default: + - "make_unique" + required: false + choices: + - "same" + - "unique" + - "first" + - "only" + - "make_unique" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Concatenate observations from samples in several (uni- and/or multi-modal)\ + \ MuData files into a single file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +- type: "file" + path: "human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "highmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "pandas~=2.1.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/concatenate_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dataflow/concatenate_h5mu" + executable: "target/executable/dataflow/concatenate_h5mu/concatenate_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dataflow/concatenate_h5mu/compress_h5mu.py b/target/executable/dataflow/concatenate_h5mu/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dataflow/concatenate_h5mu/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dataflow/concatenate_h5mu/concatenate_h5mu b/target/executable/dataflow/concatenate_h5mu/concatenate_h5mu new file mode 100755 index 00000000..2c649810 --- /dev/null +++ b/target/executable/dataflow/concatenate_h5mu/concatenate_h5mu @@ -0,0 +1,1730 @@ +#!/usr/bin/env bash + +# concatenate_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="concatenate_h5mu" +VIASH_META_FUNCTIONALITY_NAME="concatenate_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "pandas~=2.1.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component dataflow concatenate_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "concatenate_h5mu main" + echo "" + echo "Concatenate observations from samples in several (uni- and/or multi-modal)" + echo "MuData files into a single file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: sample_paths" + echo " Paths to the different samples to be concatenated." + echo "" + echo " --modality" + echo " type: string, multiple values allowed" + echo " Only output concatenated objects for the provided modalities. Outputs" + echo " all modalities by default." + echo "" + echo " --input_id" + echo " type: string, multiple values allowed" + echo " Names of the different samples that have to be concatenated. Must be" + echo " specified when using '--mode move'." + echo " In this case, the ids will be used for the columns names of the" + echo " dataframes registring the conflicts." + echo " If specified, must be of same length as \`--input\`." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output location for the concatenated MuData object file." + echo "" + echo " --obs_sample_name" + echo " type: string" + echo " default: sample_id" + echo " Name of the .obs key under which to add the sample names." + echo "" + echo " --other_axis_mode" + echo " type: string" + echo " default: move" + echo " choices: [ same, unique, first, only, concat, move ]" + echo " How to handle the merging of other axis (var, obs, ...)." + echo " - None: keep no data" + echo " - same: only keep elements of the matrices which are the same in each" + echo " of the samples" + echo " - unique: only keep elements for which there is only 1 possible value" + echo " (1 value that can occur in multiple samples)" + echo " - first: keep the annotation from the first sample" + echo " - only: keep elements that show up in only one of the objects (1" + echo " unique element in only 1 sample)" + echo " - move: identical to 'same', but moving the conflicting values to" + echo " .varm or .obsm" + echo "" + echo " --uns_merge_mode" + echo " type: string" + echo " default: make_unique" + echo " choices: [ same, unique, first, only, make_unique ]" + echo " How to handle the merging of .uns across modalities" + echo " - None: keep no data" + echo " - same: only keep elements of the matrices which are the same in each" + echo " of the samples" + echo " - unique: only keep elements for which there is only 1 possible value" + echo " (1 value that can occur in multiple samples)" + echo " - first: keep the annotation from the first sample" + echo " - only: keep elements that show up in only one of the objects (1" + echo " unique element in only 1 sample)" + echo " - make_unique: identical to 'unique', but keys which are not unique" + echo " are made unique by prefixing them with the sample id." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "concatenate_h5mu main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY="$2" + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_id) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID="$2" + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id=*) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_sample_name) + [ -n "$VIASH_PAR_OBS_SAMPLE_NAME" ] && ViashError Bad arguments for option \'--obs_sample_name\': \'$VIASH_PAR_OBS_SAMPLE_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SAMPLE_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_sample_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_sample_name=*) + [ -n "$VIASH_PAR_OBS_SAMPLE_NAME" ] && ViashError Bad arguments for option \'--obs_sample_name=*\': \'$VIASH_PAR_OBS_SAMPLE_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SAMPLE_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --other_axis_mode) + [ -n "$VIASH_PAR_OTHER_AXIS_MODE" ] && ViashError Bad arguments for option \'--other_axis_mode\': \'$VIASH_PAR_OTHER_AXIS_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OTHER_AXIS_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --other_axis_mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --other_axis_mode=*) + [ -n "$VIASH_PAR_OTHER_AXIS_MODE" ] && ViashError Bad arguments for option \'--other_axis_mode=*\': \'$VIASH_PAR_OTHER_AXIS_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OTHER_AXIS_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_merge_mode) + [ -n "$VIASH_PAR_UNS_MERGE_MODE" ] && ViashError Bad arguments for option \'--uns_merge_mode\': \'$VIASH_PAR_UNS_MERGE_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_MERGE_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_merge_mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_merge_mode=*) + [ -n "$VIASH_PAR_UNS_MERGE_MODE" ] && ViashError Bad arguments for option \'--uns_merge_mode=*\': \'$VIASH_PAR_UNS_MERGE_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_MERGE_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dataflow/concatenate_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OBS_SAMPLE_NAME+x} ]; then + VIASH_PAR_OBS_SAMPLE_NAME="sample_id" +fi +if [ -z ${VIASH_PAR_OTHER_AXIS_MODE+x} ]; then + VIASH_PAR_OTHER_AXIS_MODE="move" +fi +if [ -z ${VIASH_PAR_UNS_MERGE_MODE+x} ]; then + VIASH_PAR_UNS_MERGE_MODE="make_unique" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OTHER_AXIS_MODE" ]; then + VIASH_PAR_OTHER_AXIS_MODE_CHOICES=("same;unique;first;only;concat;move") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OTHER_AXIS_MODE_CHOICES[*]};" =~ ";$VIASH_PAR_OTHER_AXIS_MODE;" ]]; then + ViashError '--other_axis_mode' specified value of \'$VIASH_PAR_OTHER_AXIS_MODE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_UNS_MERGE_MODE" ]; then + VIASH_PAR_UNS_MERGE_MODE_CHOICES=("same;unique;first;only;make_unique") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_UNS_MERGE_MODE_CHOICES[*]};" =~ ";$VIASH_PAR_UNS_MERGE_MODE;" ]]; then + ViashError '--uns_merge_mode' specified value of \'$VIASH_PAR_UNS_MERGE_MODE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-concatenate_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from __future__ import annotations +import sys +import anndata +import mudata as mu +import pandas as pd +import numpy as np +from collections.abc import Iterable +from multiprocessing import Pool +from pathlib import Path +from h5py import File as H5File +from typing import Literal +import shutil + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_sample_name': $( if [ ! -z ${VIASH_PAR_OBS_SAMPLE_NAME+x} ]; then echo "r'${VIASH_PAR_OBS_SAMPLE_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'other_axis_mode': $( if [ ! -z ${VIASH_PAR_OTHER_AXIS_MODE+x} ]; then echo "r'${VIASH_PAR_OTHER_AXIS_MODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_merge_mode': $( if [ ! -z ${VIASH_PAR_UNS_MERGE_MODE+x} ]; then echo "r'${VIASH_PAR_UNS_MERGE_MODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu +from setup_logger import setup_logger + +logger = setup_logger() + + +def nunique(row): + unique = pd.unique(row) + unique_without_na = pd.core.dtypes.missing.remove_na_arraylike(unique) + return len(unique_without_na) > 1 + + +def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) -> bool: + """ + Check if any row contains duplicate values, that are not NA. + """ + numpy_array = frame.to_numpy() + with Pool(n_processes) as pool: + is_duplicated = pool.map(nunique, iter(numpy_array)) + return any(is_duplicated) + + +def concatenate_matrices( + n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index +) -> tuple[ + dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype] +]: + """ + Merge matrices by combining columns that have the same name. + Columns that contain conflicting values (e.i. the columns have different values), + are not merged, but instead moved to a new dataframe. + """ + column_names = set(column_name for var in matrices.values() for column_name in var) + logger.debug("Trying to concatenate columns: %s.", ",".join(column_names)) + if not column_names: + return {}, pd.DataFrame(index=align_to) + conflicts, concatenated_matrix = split_conflicts_and_concatenated_columns( + n_processes, matrices, column_names, align_to + ) + concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix) + conflicts = { + conflict_name: cast_to_writeable_dtype(conflict_df) + for conflict_name, conflict_df in conflicts.items() + } + return conflicts, concatenated_matrix + + +def get_first_non_na_value_vector(df): + numpy_arr = df.to_numpy() + n_rows, n_cols = numpy_arr.shape + col_index = pd.isna(numpy_arr).argmin(axis=1) + flat_index = n_cols * np.arange(n_rows) + col_index + return pd.Series(numpy_arr.ravel()[flat_index], index=df.index, name=df.columns[0]) + + +def make_uns_keys_unique(mod_data, concatenated_data): + """ + Check if the uns keys across samples are unique before adding them + to the final concatenated object. If a conflict occurs between the samples, + add the sample ID to make the key unique again. + """ + all_uns_keys = {} + for sample_id, mod in mod_data.items(): + for uns_key, _ in mod.uns.items(): + all_uns_keys.setdefault(uns_key, []).append(sample_id) + for uns_key, samples_ids in all_uns_keys.items(): + assert samples_ids + if len(samples_ids) == 1: + sample_id = samples_ids[0] + concatenated_data.uns[uns_key] = mod_data[sample_id].uns[uns_key] + else: + for sample_id in samples_ids: + concatenated_data.uns[f"{sample_id}_{uns_key}"] = mod_data[ + sample_id + ].uns[uns_key] + return concatenated_data + + +def split_conflicts_and_concatenated_columns( + n_processes: int, + matrices: dict[str, pd.DataFrame], + column_names: Iterable[str], + align_to: pd.Index, +) -> tuple[dict[str, pd.DataFrame], pd.DataFrame]: + """ + Retrieve columns with the same name from a list of dataframes which are + identical across all the frames (ignoring NA values). + Columns which are not the same are regarded as 'conflicts', + which are stored in seperate dataframes, one per columns + with the same name that store conflicting values. + """ + conflicts = {} + concatenated_matrix = [] + for column_name in column_names: + columns = { + input_id: var[column_name] + for input_id, var in matrices.items() + if column_name in var + } + assert columns, "Some columns should have been found." + concatenated_columns = pd.concat( + columns.values(), axis=1, join="outer", sort=False + ) + if any_row_contains_duplicate_values(n_processes, concatenated_columns): + concatenated_columns.columns = ( + columns.keys() + ) # Use the sample id as column name + concatenated_columns = concatenated_columns.reindex(align_to, copy=False) + conflicts[f"conflict_{column_name}"] = concatenated_columns + else: + unique_values = get_first_non_na_value_vector(concatenated_columns) + concatenated_matrix.append(unique_values) + if not concatenated_matrix: + return conflicts, pd.DataFrame(index=align_to) + concatenated_matrix = pd.concat( + concatenated_matrix, join="outer", axis=1, sort=False + ) + concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False) + return conflicts, concatenated_matrix + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. \`IntegerArray\` and \`BooleanArray\` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def split_conflicts_modalities( + n_processes: int, samples: dict[str, anndata.AnnData], output: anndata.AnnData +) -> anndata.AnnData: + """ + Merge .var and .obs matrices of the anndata objects. Columns are merged + when the values (excl NA) are the same in each of the matrices. + Conflicting columns are moved to a separate dataframe (one dataframe for each column, + containing all the corresponding column from each sample). + """ + matrices_to_parse = ("var", "obs") + for matrix_name in matrices_to_parse: + matrices = { + sample_id: getattr(sample, matrix_name) + for sample_id, sample in samples.items() + } + output_index = getattr(output, matrix_name).index + conflicts, concatenated_matrix = concatenate_matrices( + n_processes, matrices, output_index + ) + if concatenated_matrix.empty: + concatenated_matrix.index = output_index + + # Even though we did not touch the varm and obsm matrices that were already present, + # the joining of observations might have caused a dtype change in these matrices as well + # so these also need to be casted to a writable dtype... + for multidim_name, multidim_data in getattr(output, f"{matrix_name}m").items(): + new_data = ( + cast_to_writeable_dtype(multidim_data) + if isinstance(multidim_data, pd.DataFrame) + else multidim_data + ) + getattr(output, f"{matrix_name}m")[multidim_name] = new_data + + # Write the conflicts to the output + for conflict_name, conflict_data in conflicts.items(): + getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data + + # Set other annotation matrices in the output + setattr(output, matrix_name, concatenated_matrix) + + return output + + +def concatenate_modality( + n_processes: int, + mod: str | None, + input_files: Iterable[str | Path], + other_axis_mode: str, + uns_merge_mode: str, + input_ids: tuple[str], +) -> anndata.AnnData: + concat_modes = { + "move": "unique", + } + other_axis_mode_to_apply = concat_modes.get(other_axis_mode, other_axis_mode) + + uns_merge_modes = {"make_unique": None} + uns_merge_mode_to_apply = uns_merge_modes.get(uns_merge_mode, uns_merge_mode) + + mod_data = {} + mod_indices_combined = pd.Index([]) + for input_id, input_file in zip(input_ids, input_files): + if mod is not None: + try: + data = mu.read_h5ad(input_file, mod=mod) + mod_data[input_id] = data + mod_indices_combined = mod_indices_combined.append(data.obs.index) + except KeyError as e: # Modality does not exist for this sample, skip it + if ( + f"Unable to synchronously open object (object '{mod}' doesn't exist)" + not in str(e) + ): + raise e + pass + else: # When mod=None, process the 'global' h5mu state + with H5File(input_file, "r") as input_h5: + if "uns" in input_h5.keys(): + uns_data = anndata.experimental.read_elem(input_h5["uns"]) + if uns_data: + mod_data[input_id] = anndata.AnnData(uns=uns_data) + + if not mod_indices_combined.is_unique: + raise ValueError("Observations are not unique across samples.") + + if not mod_data: + return anndata.AnnData() + + concatenated_data = anndata.concat( + mod_data.values(), + join="outer", + merge=other_axis_mode_to_apply, + uns_merge=uns_merge_mode_to_apply, + ) + + if other_axis_mode == "move": + concatenated_data = split_conflicts_modalities( + n_processes, mod_data, concatenated_data + ) + + if uns_merge_mode == "make_unique": + concatenated_data = make_uns_keys_unique(mod_data, concatenated_data) + + return concatenated_data + + +def concatenate_modalities( + n_processes: int, + modalities: list[str], + input_files: Path | str, + other_axis_mode: str, + uns_merge_mode: str, + output_file: Path | str, + compression: Literal["gzip"] | Literal["lzf"], + input_ids: tuple[str] | None = None, +) -> None: + """ + Join the modalities together into a single multimodal sample. + """ + logger.info("Concatenating samples.") + output_file, input_files = ( + Path(output_file), + [Path(input_file) for input_file in input_files], + ) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + # Create empty mudata file + mdata = mu.MuData({modality: anndata.AnnData() for modality in modalities}) + mdata.write(output_file_uncompressed, compression=compression) + + # Use "None" for the global slots (not assigned to any modality) + for mod_name in modalities + [ + None, + ]: + new_mod = concatenate_modality( + n_processes, + mod_name, + input_files, + other_axis_mode, + uns_merge_mode, + input_ids, + ) + if mod_name is None: + if new_mod.uns: + with H5File(output_file_uncompressed, "r+") as open_h5mu_file: + anndata.experimental.write_elem( + open_h5mu_file, "uns", dict(new_mod.uns) + ) + continue + logger.info( + "Writing out modality '%s' to '%s' with compression '%s'.", + mod_name, + output_file_uncompressed, + compression, + ) + mu.write_h5ad(output_file_uncompressed, data=new_mod, mod=mod_name) + + if compression: + compress_h5mu(output_file_uncompressed, output_file, compression=compression) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + logger.info("Concatenation successful.") + + +def main() -> None: + # Get a list of all possible modalities + mods = set() + for path in par["input"]: + try: + with H5File(path, "r") as f_root: + mods = mods | set(f_root["mod"].keys()) + except OSError: + raise OSError(f"Failed to load {path}. Is it a valid h5 file?") + + input_ids = None + if par["input_id"]: + input_ids: tuple[str] = tuple(i.strip() for i in par["input_id"]) + if len(input_ids) != len(par["input"]): + raise ValueError( + "The number of sample names must match the number of sample files." + ) + + if len(set(input_ids)) != len(input_ids): + raise ValueError("The sample names should be unique.") + + logger.info("\\nConcatenating data from paths:\\n\\t%s", "\\n\\t".join(par["input"])) + + if par["other_axis_mode"] == "move" and not input_ids: + raise ValueError("--mode 'move' requires --input_ids.") + + n_processes = meta["cpus"] if meta["cpus"] else 1 + + if par["modality"]: + par["modality"] = set(par["modality"]) + if not par["modality"].issubset(mods): + mods_joined, input_mods_joined = ", ".join(mods), ", ".join(par["modality"]) + raise ValueError( + f"One of the modalities provided ({input_mods_joined}) is not available in the input data {mods_joined}" + ) + mods = par["modality"] + + concatenate_modalities( + n_processes, + list(mods), + par["input"], + par["other_axis_mode"], + par["uns_merge_mode"], + par["output"], + par["output_compression"], + input_ids=input_ids, + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dataflow/concatenate_h5mu/nextflow_labels.config b/target/executable/dataflow/concatenate_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dataflow/concatenate_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dataflow/concatenate_h5mu/setup_logger.py b/target/executable/dataflow/concatenate_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dataflow/concatenate_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dataflow/merge/.config.vsh.yaml b/target/executable/dataflow/merge/.config.vsh.yaml new file mode 100644 index 00000000..1c75e4f3 --- /dev/null +++ b/target/executable/dataflow/merge/.config.vsh.yaml @@ -0,0 +1,251 @@ +name: "merge" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Paths to the single-modality .h5mu files that need to be combined" + info: null + default: + - "sample_paths" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Path to the output file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Combine one or more single-modality .h5mu files together into one .h5mu\ + \ file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "highmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/merge/config.vsh.yml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dataflow/merge" + executable: "target/executable/dataflow/merge/merge" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dataflow/merge/merge b/target/executable/dataflow/merge/merge new file mode 100755 index 00000000..0c2acd53 --- /dev/null +++ b/target/executable/dataflow/merge/merge @@ -0,0 +1,1266 @@ +#!/usr/bin/env bash + +# merge main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="merge" +VIASH_META_FUNCTIONALITY_NAME="merge" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component dataflow merge" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "merge main" + echo "" + echo "Combine one or more single-modality .h5mu files together into one .h5mu file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " default: sample_paths" + echo " Paths to the single-modality .h5mu files that need to be combined" + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " default: output.h5mu" + echo " Path to the output file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " The compression format to be used on the output h5mu object." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "merge main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dataflow/merge:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="output.h5mu" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-merge-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from __future__ import annotations +import sys +import mudata as md +import pandas as pd +import numpy as np + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input files %s", ",".join(par["input"])) + input_samples = [md.read_h5mu(path) for path in par["input"]] + + logger.info("Merging into single object.") + sample_modalities = {} + for input_sample in input_samples: + for mod_name, mod_data in input_sample.mod.items(): + if mod_name in sample_modalities: + raise ValueError( + f"Modality '{mod_name}' was found in more than 1 sample." + ) + sample_modalities[mod_name] = mod_data + + merged = md.MuData(sample_modalities) + merged.update() + for df_attr in ("var", "obs"): + df = getattr(merged, df_attr) + df = df.replace({pd.NA: np.nan}, inplace=False) + + # MuData supports nullable booleans and ints + # ie. \`IntegerArray\` and \`BooleanArray\` + df = df.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # pd.convert_dtypes does not convert already existing nullable dtypes + # to their native numpy dtypes. + # At this point, integer and boolean columns use a nullable dtype; and string columns + # might use 'object' or pd.StringDtype. This is OK. + # However, for example floating number columns might still use a nullable dtype + # from before the call to convert_dtypes. So we need to explicitly cast. + col_dtypes = df.select_dtypes( + exclude=["integer", "string", "object", "boolean"] + ).dtypes.to_dict() + numpy_dtypes = { + col_name: np.dtype(col_dtype.kind) + for col_name, col_dtype in col_dtypes.items() + if pd.api.types.is_extension_array_dtype(col_dtype) + } + df = df.astype(numpy_dtypes) + + # Convert leftover 'object' columns to string + object_cols = df.select_dtypes(include="object").columns.values + for obj_col in object_cols: + df[obj_col] = df[obj_col].astype(str).astype("category") + + setattr(merged, df_attr, df) + + merged.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dataflow/merge/nextflow_labels.config b/target/executable/dataflow/merge/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dataflow/merge/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dataflow/merge/setup_logger.py b/target/executable/dataflow/merge/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dataflow/merge/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dataflow/split_h5mu/.config.vsh.yaml b/target/executable/dataflow/split_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..18fb2300 --- /dev/null +++ b/target/executable/dataflow/split_h5mu/.config.vsh.yaml @@ -0,0 +1,291 @@ +name: "split_h5mu" +namespace: "dataflow" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input & specifications" + arguments: + - type: "file" + name: "--input" + description: "Path to a single .h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_feature" + description: "The .obs column to split the mudata on." + info: null + example: + - "celltype" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--drop_obs_nan" + description: "Whether to drop all .obs columns that contain only nan values after\ + \ splitting." + info: null + direction: "input" + - type: "boolean_true" + name: "--ensure_unique_filenames" + description: "Append number suffixes to ensure unique filenames after sanitizing\ + \ obs feature values." + info: null + direction: "input" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_files" + description: "A csv containing the base filename and obs feature by which it was\ + \ split." + info: null + example: + - "sample_files.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split the samples of a single modality from a .h5mu (multimodal) sample\ + \ into seperate .h5mu files based on the values of an .obs column of this modality.\ + \ \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/split_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dataflow/split_h5mu" + executable: "target/executable/dataflow/split_h5mu/split_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dataflow/split_h5mu/nextflow_labels.config b/target/executable/dataflow/split_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dataflow/split_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dataflow/split_h5mu/setup_logger.py b/target/executable/dataflow/split_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dataflow/split_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dataflow/split_h5mu/split_h5mu b/target/executable/dataflow/split_h5mu/split_h5mu new file mode 100755 index 00000000..f6cdb519 --- /dev/null +++ b/target/executable/dataflow/split_h5mu/split_h5mu @@ -0,0 +1,1358 @@ +#!/usr/bin/env bash + +# split_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="split_h5mu" +VIASH_META_FUNCTIONALITY_NAME="split_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component dataflow split_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "split_h5mu main" + echo "" + echo "Split the samples of a single modality from a .h5mu (multimodal) sample into" + echo "seperate .h5mu files based on the values of an .obs column of this modality." + echo "" + echo "Input & specifications:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " Path to a single .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obs_feature" + echo " type: string, required parameter" + echo " example: celltype" + echo " The .obs column to split the mudata on." + echo "" + echo " --drop_obs_nan" + echo " type: boolean_true" + echo " Whether to drop all .obs columns that contain only nan values after" + echo " splitting." + echo "" + echo " --ensure_unique_filenames" + echo " type: boolean_true" + echo " Append number suffixes to ensure unique filenames after sanitizing obs" + echo " feature values." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/output" + echo " Output directory containing multiple h5mu files." + echo "" + echo " --output_files" + echo " type: file, required parameter, output, file must exist" + echo " example: sample_files.csv" + echo " A csv containing the base filename and obs feature by which it was" + echo " split." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "split_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_feature) + [ -n "$VIASH_PAR_OBS_FEATURE" ] && ViashError Bad arguments for option \'--obs_feature\': \'$VIASH_PAR_OBS_FEATURE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_FEATURE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_feature. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_feature=*) + [ -n "$VIASH_PAR_OBS_FEATURE" ] && ViashError Bad arguments for option \'--obs_feature=*\': \'$VIASH_PAR_OBS_FEATURE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_FEATURE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --drop_obs_nan) + [ -n "$VIASH_PAR_DROP_OBS_NAN" ] && ViashError Bad arguments for option \'--drop_obs_nan\': \'$VIASH_PAR_DROP_OBS_NAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DROP_OBS_NAN=true + shift 1 + ;; + --ensure_unique_filenames) + [ -n "$VIASH_PAR_ENSURE_UNIQUE_FILENAMES" ] && ViashError Bad arguments for option \'--ensure_unique_filenames\': \'$VIASH_PAR_ENSURE_UNIQUE_FILENAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ENSURE_UNIQUE_FILENAMES=true + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_files) + [ -n "$VIASH_PAR_OUTPUT_FILES" ] && ViashError Bad arguments for option \'--output_files\': \'$VIASH_PAR_OUTPUT_FILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FILES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_files. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_files=*) + [ -n "$VIASH_PAR_OUTPUT_FILES" ] && ViashError Bad arguments for option \'--output_files=*\': \'$VIASH_PAR_OUTPUT_FILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FILES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dataflow/split_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_FEATURE+x} ]; then + ViashError '--obs_feature' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_FILES+x} ]; then + ViashError '--output_files' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_DROP_OBS_NAN+x} ]; then + VIASH_PAR_DROP_OBS_NAN="false" +fi +if [ -z ${VIASH_PAR_ENSURE_UNIQUE_FILENAMES+x} ]; then + VIASH_PAR_ENSURE_UNIQUE_FILENAMES="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_DROP_OBS_NAN" ]]; then + if ! [[ "$VIASH_PAR_DROP_OBS_NAN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--drop_obs_nan' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ENSURE_UNIQUE_FILENAMES" ]]; then + if ! [[ "$VIASH_PAR_ENSURE_UNIQUE_FILENAMES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--ensure_unique_filenames' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_FILES" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_FILES")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_FILES")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_FILES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_FILES")" ) + VIASH_PAR_OUTPUT_FILES=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_FILES") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_FILES" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-split_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import pandas as pd +import re +import gc +from pathlib import Path +from collections import defaultdict + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_feature': $( if [ ! -z ${VIASH_PAR_OBS_FEATURE+x} ]; then echo "r'${VIASH_PAR_OBS_FEATURE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'drop_obs_nan': $( if [ ! -z ${VIASH_PAR_DROP_OBS_NAN+x} ]; then echo "r'${VIASH_PAR_DROP_OBS_NAN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'ensure_unique_filenames': $( if [ ! -z ${VIASH_PAR_ENSURE_UNIQUE_FILENAMES+x} ]; then echo "r'${VIASH_PAR_ENSURE_UNIQUE_FILENAMES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_files': $( if [ ! -z ${VIASH_PAR_OUTPUT_FILES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FILES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info(f"Reading {par['input']}") + input_file = Path(par["input"].strip()) + + mdata = mu.read_h5mu(input_file) + adata = mdata.mod[par["modality"]] + + logger.info(f"Reading unique features from {par['obs_feature']}") + obs_features = adata.obs[par["obs_feature"]].unique().tolist() + + # sanitize --obs_feature values + obs_features_s = [re.sub(r"[-\\s]", "_", str(s).strip()) for s in obs_features] + obs_features_s = [re.sub(r"[^A-Za-z0-9_]", "", s) for s in obs_features_s] + + # ensure that names are unique, if not raise or append number as suffix + if not len(obs_features_s) == len(set(obs_features_s)): + if not par["ensure_unique_filenames"]: + raise ValueError( + f"File names are not unique after sanitizing the --obs_feature {par['obs_feature']} values" + ) + + logger.info("Ensuring unique names for par['obs_feature']") + counts = defaultdict(lambda: -1) + for i, feature in enumerate(obs_features_s): + counts[feature] += 1 + if (curr_counts := counts[feature]) > 0: + obs_features_s[i] += f"_{curr_counts}" + + # generate output dir + output_dir = Path(par["output"]) + if not output_dir.is_dir(): + output_dir.mkdir(parents=True) + + # split modality of mdata file base on obs_feature + logger.info(f"Splitting file based on {par['obs_feature']} values {obs_features}") + obs_files = [] + + for obs_name, file_name in zip(obs_features, obs_features_s): + logger.info( + f"Filtering modality '{par['modality']}' observations by .obs['{par['obs_feature']}'] == {obs_name}" + ) + mdata_obs = mdata.copy() + adata_obs = mdata_obs.mod[par["modality"]] + + # split the samples + adata_obs = adata_obs[adata_obs.obs[par["obs_feature"]] == obs_name] + mdata_obs_name = f"{input_file.stem}_{file_name}.h5mu" + obs_files.append(mdata_obs_name) + + # Dropping columns that only have nan values after splitting + if par["drop_obs_nan"]: + logger.info("Dropping all .obs columns with NaN values") + adata_obs.obs.dropna(axis=1, how="all", inplace=True) + + # replace mdata file with modality adata contianing split samples + logger.info( + f"Writing h5mu filtered for {par['obs_feature']} {obs_name} to file {output_dir / mdata_obs_name}" + ) + mdata_obs.mod[par["modality"]] = adata_obs + mdata_obs.write_h5mu( + output_dir / mdata_obs_name, compression=par["output_compression"] + ) + + # avoid keeping files in memory + del mdata_obs + del adata_obs + gc.collect() + + logger.info(f"Writing output_files CSV file to {par['output_files']}") + df = pd.DataFrame({"name": obs_features_s, "filename": obs_files}) + df.to_csv(par["output_files"], index=False) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_FILES" ]; then + VIASH_PAR_OUTPUT_FILES=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_FILES") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_FILES" ] && [ ! -e "$VIASH_PAR_OUTPUT_FILES" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_FILES' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dataflow/split_modalities/.config.vsh.yaml b/target/executable/dataflow/split_modalities/.config.vsh.yaml new file mode 100644 index 00000000..82cb9768 --- /dev/null +++ b/target/executable/dataflow/split_modalities/.config.vsh.yaml @@ -0,0 +1,278 @@ +name: "split_modalities" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to a single .h5mu file." + info: null + default: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_types" + description: "A csv containing the base filename and modality type per output\ + \ file." + info: null + example: + - "types.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split the modalities from a single .h5mu multimodal sample into seperate\ + \ .h5mu files. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/split_modalities/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dataflow/split_modalities" + executable: "target/executable/dataflow/split_modalities/split_modalities" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dataflow/split_modalities/nextflow_labels.config b/target/executable/dataflow/split_modalities/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dataflow/split_modalities/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dataflow/split_modalities/setup_logger.py b/target/executable/dataflow/split_modalities/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dataflow/split_modalities/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dataflow/split_modalities/split_modalities b/target/executable/dataflow/split_modalities/split_modalities new file mode 100755 index 00000000..5023f4bc --- /dev/null +++ b/target/executable/dataflow/split_modalities/split_modalities @@ -0,0 +1,1253 @@ +#!/usr/bin/env bash + +# split_modalities main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Robrecht Cannoodt (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="split_modalities" +VIASH_META_FUNCTIONALITY_NAME="split_modalities" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component dataflow split_modalities" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "split_modalities main" + echo "" + echo "Split the modalities from a single .h5mu multimodal sample into seperate .h5mu" + echo "files." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " default: sample_path" + echo " Path to a single .h5mu file." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/output" + echo " Output directory containing multiple h5mu files." + echo "" + echo " --output_types" + echo " type: file, required parameter, output, file must exist" + echo " example: types.csv" + echo " A csv containing the base filename and modality type per output file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "split_modalities main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_types) + [ -n "$VIASH_PAR_OUTPUT_TYPES" ] && ViashError Bad arguments for option \'--output_types\': \'$VIASH_PAR_OUTPUT_TYPES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TYPES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_types. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_types=*) + [ -n "$VIASH_PAR_OUTPUT_TYPES" ] && ViashError Bad arguments for option \'--output_types=*\': \'$VIASH_PAR_OUTPUT_TYPES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_TYPES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dataflow/split_modalities:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_TYPES+x} ]; then + ViashError '--output_types' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TYPES" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_TYPES")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_TYPES")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TYPES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_TYPES")" ) + VIASH_PAR_OUTPUT_TYPES=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_TYPES") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_TYPES" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-split_modalities-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from __future__ import annotations +import sys +import mudata as md +from pathlib import Path +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_types': $( if [ ! -z ${VIASH_PAR_OUTPUT_TYPES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TYPES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main() -> None: + output_dir = Path(par["output"]) + logger.info("Creating output directory '%s' if it does not exist", output_dir) + if not output_dir.is_dir(): + logger.info("Creating %s", output_dir) + output_dir.mkdir(parents=True) + + logger.info("Reading input file '%s'", par["input"]) + input_file = Path(par["input"].strip()) + sample = md.read_h5mu(input_file) + + logger.info("Creating output types CSV.") + modalities = list(sample.mod.keys()) + + logger.info("Found the following modalities:\\n%s", "\\n".join(modalities)) + names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu" for mod_name in modalities} + output_files = list(names.values()) + logger.info( + "Will be creating the following output .h5mu files:\\n%s", + "\\n".join(output_files), + ) + df = pd.DataFrame({"name": modalities, "filename": output_files}) + logger.info("Writing output_types CSV file to '%s'.", par["output_types"]) + df.to_csv(par["output_types"], index=False) + + logger.info("Splitting input file into unimodal output files.") + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + new_sample = md.MuData({mod_name: mod}) + logger.info( + "Writing to '%s', with compression '%s'", + names[mod_name], + par["output_compression"], + ) + new_sample.write_h5mu( + output_dir / names[mod_name], compression=par["output_compression"] + ) + logger.info("Done writing output file.") + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_TYPES" ]; then + VIASH_PAR_OUTPUT_TYPES=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_TYPES") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_TYPES" ] && [ ! -e "$VIASH_PAR_OUTPUT_TYPES" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_TYPES' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/demux/bcl2fastq/.config.vsh.yaml b/target/executable/demux/bcl2fastq/.config.vsh.yaml new file mode 100644 index 00000000..3430108e --- /dev/null +++ b/target/executable/demux/bcl2fastq/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "bcl2fastq" +namespace: "demux" +version: "main" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + - "--runfolder_dir" + description: "Input run directory" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + alternatives: + - "-s" + description: "Pointer to the sample sheet" + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containig fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--ignore_missing" + description: "Interpret missing *.bcl files as no call (N), interpret missing\ + \ control files as not-set\ncontrol bits and fill in with zeros when *.stats\ + \ files are missing.\n" + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert bcl files to fastq files using bcl2fastq.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/bcl2fastq:2.20" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/bcl2fastq/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/demux/bcl2fastq" + executable: "target/executable/demux/bcl2fastq/bcl2fastq" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/demux/bcl2fastq/bcl2fastq b/target/executable/demux/bcl2fastq/bcl2fastq new file mode 100755 index 00000000..afdbdabb --- /dev/null +++ b/target/executable/demux/bcl2fastq/bcl2fastq @@ -0,0 +1,1237 @@ +#!/usr/bin/env bash + +# bcl2fastq main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Toni Verbeiren (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bcl2fastq" +VIASH_META_FUNCTIONALITY_NAME="bcl2fastq" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/bcl2fastq:2.20 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Toni Verbeiren" +LABEL org.opencontainers.image.description="Companion container for running component demux bcl2fastq" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bcl2fastq main" + echo "" + echo "Convert bcl files to fastq files using bcl2fastq." + echo "" + echo "Arguments:" + echo " -i, --runfolder_dir, --input" + echo " type: file, required parameter, file must exist" + echo " example: bcl_dir" + echo " Input run directory" + echo "" + echo " -s, --sample_sheet" + echo " type: file, required parameter, file must exist" + echo " example: SampleSheet.csv" + echo " Pointer to the sample sheet" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: fastq_dir" + echo " Output directory containig fastq files" + echo "" + echo " --reports" + echo " type: file, output, file must exist" + echo " example: reports_dir" + echo " Reports directory" + echo "" + echo " --ignore_missing" + echo " type: boolean_true" + echo " Interpret missing *.bcl files as no call (N), interpret missing control" + echo " files as not-set" + echo " control bits and fill in with zeros when *.stats files are missing." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bcl2fastq main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runfolder_dir) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--runfolder_dir\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --runfolder_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_sheet) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_sheet. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_sheet=*) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet=*\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET=$(ViashRemoveFlags "$1") + shift 1 + ;; + -s) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'-s\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reports. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports=*) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports=*\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ignore_missing) + [ -n "$VIASH_PAR_IGNORE_MISSING" ] && ViashError Bad arguments for option \'--ignore_missing\': \'$VIASH_PAR_IGNORE_MISSING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IGNORE_MISSING=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/demux/bcl2fastq:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then + ViashError '--sample_sheet' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_IGNORE_MISSING+x} ]; then + VIASH_PAR_IGNORE_MISSING="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ] && [ ! -e "$VIASH_PAR_SAMPLE_SHEET" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLE_SHEET' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_IGNORE_MISSING" ]]; then + if ! [[ "$VIASH_PAR_IGNORE_MISSING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--ignore_missing' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -d "$(dirname "$VIASH_PAR_REPORTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REPORTS")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_SHEET")" ) + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_SHEET") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REPORTS")" ) + VIASH_PAR_REPORTS=$(ViashDockerAutodetectMount "$VIASH_PAR_REPORTS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REPORTS" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bcl2fastq-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\"'\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\"'\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_PAR_IGNORE_MISSING+x} ]; then echo "${VIASH_PAR_IGNORE_MISSING}" | sed "s#'#'\"'\"'#g;s#.*#par_ignore_missing='&'#" ; else echo "# par_ignore_missing="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -exo pipefail + +extra_params=() + +# Handle reports stored separate +if [ ! -z "\$par_reports" ]; then + extra_params+=("--reports-dir" "\$par_reports") +fi + +# Handle the boolean flag +if [ "\$par_ignore_missing" == "true" ]; then + extra_params+=("--ignore-missing-control" "--ignore-missing-bcl" "--ignore-missing-filter") +fi + +# Run the actual command +bcl2fastq \\ + --runfolder-dir "\$par_input" \\ + --sample-sheet "\$par_sample_sheet" \\ + --output-dir "\$par_output" \\ + "\${extra_params[@]}" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_SHEET") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_PAR_REPORTS=$(ViashDockerStripAutomount "$VIASH_PAR_REPORTS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -e "$VIASH_PAR_REPORTS" ]; then + ViashError "Output file '$VIASH_PAR_REPORTS' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/demux/bcl2fastq/nextflow_labels.config b/target/executable/demux/bcl2fastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/demux/bcl2fastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/demux/bcl_convert/.config.vsh.yaml b/target/executable/demux/bcl_convert/.config.vsh.yaml new file mode 100644 index 00000000..b3a9a3a3 --- /dev/null +++ b/target/executable/demux/bcl_convert/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "bcl_convert" +namespace: "demux" +version: "main" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +- name: "Marijke Van Moerbeke" + roles: + - "author" + info: + role: "Contributor" + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input run directory" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + alternatives: + - "-s" + description: "Pointer to the sample sheet" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containig fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--test_mode" + description: "Should bcl-convert be run in test mode (using --first-tile-only)?" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--strict_mode" + description: "Abort if any files are missing." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tiles" + description: "Process only a subset of tiles by a regular expression." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--exclude_tiles" + description: "Exclude set of tiles by a regular expression" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_lane_splitting" + description: "Wheter to avoid splitting FASTQ file by lane." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert bcl files to fastq files using bcl-convert.\nInformation about\ + \ upgrading from bcl2fastq via\nhttps://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html\n\ + and https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "bcl2" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/bclconvert:4.2" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/bcl_convert/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/demux/bcl_convert" + executable: "target/executable/demux/bcl_convert/bcl_convert" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/demux/bcl_convert/bcl_convert b/target/executable/demux/bcl_convert/bcl_convert new file mode 100755 index 00000000..ae850862 --- /dev/null +++ b/target/executable/demux/bcl_convert/bcl_convert @@ -0,0 +1,1326 @@ +#!/usr/bin/env bash + +# bcl_convert main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Toni Verbeiren (author, maintainer) +# * Marijke Van Moerbeke (author) +# * Weiwei Schultz (contributor) +# * Dorien Roosen (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bcl_convert" +VIASH_META_FUNCTIONALITY_NAME="bcl_convert" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/bclconvert:4.2 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Toni Verbeiren, Marijke Van Moerbeke, Weiwei Schultz, Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component demux bcl_convert" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bcl_convert main" + echo "" + echo "Convert bcl files to fastq files using bcl-convert." + echo "Information about upgrading from bcl2fastq via" + echo "https://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html" + echo "and" + echo "https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: bcl_dir" + echo " Input run directory" + echo "" + echo " -s, --sample_sheet" + echo " type: file, required parameter, file must exist" + echo " example: bcl_dir" + echo " Pointer to the sample sheet" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: fastq_dir" + echo " Output directory containig fastq files" + echo "" + echo " --reports" + echo " type: file, output, file must exist" + echo " example: reports_dir" + echo " Reports directory" + echo "" + echo " --test_mode" + echo " type: boolean" + echo " default: false" + echo " Should bcl-convert be run in test mode (using --first-tile-only)?" + echo "" + echo " --strict_mode" + echo " type: boolean" + echo " default: false" + echo " Abort if any files are missing." + echo "" + echo " --tiles" + echo " type: string" + echo " Process only a subset of tiles by a regular expression." + echo "" + echo " --exclude_tiles" + echo " type: string" + echo " Exclude set of tiles by a regular expression" + echo "" + echo " --no_lane_splitting" + echo " type: boolean" + echo " Wheter to avoid splitting FASTQ file by lane." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bcl_convert main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_sheet) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_sheet. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_sheet=*) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet=*\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET=$(ViashRemoveFlags "$1") + shift 1 + ;; + -s) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'-s\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reports. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports=*) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports=*\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --test_mode) + [ -n "$VIASH_PAR_TEST_MODE" ] && ViashError Bad arguments for option \'--test_mode\': \'$VIASH_PAR_TEST_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TEST_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --test_mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --test_mode=*) + [ -n "$VIASH_PAR_TEST_MODE" ] && ViashError Bad arguments for option \'--test_mode=*\': \'$VIASH_PAR_TEST_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TEST_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --strict_mode) + [ -n "$VIASH_PAR_STRICT_MODE" ] && ViashError Bad arguments for option \'--strict_mode\': \'$VIASH_PAR_STRICT_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRICT_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --strict_mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --strict_mode=*) + [ -n "$VIASH_PAR_STRICT_MODE" ] && ViashError Bad arguments for option \'--strict_mode=*\': \'$VIASH_PAR_STRICT_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRICT_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tiles) + [ -n "$VIASH_PAR_TILES" ] && ViashError Bad arguments for option \'--tiles\': \'$VIASH_PAR_TILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TILES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tiles=*) + [ -n "$VIASH_PAR_TILES" ] && ViashError Bad arguments for option \'--tiles=*\': \'$VIASH_PAR_TILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TILES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exclude_tiles) + [ -n "$VIASH_PAR_EXCLUDE_TILES" ] && ViashError Bad arguments for option \'--exclude_tiles\': \'$VIASH_PAR_EXCLUDE_TILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_TILES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude_tiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude_tiles=*) + [ -n "$VIASH_PAR_EXCLUDE_TILES" ] && ViashError Bad arguments for option \'--exclude_tiles=*\': \'$VIASH_PAR_EXCLUDE_TILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_TILES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_lane_splitting) + [ -n "$VIASH_PAR_NO_LANE_SPLITTING" ] && ViashError Bad arguments for option \'--no_lane_splitting\': \'$VIASH_PAR_NO_LANE_SPLITTING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_LANE_SPLITTING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --no_lane_splitting. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --no_lane_splitting=*) + [ -n "$VIASH_PAR_NO_LANE_SPLITTING" ] && ViashError Bad arguments for option \'--no_lane_splitting=*\': \'$VIASH_PAR_NO_LANE_SPLITTING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_LANE_SPLITTING=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/demux/bcl_convert:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then + ViashError '--sample_sheet' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_TEST_MODE+x} ]; then + VIASH_PAR_TEST_MODE="false" +fi +if [ -z ${VIASH_PAR_STRICT_MODE+x} ]; then + VIASH_PAR_STRICT_MODE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ] && [ ! -e "$VIASH_PAR_SAMPLE_SHEET" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLE_SHEET' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_TEST_MODE" ]]; then + if ! [[ "$VIASH_PAR_TEST_MODE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--test_mode' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_STRICT_MODE" ]]; then + if ! [[ "$VIASH_PAR_STRICT_MODE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--strict_mode' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_LANE_SPLITTING" ]]; then + if ! [[ "$VIASH_PAR_NO_LANE_SPLITTING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_lane_splitting' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -d "$(dirname "$VIASH_PAR_REPORTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REPORTS")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_SHEET")" ) + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_SHEET") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REPORTS")" ) + VIASH_PAR_REPORTS=$(ViashDockerAutodetectMount "$VIASH_PAR_REPORTS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REPORTS" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bcl_convert-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\"'\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\"'\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_PAR_TEST_MODE+x} ]; then echo "${VIASH_PAR_TEST_MODE}" | sed "s#'#'\"'\"'#g;s#.*#par_test_mode='&'#" ; else echo "# par_test_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_STRICT_MODE+x} ]; then echo "${VIASH_PAR_STRICT_MODE}" | sed "s#'#'\"'\"'#g;s#.*#par_strict_mode='&'#" ; else echo "# par_strict_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_TILES+x} ]; then echo "${VIASH_PAR_TILES}" | sed "s#'#'\"'\"'#g;s#.*#par_tiles='&'#" ; else echo "# par_tiles="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_TILES+x} ]; then echo "${VIASH_PAR_EXCLUDE_TILES}" | sed "s#'#'\"'\"'#g;s#.*#par_exclude_tiles='&'#" ; else echo "# par_exclude_tiles="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_LANE_SPLITTING+x} ]; then echo "${VIASH_PAR_NO_LANE_SPLITTING}" | sed "s#'#'\"'\"'#g;s#.*#par_no_lane_splitting='&'#" ; else echo "# par_no_lane_splitting="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +[ -d "\$par_output" ] || mkdir -p "\$par_output" + +bcl-convert \\ + --force \\ + --bcl-input-directory "\$par_input" \\ + --output-directory "\$par_output" \\ + --sample-sheet "\$par_sample_sheet" \\ + --first-tile-only "\$par_test_mode" \\ + --strict-mode "\$par_strict_mode" \\ + \${par_no_lane_splitting:+--no-lane-splitting "\$par_no_lane_splitting"} \\ + \${par_tiles:+--tiles \$par_tiles} \\ + \${par_exclude_tiles:+--exclude-tiles \$par_exclude_tiles} + + +if [ ! -z "\$par_reports" ]; then + echo "Moving reports to its own location" + mv "\$par_output"/Reports "\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_SHEET") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_PAR_REPORTS=$(ViashDockerStripAutomount "$VIASH_PAR_REPORTS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -e "$VIASH_PAR_REPORTS" ]; then + ViashError "Output file '$VIASH_PAR_REPORTS' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/demux/bcl_convert/nextflow_labels.config b/target/executable/demux/bcl_convert/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/demux/bcl_convert/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/demux/cellranger_atac_mkfastq/.config.vsh.yaml b/target/executable/demux/cellranger_atac_mkfastq/.config.vsh.yaml new file mode 100644 index 00000000..553728ae --- /dev/null +++ b/target/executable/demux/cellranger_atac_mkfastq/.config.vsh.yaml @@ -0,0 +1,286 @@ +name: "cellranger_atac_mkfastq" +namespace: "demux" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Path of Illumina BCL run folder." + info: null + example: + - "/path/to/bcl" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--csv" + description: "The path to the simple layout sample sheet." + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--lanes" + description: "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex.\ + \ Use this if you have a sample sheet for an entire flow cell but only want\ + \ to generate a few lanes for further 10x Genomics analysis." + info: null + example: + - "1,3" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "string" + name: "--use_bases_mask" + description: "bcl2fastq option. Use to clip extra bases off a read if you ran\ + \ extra cycles for QC." + info: null + example: + - "y50n,I6n,Y50n" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "boolean_true" + name: "--delete_undetermined" + description: "bcl2fastq option. Delete the Undetermined FASTQs generated by bcl2fastq.\ + \ Useful if you are demultiplexing a small number of samples from a large flow\ + \ cell." + info: null + direction: "input" + - type: "integer" + name: "--barcode_mismatches" + description: "bcl2fastq option. Use this option to change the number of allowed\ + \ mismatches per index adapter (0, 1, 2)." + info: null + default: + - 1 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "The folder to store the demux results" + info: null + example: + - "/path/to/output" + default: + - "fastqs" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demultiplex raw sequencing data for ATAC experiments" +usage: "cellranger_atac_mkfastq \\\n --input /path/to/bcl \\\n --csv simple_layout_sample_sheet.csv\ + \ \\\n --output /path/to/output\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_atac_tiny_bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_atac:2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update \\\n&& apt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/cellranger_atac_mkfastq/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/demux/cellranger_atac_mkfastq" + executable: "target/executable/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq b/target/executable/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq new file mode 100755 index 00000000..f52f1316 --- /dev/null +++ b/target/executable/demux/cellranger_atac_mkfastq/cellranger_atac_mkfastq @@ -0,0 +1,1333 @@ +#!/usr/bin/env bash + +# cellranger_atac_mkfastq main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_atac_mkfastq" +VIASH_META_FUNCTIONALITY_NAME="cellranger_atac_mkfastq" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger_atac:2.1 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update \ +&& apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component demux cellranger_atac_mkfastq" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_atac_mkfastq main" + echo "" + echo "Demultiplex raw sequencing data for ATAC experiments" + echo "" + echo "Usage:" + echo "cellranger_atac_mkfastq \\" + echo " --input /path/to/bcl \\" + echo " --csv simple_layout_sample_sheet.csv \\" + echo " --output /path/to/output" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/bcl" + echo " Path of Illumina BCL run folder." + echo "" + echo " --csv" + echo " type: file, required parameter, file must exist" + echo " example: SampleSheet.csv" + echo " The path to the simple layout sample sheet." + echo "" + echo " --lanes" + echo " type: string, multiple values allowed" + echo " example: 1,3" + echo " bcl2fastq option. Semicolon-delimited series of lanes to demultiplex." + echo " Use this if you have a sample sheet for an entire flow cell but only" + echo " want to generate a few lanes for further 10x Genomics analysis." + echo "" + echo " --use_bases_mask" + echo " type: string, multiple values allowed" + echo " example: y50n,I6n,Y50n" + echo " bcl2fastq option. Use to clip extra bases off a read if you ran extra" + echo " cycles for QC." + echo "" + echo " --delete_undetermined" + echo " type: boolean_true" + echo " bcl2fastq option. Delete the Undetermined FASTQs generated by bcl2fastq." + echo " Useful if you are demultiplexing a small number of samples from a large" + echo " flow cell." + echo "" + echo " --barcode_mismatches" + echo " type: integer" + echo " default: 1" + echo " min: 0" + echo " bcl2fastq option. Use this option to change the number of allowed" + echo " mismatches per index adapter (0, 1, 2)." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " default: fastqs" + echo " example: /path/to/output" + echo " The folder to store the demux results" + echo "" + echo " --reports" + echo " type: file, output, file must exist" + echo " example: reports_dir" + echo " Reports directory" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_atac_mkfastq main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --csv) + [ -n "$VIASH_PAR_CSV" ] && ViashError Bad arguments for option \'--csv\': \'$VIASH_PAR_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CSV="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --csv. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --csv=*) + [ -n "$VIASH_PAR_CSV" ] && ViashError Bad arguments for option \'--csv=*\': \'$VIASH_PAR_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CSV=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lanes) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES="$2" + else + VIASH_PAR_LANES="$VIASH_PAR_LANES,""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lanes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lanes=*) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LANES="$VIASH_PAR_LANES,"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --use_bases_mask) + if [ -z "$VIASH_PAR_USE_BASES_MASK" ]; then + VIASH_PAR_USE_BASES_MASK="$2" + else + VIASH_PAR_USE_BASES_MASK="$VIASH_PAR_USE_BASES_MASK,""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_bases_mask. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_bases_mask=*) + if [ -z "$VIASH_PAR_USE_BASES_MASK" ]; then + VIASH_PAR_USE_BASES_MASK=$(ViashRemoveFlags "$1") + else + VIASH_PAR_USE_BASES_MASK="$VIASH_PAR_USE_BASES_MASK,"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --delete_undetermined) + [ -n "$VIASH_PAR_DELETE_UNDETERMINED" ] && ViashError Bad arguments for option \'--delete_undetermined\': \'$VIASH_PAR_DELETE_UNDETERMINED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DELETE_UNDETERMINED=true + shift 1 + ;; + --barcode_mismatches) + [ -n "$VIASH_PAR_BARCODE_MISMATCHES" ] && ViashError Bad arguments for option \'--barcode_mismatches\': \'$VIASH_PAR_BARCODE_MISMATCHES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_MISMATCHES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcode_mismatches. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode_mismatches=*) + [ -n "$VIASH_PAR_BARCODE_MISMATCHES" ] && ViashError Bad arguments for option \'--barcode_mismatches=*\': \'$VIASH_PAR_BARCODE_MISMATCHES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_MISMATCHES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reports) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reports. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports=*) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports=*\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/demux/cellranger_atac_mkfastq:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_CSV+x} ]; then + ViashError '--csv' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_DELETE_UNDETERMINED+x} ]; then + VIASH_PAR_DELETE_UNDETERMINED="false" +fi +if [ -z ${VIASH_PAR_BARCODE_MISMATCHES+x} ]; then + VIASH_PAR_BARCODE_MISMATCHES="1" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CSV" ] && [ ! -e "$VIASH_PAR_CSV" ]; then + ViashError "Input file '$VIASH_PAR_CSV' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_DELETE_UNDETERMINED" ]]; then + if ! [[ "$VIASH_PAR_DELETE_UNDETERMINED" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--delete_undetermined' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BARCODE_MISMATCHES" ]]; then + if ! [[ "$VIASH_PAR_BARCODE_MISMATCHES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--barcode_mismatches' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_BARCODE_MISMATCHES -lt 0 ]]; then + ViashError '--barcode_mismatches' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -d "$(dirname "$VIASH_PAR_REPORTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REPORTS")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_CSV" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CSV")" ) + VIASH_PAR_CSV=$(ViashDockerAutodetectMount "$VIASH_PAR_CSV") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REPORTS")" ) + VIASH_PAR_REPORTS=$(ViashDockerAutodetectMount "$VIASH_PAR_REPORTS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REPORTS" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_atac_mkfastq-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_CSV+x} ]; then echo "${VIASH_PAR_CSV}" | sed "s#'#'\"'\"'#g;s#.*#par_csv='&'#" ; else echo "# par_csv="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\"'\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_BASES_MASK+x} ]; then echo "${VIASH_PAR_USE_BASES_MASK}" | sed "s#'#'\"'\"'#g;s#.*#par_use_bases_mask='&'#" ; else echo "# par_use_bases_mask="; fi ) +$( if [ ! -z ${VIASH_PAR_DELETE_UNDETERMINED+x} ]; then echo "${VIASH_PAR_DELETE_UNDETERMINED}" | sed "s#'#'\"'\"'#g;s#.*#par_delete_undetermined='&'#" ; else echo "# par_delete_undetermined="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE_MISMATCHES+x} ]; then echo "${VIASH_PAR_BARCODE_MISMATCHES}" | sed "s#'#'\"'\"'#g;s#.*#par_barcode_mismatches='&'#" ; else echo "# par_barcode_mismatches="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\"'\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "\$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="\$tmpdir/bcl" + mkdir -p "\$input_dir" + tar -xzf "\$par_input" -C "\$input_dir" --strip-components=1 +else + input_dir="\$par_input" +fi + + +if [ ! -z "\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\`python -c "print(int('\$meta_memory_gb') - 2)"\` +fi + + +echo "Running cellranger-atac mkfastq" + +id=myoutput + +IFS="," +cellranger-atac mkfastq \\ + --id "\$id" \\ + --csv "\$par_csv" \\ + --run "\$par_input" \\ + --disable-ui \\ + --output-dir "\$par_output" \\ + \${meta_cpus:+--localcores=\$meta_cpus} \\ + \${memory_gb:+--localmem=\$memory_gb} \\ + \${par_lanes:+--lanes=\${par_lanes[*]}} \\ + \${par_use_bases_mask:+--use-bases-mask=\${par_use_bases_mask[*]}} \\ + \${par_delete_undetermined:+--delete-undetermined} \\ + \${par_barcode_mismatches:+--barcode-mismatches=\$par_barcode_mismatches} +unset IFS + +# Move reports to their own output location +if [ ! -z "\$par_reports" ]; then + echo "Moving reports its own location" + mv "\$par_output"/Reports "\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_CSV" ]; then + VIASH_PAR_CSV=$(ViashDockerStripAutomount "$VIASH_PAR_CSV") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_PAR_REPORTS=$(ViashDockerStripAutomount "$VIASH_PAR_REPORTS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -e "$VIASH_PAR_REPORTS" ]; then + ViashError "Output file '$VIASH_PAR_REPORTS' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/demux/cellranger_atac_mkfastq/nextflow_labels.config b/target/executable/demux/cellranger_atac_mkfastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/demux/cellranger_atac_mkfastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/demux/cellranger_atac_mkfastq/setup_logger.py b/target/executable/demux/cellranger_atac_mkfastq/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/demux/cellranger_atac_mkfastq/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/demux/cellranger_mkfastq/.config.vsh.yaml b/target/executable/demux/cellranger_mkfastq/.config.vsh.yaml new file mode 100644 index 00000000..724a5cd7 --- /dev/null +++ b/target/executable/demux/cellranger_mkfastq/.config.vsh.yaml @@ -0,0 +1,277 @@ +name: "cellranger_mkfastq" +namespace: "demux" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Path to the (untarred) BCL files. Expects 'RunParameters.xml' at\ + \ './'." + info: null + example: + - "/path/to/bcl" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + description: "The path to the sample sheet." + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "The folder to store the demux results" + info: null + example: + - "/path/to/output" + default: + - "fastqs" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demultiplex raw sequencing data" +usage: "cellranger_mkfastq \\\n --input /path/to/bcl \\\n --sample_sheet SampleSheet.csv\ + \ \\\n --output /path/to/output\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/cellranger_mkfastq/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/demux/cellranger_mkfastq" + executable: "target/executable/demux/cellranger_mkfastq/cellranger_mkfastq" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/demux/cellranger_mkfastq/cellranger_mkfastq b/target/executable/demux/cellranger_mkfastq/cellranger_mkfastq new file mode 100755 index 00000000..580a254e --- /dev/null +++ b/target/executable/demux/cellranger_mkfastq/cellranger_mkfastq @@ -0,0 +1,1231 @@ +#!/usr/bin/env bash + +# cellranger_mkfastq main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Samuel D'Souza (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_mkfastq" +VIASH_META_FUNCTIONALITY_NAME="cellranger_mkfastq" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger:9.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Samuel D'Souza, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component demux cellranger_mkfastq" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_mkfastq main" + echo "" + echo "Demultiplex raw sequencing data" + echo "" + echo "Usage:" + echo "cellranger_mkfastq \\" + echo " --input /path/to/bcl \\" + echo " --sample_sheet SampleSheet.csv \\" + echo " --output /path/to/output" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/bcl" + echo " Path to the (untarred) BCL files. Expects 'RunParameters.xml' at './'." + echo "" + echo " --sample_sheet" + echo " type: file, required parameter, file must exist" + echo " example: SampleSheet.csv" + echo " The path to the sample sheet." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " default: fastqs" + echo " example: /path/to/output" + echo " The folder to store the demux results" + echo "" + echo " --reports" + echo " type: file, output, file must exist" + echo " example: reports_dir" + echo " Reports directory" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_mkfastq main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_sheet) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_sheet. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_sheet=*) + [ -n "$VIASH_PAR_SAMPLE_SHEET" ] && ViashError Bad arguments for option \'--sample_sheet=*\': \'$VIASH_PAR_SAMPLE_SHEET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_SHEET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reports) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reports. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reports=*) + [ -n "$VIASH_PAR_REPORTS" ] && ViashError Bad arguments for option \'--reports=*\': \'$VIASH_PAR_REPORTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/demux/cellranger_mkfastq:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then + ViashError '--sample_sheet' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ] && [ ! -e "$VIASH_PAR_SAMPLE_SHEET" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLE_SHEET' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -d "$(dirname "$VIASH_PAR_REPORTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REPORTS")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_SHEET")" ) + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_SHEET") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REPORTS")" ) + VIASH_PAR_REPORTS=$(ViashDockerAutodetectMount "$VIASH_PAR_REPORTS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REPORTS" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_mkfastq-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\"'\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\"'\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "\$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="\$tmpdir/bcl" + mkdir -p "\$input_dir" + tar -xzf "\$par_input" -C "\$input_dir" --strip-components=1 +else + input_dir="\$par_input" +fi + + +# add additional params +extra_params=( ) + +if [ ! -z "\$meta_cpus" ]; then + extra_params+=( "--localcores=\$meta_cpus" ) +fi +if [ ! -z "\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\`python -c "print(int('\$meta_memory_gb') - 2)"\` + extra_params+=( "--localmem=\$memory_gb" ) +fi + + +echo "Running cellranger demux" + +id=myoutput + +cellranger mkfastq \\ + --id "\$id" \\ + --csv "\$par_sample_sheet" \\ + --run "\$par_input" \\ + "\${extra_params[@]}" \\ + --disable-ui \\ + --output-dir "\$par_output" + +# Move reports to their own output location +if [ ! -z "\$par_reports" ]; then + echo "Moving reports its own location" + mv "\$par_output"/Reports "\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_SHEET" ]; then + VIASH_PAR_SAMPLE_SHEET=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_SHEET") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_REPORTS" ]; then + VIASH_PAR_REPORTS=$(ViashDockerStripAutomount "$VIASH_PAR_REPORTS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REPORTS" ] && [ ! -e "$VIASH_PAR_REPORTS" ]; then + ViashError "Output file '$VIASH_PAR_REPORTS' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/demux/cellranger_mkfastq/nextflow_labels.config b/target/executable/demux/cellranger_mkfastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/demux/cellranger_mkfastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/demux/cellranger_mkfastq/setup_logger.py b/target/executable/demux/cellranger_mkfastq/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/demux/cellranger_mkfastq/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/densmap/.config.vsh.yaml b/target/executable/dimred/densmap/.config.vsh.yaml new file mode 100644 index 00000000..f7f3c170 --- /dev/null +++ b/target/executable/dimred/densmap/.config.vsh.yaml @@ -0,0 +1,442 @@ +name: "densmap" +namespace: "dimred" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_neighbors" + description: "The `.uns` neighbors slot as output by the `find_neighbors` component." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_pca" + description: "The slot in `.obsm` where the PCA results are stored.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The .obsm key to use for storing the densMAP results.." + info: null + default: + - "X_densmap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments UMAP" + arguments: + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result \nin a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn \ncloser together, while larger values will result\ + \ on a more even dispersal of points. \nThe value should be set relative to\ + \ the spread value, which determines the scale at \nwhich embedded points will\ + \ be spread out. \n" + info: null + default: + - 0.5 + required: false + min: 0.0 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--spread" + description: "The effective scale of embedded points. In combination with `min_dist`\ + \ this \ndetermines how clustered/clumped the embedded points are.\n" + info: null + default: + - 1.0 + required: false + min: 0.0 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "The number of dimensions of the embedding." + info: null + default: + - 2 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "The number of iterations (epochs) of the optimization. Called `n_epochs`\ + \ \nin the original UMAP. Default is set to 500 if \nneighbors['connectivities'].shape[0]\ + \ <= 10000, else 200.\n" + info: null + default: + - 0 + required: false + min: 0 + max: 1000 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "The initial learning rate for the embedding optimization." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--gamma" + description: "Weighting applied to negative samples in low dimensional embedding\ + \ optimization. \nValues higher than one will result in greater weight being\ + \ given to negative samples.\n" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--negative_sample_rate" + description: "The number of negative samples to select per positive sample\nin\ + \ the optimization process. Increasing this value will result\nin greater repulsive\ + \ force being applied, greater optimization\ncost, but slightly more accuracy.\n" + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--init_pos" + description: "How to initialize the low dimensional embedding. Called `init` in\ + \ the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions\ + \ from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`:\ + \ assign initial embedding positions at random.\n" + info: null + default: + - "spectral" + required: false + choices: + - "paga" + - "spectral" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments densMAP" + arguments: + - type: "double" + name: "--lambda" + description: "Controls the regularization weight of the density correlation term\ + \ in densMAP. \nHigher values prioritize density preservation over the UMAP\ + \ objective, and vice versa \nfor values closer to zero. Setting this parameter\ + \ to zero is equivalent to running \nthe original UMAP algorithm.\n" + info: null + default: + - 2.0 + required: false + min: 0.01 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fraction" + description: "Controls the fraction of epochs (between 0 and 1) where the density-augmented\ + \ objective \nis used in densMAP. The first (1 - dens_frac) fraction of epochs\ + \ optimize the original \nUMAP objective before introducing the density correlation\ + \ term.\n" + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--var_shift" + description: "A small constant added to the variance of local radii in the embedding\ + \ when calculating \nthe density correlation objective to prevent numerical\ + \ instability from dividing by a \nsmall number.\n" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A modification of UMAP that adds an extra cost term in order to preserve\ + \ information \nabout the relative local density of the data. It is performed on\ + \ the same inputs as UMAP.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "umap-learn" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/densmap/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dimred/densmap" + executable: "target/executable/dimred/densmap/densmap" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dimred/densmap/compress_h5mu.py b/target/executable/dimred/densmap/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dimred/densmap/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dimred/densmap/densmap b/target/executable/dimred/densmap/densmap new file mode 100755 index 00000000..432bde8d --- /dev/null +++ b/target/executable/dimred/densmap/densmap @@ -0,0 +1,1753 @@ +#!/usr/bin/env bash + +# densmap main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="densmap" +VIASH_META_FUNCTIONALITY_NAME="densmap" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "umap-learn" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component dimred densmap" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "densmap main" + echo "" + echo "A modification of UMAP that adds an extra cost term in order to preserve" + echo "information" + echo "about the relative local density of the data. It is performed on the same inputs" + echo "as UMAP." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --uns_neighbors" + echo " type: string" + echo " default: neighbors" + echo " The \`.uns\` neighbors slot as output by the \`find_neighbors\` component." + echo "" + echo " --obsm_pca" + echo " type: string, required parameter" + echo " The slot in \`.obsm\` where the PCA results are stored." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_densmap" + echo " The .obsm key to use for storing the densMAP results.." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments UMAP:" + echo " --min_dist" + echo " type: double" + echo " default: 0.5" + echo " min: 0.0" + echo " max: 10.0" + echo " The effective minimum distance between embedded points. Smaller values" + echo " will result" + echo " in a more clustered/clumped embedding where nearby points on the" + echo " manifold are drawn" + echo " closer together, while larger values will result on a more even" + echo " dispersal of points." + echo " The value should be set relative to the spread value, which determines" + echo " the scale at" + echo " which embedded points will be spread out." + echo "" + echo " --spread" + echo " type: double" + echo " default: 1.0" + echo " min: 0.0" + echo " max: 10.0" + echo " The effective scale of embedded points. In combination with \`min_dist\`" + echo " this" + echo " determines how clustered/clumped the embedded points are." + echo "" + echo " --num_components" + echo " type: integer" + echo " default: 2" + echo " min: 1" + echo " The number of dimensions of the embedding." + echo "" + echo " --max_iter" + echo " type: integer" + echo " default: 0" + echo " min: 0" + echo " max: 1000" + echo " The number of iterations (epochs) of the optimization. Called \`n_epochs\`" + echo " in the original UMAP. Default is set to 500 if" + echo " neighbors['connectivities'].shape[0] <= 10000, else 200." + echo "" + echo " --alpha" + echo " type: double" + echo " default: 1.0" + echo " The initial learning rate for the embedding optimization." + echo "" + echo " --gamma" + echo " type: double" + echo " default: 1.0" + echo " Weighting applied to negative samples in low dimensional embedding" + echo " optimization." + echo " Values higher than one will result in greater weight being given to" + echo " negative samples." + echo "" + echo " --negative_sample_rate" + echo " type: integer" + echo " default: 5" + echo " The number of negative samples to select per positive sample" + echo " in the optimization process. Increasing this value will result" + echo " in greater repulsive force being applied, greater optimization" + echo " cost, but slightly more accuracy." + echo "" + echo " --init_pos" + echo " type: string" + echo " default: spectral" + echo " choices: [ paga, spectral, random ]" + echo " How to initialize the low dimensional embedding. Called \`init\` in the" + echo " original UMAP. Options are:" + echo " * Any key from \`.obsm\`" + echo " * \`'paga'\`: positions from \`paga()\`" + echo " * \`'spectral'\`: use a spectral embedding of the graph" + echo " * \`'random'\`: assign initial embedding positions at random." + echo "" + echo "Arguments densMAP:" + echo " --lambda" + echo " type: double" + echo " default: 2.0" + echo " min: 0.01" + echo " max: 10.0" + echo " Controls the regularization weight of the density correlation term in" + echo " densMAP." + echo " Higher values prioritize density preservation over the UMAP objective," + echo " and vice versa" + echo " for values closer to zero. Setting this parameter to zero is equivalent" + echo " to running" + echo " the original UMAP algorithm." + echo "" + echo " --fraction" + echo " type: double" + echo " default: 0.3" + echo " Controls the fraction of epochs (between 0 and 1) where the" + echo " density-augmented objective" + echo " is used in densMAP. The first (1 - dens_frac) fraction of epochs" + echo " optimize the original" + echo " UMAP objective before introducing the density correlation term." + echo "" + echo " --var_shift" + echo " type: double" + echo " default: 0.1" + echo " A small constant added to the variance of local radii in the embedding" + echo " when calculating" + echo " the density correlation objective to prevent numerical instability from" + echo " dividing by a" + echo " small number." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "densmap main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_neighbors) + [ -n "$VIASH_PAR_UNS_NEIGHBORS" ] && ViashError Bad arguments for option \'--uns_neighbors\': \'$VIASH_PAR_UNS_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_neighbors=*) + [ -n "$VIASH_PAR_UNS_NEIGHBORS" ] && ViashError Bad arguments for option \'--uns_neighbors=*\': \'$VIASH_PAR_UNS_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_pca) + [ -n "$VIASH_PAR_OBSM_PCA" ] && ViashError Bad arguments for option \'--obsm_pca\': \'$VIASH_PAR_OBSM_PCA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PCA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_pca. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_pca=*) + [ -n "$VIASH_PAR_OBSM_PCA" ] && ViashError Bad arguments for option \'--obsm_pca=*\': \'$VIASH_PAR_OBSM_PCA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PCA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_dist) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_dist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_dist=*) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist=*\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --spread) + [ -n "$VIASH_PAR_SPREAD" ] && ViashError Bad arguments for option \'--spread\': \'$VIASH_PAR_SPREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --spread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --spread=*) + [ -n "$VIASH_PAR_SPREAD" ] && ViashError Bad arguments for option \'--spread=*\': \'$VIASH_PAR_SPREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_components) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_components=*) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components=*\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_iter) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_iter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_iter=*) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter=*\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alpha=*) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha=*\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gamma) + [ -n "$VIASH_PAR_GAMMA" ] && ViashError Bad arguments for option \'--gamma\': \'$VIASH_PAR_GAMMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GAMMA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gamma. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gamma=*) + [ -n "$VIASH_PAR_GAMMA" ] && ViashError Bad arguments for option \'--gamma=*\': \'$VIASH_PAR_GAMMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GAMMA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --negative_sample_rate) + [ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ] && ViashError Bad arguments for option \'--negative_sample_rate\': \'$VIASH_PAR_NEGATIVE_SAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NEGATIVE_SAMPLE_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --negative_sample_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --negative_sample_rate=*) + [ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ] && ViashError Bad arguments for option \'--negative_sample_rate=*\': \'$VIASH_PAR_NEGATIVE_SAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NEGATIVE_SAMPLE_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --init_pos) + [ -n "$VIASH_PAR_INIT_POS" ] && ViashError Bad arguments for option \'--init_pos\': \'$VIASH_PAR_INIT_POS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_POS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --init_pos. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --init_pos=*) + [ -n "$VIASH_PAR_INIT_POS" ] && ViashError Bad arguments for option \'--init_pos=*\': \'$VIASH_PAR_INIT_POS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_POS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lambda) + [ -n "$VIASH_PAR_LAMBDA" ] && ViashError Bad arguments for option \'--lambda\': \'$VIASH_PAR_LAMBDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAMBDA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lambda. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lambda=*) + [ -n "$VIASH_PAR_LAMBDA" ] && ViashError Bad arguments for option \'--lambda=*\': \'$VIASH_PAR_LAMBDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAMBDA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fraction) + [ -n "$VIASH_PAR_FRACTION" ] && ViashError Bad arguments for option \'--fraction\': \'$VIASH_PAR_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fraction=*) + [ -n "$VIASH_PAR_FRACTION" ] && ViashError Bad arguments for option \'--fraction=*\': \'$VIASH_PAR_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_shift) + [ -n "$VIASH_PAR_VAR_SHIFT" ] && ViashError Bad arguments for option \'--var_shift\': \'$VIASH_PAR_VAR_SHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_SHIFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_shift. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_shift=*) + [ -n "$VIASH_PAR_VAR_SHIFT" ] && ViashError Bad arguments for option \'--var_shift=*\': \'$VIASH_PAR_VAR_SHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_SHIFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dimred/densmap:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBSM_PCA+x} ]; then + ViashError '--obsm_pca' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then + VIASH_PAR_UNS_NEIGHBORS="neighbors" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_densmap" +fi +if [ -z ${VIASH_PAR_MIN_DIST+x} ]; then + VIASH_PAR_MIN_DIST="0.5" +fi +if [ -z ${VIASH_PAR_SPREAD+x} ]; then + VIASH_PAR_SPREAD="1.0" +fi +if [ -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then + VIASH_PAR_NUM_COMPONENTS="2" +fi +if [ -z ${VIASH_PAR_MAX_ITER+x} ]; then + VIASH_PAR_MAX_ITER="0" +fi +if [ -z ${VIASH_PAR_ALPHA+x} ]; then + VIASH_PAR_ALPHA="1.0" +fi +if [ -z ${VIASH_PAR_GAMMA+x} ]; then + VIASH_PAR_GAMMA="1.0" +fi +if [ -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then + VIASH_PAR_NEGATIVE_SAMPLE_RATE="5" +fi +if [ -z ${VIASH_PAR_INIT_POS+x} ]; then + VIASH_PAR_INIT_POS="spectral" +fi +if [ -z ${VIASH_PAR_LAMBDA+x} ]; then + VIASH_PAR_LAMBDA="2.0" +fi +if [ -z ${VIASH_PAR_FRACTION+x} ]; then + VIASH_PAR_FRACTION="0.3" +fi +if [ -z ${VIASH_PAR_VAR_SHIFT+x} ]; then + VIASH_PAR_VAR_SHIFT="0.1" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_DIST" ]]; then + if ! [[ "$VIASH_PAR_MIN_DIST" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_dist' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MIN_DIST '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--min_dist' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MIN_DIST -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--min_dist' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--min_dist' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MIN_DIST '<=' 10.0 | bc` -eq 1 ]]; then + ViashError '--min_dist' has to be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MIN_DIST -v n2=10.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--min_dist' has be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--min_dist' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_SPREAD" ]]; then + if ! [[ "$VIASH_PAR_SPREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--spread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_SPREAD '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--spread' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_SPREAD -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--spread' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--spread' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_SPREAD '<=' 10.0 | bc` -eq 1 ]]; then + ViashError '--spread' has to be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_SPREAD -v n2=10.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--spread' has be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--spread' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_NUM_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_NUM_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_NUM_COMPONENTS -lt 1 ]]; then + ViashError '--num_components' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_ITER" ]]; then + if ! [[ "$VIASH_PAR_MAX_ITER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_iter' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MAX_ITER -lt 0 ]]; then + ViashError '--max_iter' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MAX_ITER -gt 1000 ]]; then + ViashError '--max_iter' has be less than or equal to 1000. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALPHA" ]]; then + if ! [[ "$VIASH_PAR_ALPHA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alpha' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GAMMA" ]]; then + if ! [[ "$VIASH_PAR_GAMMA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--gamma' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ]]; then + if ! [[ "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--negative_sample_rate' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LAMBDA" ]]; then + if ! [[ "$VIASH_PAR_LAMBDA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lambda' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LAMBDA '>=' 0.01 | bc` -eq 1 ]]; then + ViashError '--lambda' has be more than or equal to 0.01. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LAMBDA -v n2=0.01 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lambda' has be more than or equal to 0.01. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lambda' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LAMBDA '<=' 10.0 | bc` -eq 1 ]]; then + ViashError '--lambda' has to be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LAMBDA -v n2=10.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lambda' has be less than or equal to 10.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lambda' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VAR_SHIFT" ]]; then + if ! [[ "$VIASH_PAR_VAR_SHIFT" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--var_shift' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_INIT_POS" ]; then + VIASH_PAR_INIT_POS_CHOICES=("paga;spectral;random") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_INIT_POS_CHOICES[*]};" =~ ";$VIASH_PAR_INIT_POS;" ]]; then + ViashError '--init_pos' specified value of \'$VIASH_PAR_INIT_POS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-densmap-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from umap import UMAP +import mudata as mu +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_neighbors': $( if [ ! -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then echo "r'${VIASH_PAR_UNS_NEIGHBORS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_pca': $( if [ ! -z ${VIASH_PAR_OBSM_PCA+x} ]; then echo "r'${VIASH_PAR_OBSM_PCA//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'spread': $( if [ ! -z ${VIASH_PAR_SPREAD+x} ]; then echo "float(r'${VIASH_PAR_SPREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gamma': $( if [ ! -z ${VIASH_PAR_GAMMA+x} ]; then echo "float(r'${VIASH_PAR_GAMMA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'negative_sample_rate': $( if [ ! -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then echo "int(r'${VIASH_PAR_NEGATIVE_SAMPLE_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'init_pos': $( if [ ! -z ${VIASH_PAR_INIT_POS+x} ]; then echo "r'${VIASH_PAR_INIT_POS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'lambda': $( if [ ! -z ${VIASH_PAR_LAMBDA+x} ]; then echo "float(r'${VIASH_PAR_LAMBDA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'fraction': $( if [ ! -z ${VIASH_PAR_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'var_shift': $( if [ ! -z ${VIASH_PAR_VAR_SHIFT+x} ]; then echo "float(r'${VIASH_PAR_VAR_SHIFT//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +try: + data = mu.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' not found in the input data." + ) from e + +logger.info("Computing densMAP for modality '%s'", par["modality"]) + +neigh_key = par["uns_neighbors"] + +if neigh_key not in data.uns: + raise ValueError( + f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first." + ) + +temp_uns = {neigh_key: data.uns[neigh_key]} + +if "use_rep" not in temp_uns[neigh_key]["params"]: + raise ValueError( + f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first." + ) + + +X_densmap = UMAP( + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + n_epochs=par["max_iter"], + learning_rate=par["alpha"], + repulsion_strength=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init=par["init_pos"], + metric=data.uns["neighbors"].get("metric", "euclidean"), + metric_kwds=data.uns["neighbors"].get("metric_kwds", {}), + densmap=True, + dens_lambda=par["lambda"], + dens_frac=par["fraction"], + dens_var_shift=par["var_shift"], +).fit_transform(data.obsm[par["obsm_pca"]]) + +logger.info( + f"Writing densMAP embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = X_densmap + +logger.info(f"Writing densMAP metadata to .mod[{par['modality']}].uns['densmap']") +data.uns["densmap"] = { + "params": { + "min_dist": par["min_dist"], + "spread": par["spread"], + "n_components": par["num_components"], + "n_epochs": par["max_iter"], + "learning_rate": par["alpha"], + "repulsion_strength": par["gamma"], + "negative_sample_rate": par["negative_sample_rate"], + "init": par["init_pos"], + "metric": data.uns["neighbors"].get("metric", "euclidean"), + "metric_kwds": data.uns["neighbors"].get("metric_kwds", {}), + "dens_lambda": par["lambda"], + "dens_frac": par["fraction"], + "dens_var_shift": par["var_shift"], + } +} + +logger.info("Writing to %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dimred/densmap/nextflow_labels.config b/target/executable/dimred/densmap/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dimred/densmap/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dimred/densmap/setup_logger.py b/target/executable/dimred/densmap/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dimred/densmap/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/lsi/.config.vsh.yaml b/target/executable/dimred/lsi/.config.vsh.yaml new file mode 100644 index 00000000..f2e3eafa --- /dev/null +++ b/target/executable/dimred/lsi/.config.vsh.yaml @@ -0,0 +1,361 @@ +name: "lsi" +namespace: "dimred" +version: "main" +authors: +- name: "Sarah Ouologuem" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "SarahOuologuem" + orcid: "0009-0005-3398-1700" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +- name: "Vladimir Shitov" + roles: + - "contributor" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "On which modality to run LSI on." + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Use specified layer for expression values. If not specified, uses\ + \ adata.X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "Column name in .var matrix that will be used to select which genes\ + \ to run the LSI on. If not specified, uses all features." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "LSI options" + arguments: + - type: "integer" + name: "--num_components" + description: "Number of components to compute." + info: null + default: + - 50 + required: false + min: 2 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--scale_embeddings" + description: "Scale embeddings to zero mean and unit variance." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting embedding." + info: null + default: + - "X_lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_output" + description: "In which .varm slot to store the resulting loadings matrix." + info: null + default: + - "lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "In which .uns slot to store the stdev." + info: null + default: + - "lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite" + description: "Allow overwriting .obsm, .varm and .uns slots." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings\ + \ and singular values. Uses the implementation of scipy.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "concat_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "muon~=0.1.6" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/lsi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dimred/lsi" + executable: "target/executable/dimred/lsi/lsi" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dimred/lsi/compress_h5mu.py b/target/executable/dimred/lsi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dimred/lsi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dimred/lsi/lsi b/target/executable/dimred/lsi/lsi new file mode 100755 index 00000000..c4eb5607 --- /dev/null +++ b/target/executable/dimred/lsi/lsi @@ -0,0 +1,1454 @@ +#!/usr/bin/env bash + +# lsi main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Sarah Ouologuem (contributor) +# * Vladimir Shitov (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="lsi" +VIASH_META_FUNCTIONALITY_NAME="lsi" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps pkg-config libhdf5-dev gcc && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "muon~=0.1.6" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Sarah Ouologuem, Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component dimred lsi" +LABEL org.opencontainers.image.created="2025-08-22T07:23:14Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "lsi main" + echo "" + echo "Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and" + echo "singular values. Uses the implementation of scipy." + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Path to input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: atac" + echo " On which modality to run LSI on." + echo "" + echo " --layer" + echo " type: string" + echo " Use specified layer for expression values. If not specified, uses" + echo " adata.X." + echo "" + echo " --var_input" + echo " type: string" + echo " Column name in .var matrix that will be used to select which genes to" + echo " run the LSI on. If not specified, uses all features." + echo "" + echo "LSI options:" + echo " --num_components" + echo " type: integer" + echo " default: 50" + echo " min: 2" + echo " Number of components to compute." + echo "" + echo " --scale_embeddings" + echo " type: boolean" + echo " default: true" + echo " Scale embeddings to zero mean and unit variance." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_lsi" + echo " In which .obsm slot to store the resulting embedding." + echo "" + echo " --varm_output" + echo " type: string" + echo " default: lsi" + echo " In which .varm slot to store the resulting loadings matrix." + echo "" + echo " --uns_output" + echo " type: string" + echo " default: lsi" + echo " In which .uns slot to store the stdev." + echo "" + echo " --overwrite" + echo " type: boolean_true" + echo " Allow overwriting .obsm, .varm and .uns slots." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "lsi main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_components) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_components=*) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components=*\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scale_embeddings) + [ -n "$VIASH_PAR_SCALE_EMBEDDINGS" ] && ViashError Bad arguments for option \'--scale_embeddings\': \'$VIASH_PAR_SCALE_EMBEDDINGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_EMBEDDINGS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scale_embeddings. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scale_embeddings=*) + [ -n "$VIASH_PAR_SCALE_EMBEDDINGS" ] && ViashError Bad arguments for option \'--scale_embeddings=*\': \'$VIASH_PAR_SCALE_EMBEDDINGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_EMBEDDINGS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varm_output) + [ -n "$VIASH_PAR_VARM_OUTPUT" ] && ViashError Bad arguments for option \'--varm_output\': \'$VIASH_PAR_VARM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varm_output=*) + [ -n "$VIASH_PAR_VARM_OUTPUT" ] && ViashError Bad arguments for option \'--varm_output=*\': \'$VIASH_PAR_VARM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_output) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output=*) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output=*\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --overwrite) + [ -n "$VIASH_PAR_OVERWRITE" ] && ViashError Bad arguments for option \'--overwrite\': \'$VIASH_PAR_OVERWRITE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OVERWRITE=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dimred/lsi:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="atac" +fi +if [ -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then + VIASH_PAR_NUM_COMPONENTS="50" +fi +if [ -z ${VIASH_PAR_SCALE_EMBEDDINGS+x} ]; then + VIASH_PAR_SCALE_EMBEDDINGS="true" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_lsi" +fi +if [ -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then + VIASH_PAR_VARM_OUTPUT="lsi" +fi +if [ -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then + VIASH_PAR_UNS_OUTPUT="lsi" +fi +if [ -z ${VIASH_PAR_OVERWRITE+x} ]; then + VIASH_PAR_OVERWRITE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NUM_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_NUM_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_NUM_COMPONENTS -lt 2 ]]; then + ViashError '--num_components' has be more than or equal to 2. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCALE_EMBEDDINGS" ]]; then + if ! [[ "$VIASH_PAR_SCALE_EMBEDDINGS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--scale_embeddings' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OVERWRITE" ]]; then + if ! [[ "$VIASH_PAR_OVERWRITE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--overwrite' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-lsi-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import muon as mu +import mudata as md +from anndata import AnnData +import numpy as np +import sys + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scale_embeddings': $( if [ ! -z ${VIASH_PAR_SCALE_EMBEDDINGS+x} ]; then echo "r'${VIASH_PAR_SCALE_EMBEDDINGS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varm_output': $( if [ ! -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_VARM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'overwrite': $( if [ ! -z ${VIASH_PAR_OVERWRITE+x} ]; then echo "r'${VIASH_PAR_OVERWRITE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from subset_vars import subset_vars + +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +# 1.read in adata +logger.info("Reading %s.", par["input"]) +try: + adata = md.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' was not found in mudata {par['input']}." + ) from e + +# 3. Specify layer +if par["layer"] and par["layer"] not in adata.layers: + raise ValueError( + f"Layer '{par['layer']}' was not found in modality '{par['modality']}'." + ) +layer = adata.X if not par["layer"] else adata.layers[par["layer"]] +adata_input_layer = AnnData(layer, var=adata.var) + + +if not par["layer"]: + logger.info("Using modality '%s' and adata.X for LSI computation", par["modality"]) +else: + logger.info( + "Using modality '%s' and layer '%s' for LSI computation", + par["modality"], + par["layer"], + ) + + +# 4. Subset on highly variable features if applicable +if par["var_input"]: + adata_input_layer = subset_vars(adata_input_layer, par["var_input"]) + + +# 5. Run LSI +logger.info( + "Computing %s LSI components on %s features", + par["num_components"], + adata_input_layer.X.shape[1], +) +mu.atac.tl.lsi( + adata_input_layer, + scale_embeddings=par["scale_embeddings"], + n_comps=par["num_components"], +) + + +# 6. Store output in object +check_exist_dict = { + "obsm_output": ("obsm"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(adata, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(adata, field)[par[parameter_name]] + +adata.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_lsi"] +adata.uns[par["uns_output"]] = adata_input_layer.uns["lsi"] +if par["var_input"]: + adata.varm[par["varm_output"]] = np.zeros( + shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]) + ) + adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = ( + adata_input_layer.varm["LSI"] + ) +else: + adata.varm[par["varm_output"]] = adata_input_layer.varm["LSI"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dimred/lsi/nextflow_labels.config b/target/executable/dimred/lsi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dimred/lsi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dimred/lsi/setup_logger.py b/target/executable/dimred/lsi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dimred/lsi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/lsi/subset_vars.py b/target/executable/dimred/lsi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/dimred/lsi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/dimred/pca/.config.vsh.yaml b/target/executable/dimred/pca/.config.vsh.yaml new file mode 100644 index 00000000..11e9022e --- /dev/null +++ b/target/executable/dimred/pca/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "pca" +namespace: "dimred" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "Column name in .var matrix that will be used to select which genes\ + \ to run the PCA on." + info: null + example: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_output" + description: "In which .varm slot to store the resulting loadings matrix." + info: null + default: + - "pca_loadings" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "In which .uns slot to store the resulting variance objects." + info: null + default: + - "pca_variance" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "Number of principal components to compute. Defaults to 50, or 1\ + \ - minimum dimension size of selected representation." + info: null + example: + - 25 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite" + description: "Allow overwriting .obsm, .varm and .uns slots." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Computes PCA coordinates, loadings and variance decomposition. Uses\ + \ the implementation of scikit-learn [Pedregosa11].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/pca/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dimred/pca" + executable: "target/executable/dimred/pca/pca" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dimred/pca/compress_h5mu.py b/target/executable/dimred/pca/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dimred/pca/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dimred/pca/nextflow_labels.config b/target/executable/dimred/pca/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dimred/pca/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dimred/pca/pca b/target/executable/dimred/pca/pca new file mode 100755 index 00000000..4ccecdae --- /dev/null +++ b/target/executable/dimred/pca/pca @@ -0,0 +1,1393 @@ +#!/usr/bin/env bash + +# pca main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="pca" +VIASH_META_FUNCTIONALITY_NAME="pca" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer" +LABEL org.opencontainers.image.description="Companion container for running component dimred pca" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "pca main" + echo "" + echo "Computes PCA coordinates, loadings and variance decomposition. Uses the" + echo "implementation of scikit-learn [Pedregosa11]." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " Use specified layer for expression values instead of the .X object from" + echo " the modality." + echo "" + echo " --var_input" + echo " type: string" + echo " example: filter_with_hvg" + echo " Column name in .var matrix that will be used to select which genes to" + echo " run the PCA on." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_pca" + echo " In which .obsm slot to store the resulting embedding." + echo "" + echo " --varm_output" + echo " type: string" + echo " default: pca_loadings" + echo " In which .varm slot to store the resulting loadings matrix." + echo "" + echo " --uns_output" + echo " type: string" + echo " default: pca_variance" + echo " In which .uns slot to store the resulting variance objects." + echo "" + echo " --num_components" + echo " type: integer" + echo " example: 25" + echo " Number of principal components to compute. Defaults to 50, or 1 -" + echo " minimum dimension size of selected representation." + echo "" + echo " --overwrite" + echo " type: boolean_true" + echo " Allow overwriting .obsm, .varm and .uns slots." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "pca main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varm_output) + [ -n "$VIASH_PAR_VARM_OUTPUT" ] && ViashError Bad arguments for option \'--varm_output\': \'$VIASH_PAR_VARM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varm_output=*) + [ -n "$VIASH_PAR_VARM_OUTPUT" ] && ViashError Bad arguments for option \'--varm_output=*\': \'$VIASH_PAR_VARM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_output) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output=*) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output=*\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_components) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_components=*) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components=*\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --overwrite) + [ -n "$VIASH_PAR_OVERWRITE" ] && ViashError Bad arguments for option \'--overwrite\': \'$VIASH_PAR_OVERWRITE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OVERWRITE=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dimred/pca:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_pca" +fi +if [ -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then + VIASH_PAR_VARM_OUTPUT="pca_loadings" +fi +if [ -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then + VIASH_PAR_UNS_OUTPUT="pca_variance" +fi +if [ -z ${VIASH_PAR_OVERWRITE+x} ]; then + VIASH_PAR_OVERWRITE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NUM_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_NUM_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OVERWRITE" ]]; then + if ! [[ "$VIASH_PAR_OVERWRITE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--overwrite' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-pca-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import sys +from anndata import AnnData + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varm_output': $( if [ ! -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_VARM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'overwrite': $( if [ ! -z ${VIASH_PAR_OVERWRITE+x} ]; then echo "r'${VIASH_PAR_OVERWRITE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing PCA components for modality '%s'", par["modality"]) +if par["layer"] and par["layer"] not in data.layers: + raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.") +layer = data.X if not par["layer"] else data.layers[par["layer"]] +adata_input_layer = AnnData(layer) +adata_input_layer.var.index = data.var.index + +use_highly_variable = False +if par["var_input"]: + if par["var_input"] not in data.var.columns: + raise ValueError( + f"Requested to use .var column {par['var_input']} " + "as a selection of genes to run the PCA on, " + f"but the column is not available for modality {par['modality']}" + ) + use_highly_variable = True + adata_input_layer.var["highly_variable"] = data.var[par["var_input"]] + +# run pca +output_adata = sc.tl.pca( + adata_input_layer, + n_comps=par["num_components"], + copy=True, + use_highly_variable=use_highly_variable, +) + +# store output in specific objects + +check_exist_dict = { + "obsm_output": ("obs"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(data, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(data, field)[par[parameter_name]] + +data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"] +data.varm[par["varm_output"]] = output_adata.varm["PCs"] +data.uns[par["uns_output"]] = { + "variance": output_adata.uns["pca"]["variance"], + "variance_ratio": output_adata.uns["pca"]["variance_ratio"], +} + + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dimred/pca/setup_logger.py b/target/executable/dimred/pca/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dimred/pca/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/tsne/.config.vsh.yaml b/target/executable/dimred/tsne/.config.vsh.yaml new file mode 100644 index 00000000..8ae65e2b --- /dev/null +++ b/target/executable/dimred/tsne/.config.vsh.yaml @@ -0,0 +1,370 @@ +name: "tsne" +namespace: "dimred" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--use_rep" + description: "The `.obsm` slot to use as input for the tSNE computation." + info: null + example: + - "X_pca" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The .obsm key to use for storing the tSNE results." + info: null + default: + - "X_tsne" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--n_pcs" + description: "The number of principal components to use for the tSNE computation." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--perplexity" + description: "The perplexity is related to the number of nearest neighbors that\ + \ is used in other manifold learning algorithms. Larger datasets usually require\ + \ a larger perplexity. Consider selecting a value between 5 and 50. Different\ + \ values can result in significantly different results." + info: null + default: + - 30.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result in a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn closer together, while larger values will result\ + \ on a more even dispersal of points. The value should be set relative to the\ + \ spread value, which determines the scale at which embedded points will be\ + \ spread out." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--metric" + description: "Distance metric to calculate neighbors on." + info: null + default: + - "euclidean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_exaggeration" + description: "Controls how tight natural clusters in the original space are in\ + \ the embedded space and how much space will be between them. For larger values,\ + \ the space between natural clusters will be larger in the embedded space. Again,\ + \ the choice of this parameter is not very critical. If the cost function increases\ + \ during initial optimization, the early exaggeration factor or the learning\ + \ rate might be too high." + info: null + default: + - 12.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate" + description: "The learning rate for t-SNE optimization. Typical values range between\ + \ 10.0 and 1000.0." + info: null + default: + - 1000.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed to use for the tSNE computation." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality\ + \ reduction technique used to visualize high-dimensional data in a low-dimensional\ + \ space, revealing patterns and clusters by preserving local data similarities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/tsne/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dimred/tsne" + executable: "target/executable/dimred/tsne/tsne" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dimred/tsne/compress_h5mu.py b/target/executable/dimred/tsne/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dimred/tsne/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dimred/tsne/nextflow_labels.config b/target/executable/dimred/tsne/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dimred/tsne/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dimred/tsne/setup_logger.py b/target/executable/dimred/tsne/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dimred/tsne/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/tsne/tsne b/target/executable/dimred/tsne/tsne new file mode 100755 index 00000000..33a462f8 --- /dev/null +++ b/target/executable/dimred/tsne/tsne @@ -0,0 +1,1463 @@ +#!/usr/bin/env bash + +# tsne main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="tsne" +VIASH_META_FUNCTIONALITY_NAME="tsne" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component dimred tsne" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "tsne main" + echo "" + echo "t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality" + echo "reduction technique used to visualize high-dimensional data in a low-dimensional" + echo "space, revealing patterns and clusters by preserving local data similarities." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string, required parameter" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --use_rep" + echo " type: string, required parameter" + echo " example: X_pca" + echo " The \`.obsm\` slot to use as input for the tSNE computation." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_tsne" + echo " The .obsm key to use for storing the tSNE results." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --n_pcs" + echo " type: integer" + echo " default: 50" + echo " The number of principal components to use for the tSNE computation." + echo "" + echo " --perplexity" + echo " type: double" + echo " default: 30.0" + echo " The perplexity is related to the number of nearest neighbors that is" + echo " used in other manifold learning algorithms. Larger datasets usually" + echo " require a larger perplexity. Consider selecting a value between 5 and" + echo " 50. Different values can result in significantly different results." + echo "" + echo " --min_dist" + echo " type: double" + echo " default: 0.5" + echo " The effective minimum distance between embedded points. Smaller values" + echo " will result in a more clustered/clumped embedding where nearby points on" + echo " the manifold are drawn closer together, while larger values will result" + echo " on a more even dispersal of points. The value should be set relative to" + echo " the spread value, which determines the scale at which embedded points" + echo " will be spread out." + echo "" + echo " --metric" + echo " type: string" + echo " default: euclidean" + echo " Distance metric to calculate neighbors on." + echo "" + echo " --early_exaggeration" + echo " type: double" + echo " default: 12.0" + echo " Controls how tight natural clusters in the original space are in the" + echo " embedded space and how much space will be between them. For larger" + echo " values, the space between natural clusters will be larger in the" + echo " embedded space. Again, the choice of this parameter is not very" + echo " critical. If the cost function increases during initial optimization," + echo " the early exaggeration factor or the learning rate might be too high." + echo "" + echo " --learning_rate" + echo " type: double" + echo " default: 1000.0" + echo " The learning rate for t-SNE optimization. Typical values range between" + echo " 10.0 and 1000.0." + echo "" + echo " --random_state" + echo " type: integer" + echo " default: 0" + echo " The random seed to use for the tSNE computation." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "tsne main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --use_rep) + [ -n "$VIASH_PAR_USE_REP" ] && ViashError Bad arguments for option \'--use_rep\': \'$VIASH_PAR_USE_REP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_REP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_rep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_rep=*) + [ -n "$VIASH_PAR_USE_REP" ] && ViashError Bad arguments for option \'--use_rep=*\': \'$VIASH_PAR_USE_REP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_REP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_pcs) + [ -n "$VIASH_PAR_N_PCS" ] && ViashError Bad arguments for option \'--n_pcs\': \'$VIASH_PAR_N_PCS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PCS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_pcs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_pcs=*) + [ -n "$VIASH_PAR_N_PCS" ] && ViashError Bad arguments for option \'--n_pcs=*\': \'$VIASH_PAR_N_PCS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PCS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --perplexity) + [ -n "$VIASH_PAR_PERPLEXITY" ] && ViashError Bad arguments for option \'--perplexity\': \'$VIASH_PAR_PERPLEXITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PERPLEXITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --perplexity. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --perplexity=*) + [ -n "$VIASH_PAR_PERPLEXITY" ] && ViashError Bad arguments for option \'--perplexity=*\': \'$VIASH_PAR_PERPLEXITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PERPLEXITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_dist) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_dist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_dist=*) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist=*\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --metric) + [ -n "$VIASH_PAR_METRIC" ] && ViashError Bad arguments for option \'--metric\': \'$VIASH_PAR_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRIC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --metric. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --metric=*) + [ -n "$VIASH_PAR_METRIC" ] && ViashError Bad arguments for option \'--metric=*\': \'$VIASH_PAR_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRIC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_exaggeration) + [ -n "$VIASH_PAR_EARLY_EXAGGERATION" ] && ViashError Bad arguments for option \'--early_exaggeration\': \'$VIASH_PAR_EARLY_EXAGGERATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_EXAGGERATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_exaggeration. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_exaggeration=*) + [ -n "$VIASH_PAR_EARLY_EXAGGERATION" ] && ViashError Bad arguments for option \'--early_exaggeration=*\': \'$VIASH_PAR_EARLY_EXAGGERATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_EXAGGERATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --learning_rate) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --learning_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --learning_rate=*) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate=*\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --random_state) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --random_state. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --random_state=*) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state=*\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dimred/tsne:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + ViashError '--modality' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_USE_REP+x} ]; then + ViashError '--use_rep' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_tsne" +fi +if [ -z ${VIASH_PAR_N_PCS+x} ]; then + VIASH_PAR_N_PCS="50" +fi +if [ -z ${VIASH_PAR_PERPLEXITY+x} ]; then + VIASH_PAR_PERPLEXITY="30.0" +fi +if [ -z ${VIASH_PAR_MIN_DIST+x} ]; then + VIASH_PAR_MIN_DIST="0.5" +fi +if [ -z ${VIASH_PAR_METRIC+x} ]; then + VIASH_PAR_METRIC="euclidean" +fi +if [ -z ${VIASH_PAR_EARLY_EXAGGERATION+x} ]; then + VIASH_PAR_EARLY_EXAGGERATION="12.0" +fi +if [ -z ${VIASH_PAR_LEARNING_RATE+x} ]; then + VIASH_PAR_LEARNING_RATE="1000.0" +fi +if [ -z ${VIASH_PAR_RANDOM_STATE+x} ]; then + VIASH_PAR_RANDOM_STATE="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_PCS" ]]; then + if ! [[ "$VIASH_PAR_N_PCS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_pcs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PERPLEXITY" ]]; then + if ! [[ "$VIASH_PAR_PERPLEXITY" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--perplexity' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_DIST" ]]; then + if ! [[ "$VIASH_PAR_MIN_DIST" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_dist' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_EXAGGERATION" ]]; then + if ! [[ "$VIASH_PAR_EARLY_EXAGGERATION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--early_exaggeration' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEARNING_RATE" ]]; then + if ! [[ "$VIASH_PAR_LEARNING_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--learning_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RANDOM_STATE" ]]; then + if ! [[ "$VIASH_PAR_RANDOM_STATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--random_state' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-tsne-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'use_rep': $( if [ ! -z ${VIASH_PAR_USE_REP+x} ]; then echo "r'${VIASH_PAR_USE_REP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_pcs': $( if [ ! -z ${VIASH_PAR_N_PCS+x} ]; then echo "int(r'${VIASH_PAR_N_PCS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'perplexity': $( if [ ! -z ${VIASH_PAR_PERPLEXITY+x} ]; then echo "float(r'${VIASH_PAR_PERPLEXITY//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'metric': $( if [ ! -z ${VIASH_PAR_METRIC+x} ]; then echo "r'${VIASH_PAR_METRIC//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_exaggeration': $( if [ ! -z ${VIASH_PAR_EARLY_EXAGGERATION+x} ]; then echo "float(r'${VIASH_PAR_EARLY_EXAGGERATION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing tSNE for modality '%s'", par["modality"]) +if par["use_rep"] not in data.obsm.keys(): + raise ValueError( + f"'{par['use_rep']}' was not found in .mod['{par['modality']}'].obsm. No precomputed PCA provided. Please run PCA first." + ) +temp_obsm = {par["use_rep"]: data.obsm[par["use_rep"]]} + +temp_adata = ad.AnnData(obsm=temp_obsm, shape=data.shape) + +sc.tl.tsne( + adata=temp_adata, + n_pcs=par["n_pcs"], + use_rep=par["use_rep"], + perplexity=par["perplexity"], + metric=par["metric"], + early_exaggeration=par["early_exaggeration"], + learning_rate=par["learning_rate"], + random_state=par["random_state"], + n_jobs=meta["cpus"], +) + +logger.info( + f"Writing tSNE embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_tsne"] + +logger.info(f"Writing tSNE metadata to .mod[{par['modality']}].uns['tsne']") +data.uns["tsne"] = temp_adata.uns["tsne"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/dimred/umap/.config.vsh.yaml b/target/executable/dimred/umap/.config.vsh.yaml new file mode 100644 index 00000000..9de292c8 --- /dev/null +++ b/target/executable/dimred/umap/.config.vsh.yaml @@ -0,0 +1,377 @@ +name: "umap" +namespace: "dimred" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_neighbors" + description: "The `.uns` neighbors slot as output by the `find_neighbors` component." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The pre/postfix under which to store the UMAP results." + info: null + default: + - "umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result in a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn closer together, while larger values will result\ + \ on a more even dispersal of points. The value should be set relative to the\ + \ spread value, which determines the scale at which embedded points will be\ + \ spread out." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--spread" + description: "The effective scale of embedded points. In combination with `min_dist`\ + \ this determines how clustered/clumped the embedded points are." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "The number of dimensions of the embedding." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "The number of iterations (epochs) of the optimization. Called `n_epochs`\ + \ in the original UMAP. Default is set to 500 if neighbors['connectivities'].shape[0]\ + \ <= 10000, else 200." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "The initial learning rate for the embedding optimization." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--gamma" + description: "Weighting applied to negative samples in low dimensional embedding\ + \ optimization. Values higher than one will result in greater weight being given\ + \ to negative samples." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--negative_sample_rate" + description: "The number of negative edge/1-simplex samples to use per positive\ + \ edge/1-simplex sample in optimizing the low dimensional embedding." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--init_pos" + description: "How to initialize the low dimensional embedding. Called `init` in\ + \ the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions\ + \ from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`:\ + \ assign initial embedding positions at random.\n" + info: null + default: + - "spectral" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning\ + \ technique suitable for visualizing high-dimensional data. Besides tending to be\ + \ faster than tSNE, it optimizes the embedding such that it best reflects the topology\ + \ of the data, which we represent throughout Scanpy using a neighborhood graph.\ + \ tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in\ + \ the embedding such that these best match the distribution of distances in the\ + \ high-dimensional space. We use the implementation of umap-learn [McInnes18]. For\ + \ a few comparisons of UMAP with tSNE, see this preprint.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/umap/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/dimred/umap" + executable: "target/executable/dimred/umap/umap" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/dimred/umap/compress_h5mu.py b/target/executable/dimred/umap/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/dimred/umap/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/dimred/umap/nextflow_labels.config b/target/executable/dimred/umap/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/dimred/umap/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/dimred/umap/setup_logger.py b/target/executable/dimred/umap/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/dimred/umap/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/dimred/umap/umap b/target/executable/dimred/umap/umap new file mode 100755 index 00000000..e1cf923b --- /dev/null +++ b/target/executable/dimred/umap/umap @@ -0,0 +1,1497 @@ +#!/usr/bin/env bash + +# umap main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="umap" +VIASH_META_FUNCTIONALITY_NAME="umap" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer" +LABEL org.opencontainers.image.description="Companion container for running component dimred umap" +LABEL org.opencontainers.image.created="2025-08-22T07:23:13Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "umap main" + echo "" + echo "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning" + echo "technique suitable for visualizing high-dimensional data. Besides tending to be" + echo "faster than tSNE, it optimizes the embedding such that it best reflects the" + echo "topology of the data, which we represent throughout Scanpy using a neighborhood" + echo "graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor" + echo "distances in the embedding such that these best match the distribution of" + echo "distances in the high-dimensional space. We use the implementation of umap-learn" + echo "[McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --uns_neighbors" + echo " type: string" + echo " default: neighbors" + echo " The \`.uns\` neighbors slot as output by the \`find_neighbors\` component." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: umap" + echo " The pre/postfix under which to store the UMAP results." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --min_dist" + echo " type: double" + echo " default: 0.5" + echo " The effective minimum distance between embedded points. Smaller values" + echo " will result in a more clustered/clumped embedding where nearby points on" + echo " the manifold are drawn closer together, while larger values will result" + echo " on a more even dispersal of points. The value should be set relative to" + echo " the spread value, which determines the scale at which embedded points" + echo " will be spread out." + echo "" + echo " --spread" + echo " type: double" + echo " default: 1.0" + echo " The effective scale of embedded points. In combination with \`min_dist\`" + echo " this determines how clustered/clumped the embedded points are." + echo "" + echo " --num_components" + echo " type: integer" + echo " default: 2" + echo " The number of dimensions of the embedding." + echo "" + echo " --max_iter" + echo " type: integer" + echo " The number of iterations (epochs) of the optimization. Called \`n_epochs\`" + echo " in the original UMAP. Default is set to 500 if" + echo " neighbors['connectivities'].shape[0] <= 10000, else 200." + echo "" + echo " --alpha" + echo " type: double" + echo " default: 1.0" + echo " The initial learning rate for the embedding optimization." + echo "" + echo " --gamma" + echo " type: double" + echo " default: 1.0" + echo " Weighting applied to negative samples in low dimensional embedding" + echo " optimization. Values higher than one will result in greater weight being" + echo " given to negative samples." + echo "" + echo " --negative_sample_rate" + echo " type: integer" + echo " default: 5" + echo " The number of negative edge/1-simplex samples to use per positive" + echo " edge/1-simplex sample in optimizing the low dimensional embedding." + echo "" + echo " --init_pos" + echo " type: string" + echo " default: spectral" + echo " How to initialize the low dimensional embedding. Called \`init\` in the" + echo " original UMAP. Options are:" + echo " * Any key from \`.obsm\`" + echo " * \`'paga'\`: positions from \`paga()\`" + echo " * \`'spectral'\`: use a spectral embedding of the graph" + echo " * \`'random'\`: assign initial embedding positions at random." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "umap main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_neighbors) + [ -n "$VIASH_PAR_UNS_NEIGHBORS" ] && ViashError Bad arguments for option \'--uns_neighbors\': \'$VIASH_PAR_UNS_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_neighbors=*) + [ -n "$VIASH_PAR_UNS_NEIGHBORS" ] && ViashError Bad arguments for option \'--uns_neighbors=*\': \'$VIASH_PAR_UNS_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_dist) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_dist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_dist=*) + [ -n "$VIASH_PAR_MIN_DIST" ] && ViashError Bad arguments for option \'--min_dist=*\': \'$VIASH_PAR_MIN_DIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --spread) + [ -n "$VIASH_PAR_SPREAD" ] && ViashError Bad arguments for option \'--spread\': \'$VIASH_PAR_SPREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --spread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --spread=*) + [ -n "$VIASH_PAR_SPREAD" ] && ViashError Bad arguments for option \'--spread=*\': \'$VIASH_PAR_SPREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_components) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_components=*) + [ -n "$VIASH_PAR_NUM_COMPONENTS" ] && ViashError Bad arguments for option \'--num_components=*\': \'$VIASH_PAR_NUM_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_iter) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_iter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_iter=*) + [ -n "$VIASH_PAR_MAX_ITER" ] && ViashError Bad arguments for option \'--max_iter=*\': \'$VIASH_PAR_MAX_ITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_ITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alpha=*) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha=*\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gamma) + [ -n "$VIASH_PAR_GAMMA" ] && ViashError Bad arguments for option \'--gamma\': \'$VIASH_PAR_GAMMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GAMMA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gamma. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gamma=*) + [ -n "$VIASH_PAR_GAMMA" ] && ViashError Bad arguments for option \'--gamma=*\': \'$VIASH_PAR_GAMMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GAMMA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --negative_sample_rate) + [ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ] && ViashError Bad arguments for option \'--negative_sample_rate\': \'$VIASH_PAR_NEGATIVE_SAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NEGATIVE_SAMPLE_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --negative_sample_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --negative_sample_rate=*) + [ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ] && ViashError Bad arguments for option \'--negative_sample_rate=*\': \'$VIASH_PAR_NEGATIVE_SAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NEGATIVE_SAMPLE_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --init_pos) + [ -n "$VIASH_PAR_INIT_POS" ] && ViashError Bad arguments for option \'--init_pos\': \'$VIASH_PAR_INIT_POS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_POS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --init_pos. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --init_pos=*) + [ -n "$VIASH_PAR_INIT_POS" ] && ViashError Bad arguments for option \'--init_pos=*\': \'$VIASH_PAR_INIT_POS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_POS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/dimred/umap:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then + VIASH_PAR_UNS_NEIGHBORS="neighbors" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="umap" +fi +if [ -z ${VIASH_PAR_MIN_DIST+x} ]; then + VIASH_PAR_MIN_DIST="0.5" +fi +if [ -z ${VIASH_PAR_SPREAD+x} ]; then + VIASH_PAR_SPREAD="1.0" +fi +if [ -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then + VIASH_PAR_NUM_COMPONENTS="2" +fi +if [ -z ${VIASH_PAR_ALPHA+x} ]; then + VIASH_PAR_ALPHA="1.0" +fi +if [ -z ${VIASH_PAR_GAMMA+x} ]; then + VIASH_PAR_GAMMA="1.0" +fi +if [ -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then + VIASH_PAR_NEGATIVE_SAMPLE_RATE="5" +fi +if [ -z ${VIASH_PAR_INIT_POS+x} ]; then + VIASH_PAR_INIT_POS="spectral" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_DIST" ]]; then + if ! [[ "$VIASH_PAR_MIN_DIST" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_dist' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SPREAD" ]]; then + if ! [[ "$VIASH_PAR_SPREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--spread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NUM_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_NUM_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_ITER" ]]; then + if ! [[ "$VIASH_PAR_MAX_ITER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_iter' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALPHA" ]]; then + if ! [[ "$VIASH_PAR_ALPHA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alpha' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GAMMA" ]]; then + if ! [[ "$VIASH_PAR_GAMMA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--gamma' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" ]]; then + if ! [[ "$VIASH_PAR_NEGATIVE_SAMPLE_RATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--negative_sample_rate' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-umap-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_neighbors': $( if [ ! -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then echo "r'${VIASH_PAR_UNS_NEIGHBORS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'spread': $( if [ ! -z ${VIASH_PAR_SPREAD+x} ]; then echo "float(r'${VIASH_PAR_SPREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gamma': $( if [ ! -z ${VIASH_PAR_GAMMA+x} ]; then echo "float(r'${VIASH_PAR_GAMMA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'negative_sample_rate': $( if [ ! -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then echo "int(r'${VIASH_PAR_NEGATIVE_SAMPLE_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'init_pos': $( if [ ! -z ${VIASH_PAR_INIT_POS+x} ]; then echo "r'${VIASH_PAR_INIT_POS//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing UMAP for modality '%s'", par["modality"]) + +if par["uns_neighbors"] not in data.uns: + raise ValueError( + f"'{par['uns_neighbors']}' was not found in .mod['{par['modality']}'].uns." + ) + +# create temporary AnnData +# ... because sc.tl.umap doesn't allow to choose +# the obsm output slot +# ... also we can see scanpy is a data format dependency hell +neigh_key = par["uns_neighbors"] +temp_uns = {neigh_key: data.uns[neigh_key]} +conn_key = temp_uns[neigh_key]["connectivities_key"] +dist_key = temp_uns[neigh_key]["distances_key"] +temp_obsp = { + conn_key: data.obsp[conn_key], + dist_key: data.obsp[dist_key], +} +pca_key = temp_uns[neigh_key]["params"]["use_rep"] +temp_obsm = {pca_key: data.obsm[pca_key]} + +temp_adata = ad.AnnData(obsm=temp_obsm, obsp=temp_obsp, uns=temp_uns, shape=data.shape) + +sc.tl.umap( + temp_adata, + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + maxiter=par["max_iter"], + alpha=par["alpha"], + gamma=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init_pos=par["init_pos"], + neighbors_key=neigh_key, +) + +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_umap"] + +logger.info("Writing to %s, compression %s", par["output"], par["output_compression"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/download/download_file/.config.vsh.yaml b/target/executable/download/download_file/.config.vsh.yaml new file mode 100644 index 00000000..d40edd33 --- /dev/null +++ b/target/executable/download/download_file/.config.vsh.yaml @@ -0,0 +1,208 @@ +name: "download_file" +namespace: "download" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--input" + description: "URL to a file to download." + info: null + example: + - "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Path where to store output." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--verbose" + alternatives: + - "-v" + description: "Increase verbosity" + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Download a file.\n" +usage: "download_file \\\n --input https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5\ + \ \\\n --output output_rna.h5\n" +test_resources: +- type: "bash_script" + path: "run_test.sh" + is_executable: true +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bash:5.1.16" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/download/download_file/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/download/download_file" + executable: "target/executable/download/download_file/download_file" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/download/download_file/download_file b/target/executable/download/download_file/download_file new file mode 100755 index 00000000..401793f2 --- /dev/null +++ b/target/executable/download/download_file/download_file @@ -0,0 +1,1133 @@ +#!/usr/bin/env bash + +# download_file main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="download_file" +VIASH_META_FUNCTIONALITY_NAME="download_file" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM bash:5.1.16 +ENTRYPOINT [] +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component download download_file" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "download_file main" + echo "" + echo "Download a file." + echo "" + echo "Usage:" + echo "download_file \\" + echo " --input" + echo "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + echo "\\" + echo " --output output_rna.h5" + echo "" + echo "Arguments:" + echo " --input" + echo " type: string, required parameter" + echo " example:" + echo "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + echo " URL to a file to download." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + echo " Path where to store output." + echo "" + echo " -v, --verbose" + echo " type: boolean_true" + echo " Increase verbosity" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "download_file main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --verbose) + [ -n "$VIASH_PAR_VERBOSE" ] && ViashError Bad arguments for option \'--verbose\': \'$VIASH_PAR_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSE=true + shift 1 + ;; + -v) + [ -n "$VIASH_PAR_VERBOSE" ] && ViashError Bad arguments for option \'-v\': \'$VIASH_PAR_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSE=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/download/download_file:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_VERBOSE+x} ]; then + VIASH_PAR_VERBOSE="false" +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_VERBOSE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--verbose' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-download_file-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo "${VIASH_PAR_VERBOSE}" | sed "s#'#'\"'\"'#g;s#.*#par_verbose='&'#" ; else echo "# par_verbose="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=() + +if [ "\$par_verbose" != "true" ]; then + extra_params+=("--quiet") +fi + +wget "\$par_input" -O "\$par_output" "\${extra_params[@]}" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/download/download_file/nextflow_labels.config b/target/executable/download/download_file/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/download/download_file/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/feature_annotation/align_query_reference/.config.vsh.yaml b/target/executable/feature_annotation/align_query_reference/.config.vsh.yaml new file mode 100644 index 00000000..67bd3232 --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/.config.vsh.yaml @@ -0,0 +1,564 @@ +name: "align_query_reference" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process. Note that the query and reference modalities\ + \ should be the same." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input (query) data containing raw counts if .X\ + \ is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer_lognormalized" + description: "The layer in the input (query) data containing log normalized counts\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the .var column in the input (query) data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "The name of the .obs column in the input (query) data containing\ + \ batch information.\n" + info: null + example: + - "sample_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "The name of the .obs column in the input (query) data containing\ + \ cell type labels. If not provided, the --unkown_celltype_label will be assigned\ + \ to all observations.\n" + info: null + example: + - "cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "Meta id value to be assigned to the --output_obs_id .obs field of\ + \ the aligned input (query) data.\n" + info: null + default: + - "query" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data containing raw counts if .X is not\ + \ to be used. Data are expected to be processed in the same way as the --input\ + \ query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer_lognormalized" + description: "The layer in the reference data containing log normalized counts\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the .var column in the reference data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch" + description: "The name of the .obs column in the reference data containing batch\ + \ information.\n" + info: null + example: + - "sample_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_label" + description: "The name of the .obs column in the reference data containing cell\ + \ type labels. If not provided, the --unkown_celltype_label will be assigned\ + \ to all observations.\n" + info: null + example: + - "cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_id" + description: "Meta id value to be assigned to the --output_obs_id .obs field of\ + \ the aligned reference data.\n" + info: null + default: + - "reference" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output_query" + description: "Aligned query data." + info: null + example: + - "output_query.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_reference" + description: "Aligned reference data." + info: null + example: + - "output_reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Name of the aligned layer containing raw counts in the output query\ + \ and reference datasets." + info: null + default: + - "_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer_lognormalized" + description: "Name of the aligned layer containing log normalized counts in the\ + \ output query and reference datasets." + info: null + default: + - "_log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_gene_names" + description: "Name of the .var column in the output query and reference datasets\ + \ containing the gene names." + info: null + default: + - "_gene_names" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_batch" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the batch information." + info: null + default: + - "_sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_label" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the cell type labels." + info: null + default: + - "_cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_id" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the dataset id." + info: null + default: + - "_dataset" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_index" + description: "Name of the .var column to which the .var index of the --input and\ + \ --reference datasets is stored. Only relevant if \"--preserve_var_index\"\ + \ is False." + info: null + default: + - "_ori_var_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_common_genes" + description: "Name of the .var column in the output query and reference datasets\ + \ containing the boolean array indicating the common variables." + info: null + default: + - "_common_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Arguments related to the alignment of the input and reference datasets." + arguments: + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--align_layers_raw_counts" + description: "Whether to align the query and reference layers containing raw counts." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--align_layers_lognormalized_counts" + description: "Whether to align the query and reference layers containing log normalized\ + \ counts." + info: null + direction: "input" + - type: "string" + name: "--unkown_celltype_label" + description: "The label to assign to cells with an unknown cell type.\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite_existing_key" + description: "If set to true and the layer, obs or var key already exists in the\ + \ query/reference file, the key will be overwritten." + info: null + direction: "input" + - type: "boolean_true" + name: "--preserve_var_index" + description: "If set to true, the .var index of the --input and --reference datasets\ + \ will be preserved.\nIf set to false (default behavior), the original .var\ + \ index will be stored in the --output_var_index .var column and the .var index\ + \ will be replaced with the sanitized & aligned gene names.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Alignment of a query and reference dataset by:\n* Alignment of layers\n\ + * Harmonization of .obs field names for batch and cell type labels\n* Harmonization\ + \ of .var field name for gene names\n* Sanitation of gene names\n* Cross-checking\ + \ of genes\n* Assignment of an id to the query and reference datasets\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/align_query_reference/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/feature_annotation/align_query_reference" + executable: "target/executable/feature_annotation/align_query_reference/align_query_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/feature_annotation/align_query_reference/align_query_reference b/target/executable/feature_annotation/align_query_reference/align_query_reference new file mode 100755 index 00000000..67db4bb2 --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/align_query_reference @@ -0,0 +1,2041 @@ +#!/usr/bin/env bash + +# align_query_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="align_query_reference" +VIASH_META_FUNCTIONALITY_NAME="align_query_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component feature_annotation align_query_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:08Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "align_query_reference main" + echo "" + echo "Alignment of a query and reference dataset by:" + echo "* Alignment of layers" + echo "* Harmonization of .obs field names for batch and cell type labels" + echo "* Harmonization of .var field name for gene names" + echo "* Sanitation of gene names" + echo "* Cross-checking of genes" + echo "* Assignment of an id to the query and reference datasets" + echo "" + echo "Inputs:" + echo " Input dataset (query) arguments" + echo "" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input (query) data to be labeled. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to process. Note that the query and reference modalities" + echo " should be the same." + echo "" + echo " --input_layer" + echo " type: string" + echo " The layer in the input (query) data containing raw counts if .X is not" + echo " to be used." + echo "" + echo " --input_layer_lognormalized" + echo " type: string" + echo " The layer in the input (query) data containing log normalized counts if" + echo " .X is not to be used." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " The name of the .var column in the input (query) data containing gene" + echo " names; when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --input_obs_batch" + echo " type: string, required parameter" + echo " example: sample_id" + echo " The name of the .obs column in the input (query) data containing batch" + echo " information." + echo "" + echo " --input_obs_label" + echo " type: string" + echo " example: cell_type" + echo " The name of the .obs column in the input (query) data containing cell" + echo " type labels. If not provided, the --unkown_celltype_label will be" + echo " assigned to all observations." + echo "" + echo " --input_id" + echo " type: string" + echo " default: query" + echo " Meta id value to be assigned to the --output_obs_id .obs field of the" + echo " aligned input (query) data." + echo "" + echo "Reference:" + echo " Arguments related to the reference dataset." + echo "" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train the CellTypist classifiers on. Only required" + echo " if a pre-trained --model is not provided." + echo "" + echo " --reference_layer" + echo " type: string" + echo " The layer in the reference data containing raw counts if .X is not to be" + echo " used. Data are expected to be processed in the same way as the --input" + echo " query dataset." + echo "" + echo " --reference_layer_lognormalized" + echo " type: string" + echo " The layer in the reference data containing log normalized counts if .X" + echo " is not to be used. Data are expected to be processed in the same way as" + echo " the --input query dataset." + echo "" + echo " --reference_var_gene_names" + echo " type: string" + echo " The name of the .var column in the reference data containing gene names;" + echo " when no gene_name_layer is provided, the var index will be used." + echo "" + echo " --reference_obs_batch" + echo " type: string, required parameter" + echo " example: sample_id" + echo " The name of the .obs column in the reference data containing batch" + echo " information." + echo "" + echo " --reference_obs_label" + echo " type: string" + echo " example: cell_type" + echo " The name of the .obs column in the reference data containing cell type" + echo " labels. If not provided, the --unkown_celltype_label will be assigned to" + echo " all observations." + echo "" + echo " --reference_id" + echo " type: string" + echo " default: reference" + echo " Meta id value to be assigned to the --output_obs_id .obs field of the" + echo " aligned reference data." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output_query" + echo " type: file, output, file must exist" + echo " example: output_query.h5mu" + echo " Aligned query data." + echo "" + echo " --output_reference" + echo " type: file, output, file must exist" + echo " example: output_reference.h5mu" + echo " Aligned reference data." + echo "" + echo " --output_layer" + echo " type: string" + echo " default: _counts" + echo " Name of the aligned layer containing raw counts in the output query and" + echo " reference datasets." + echo "" + echo " --output_layer_lognormalized" + echo " type: string" + echo " default: _log_normalized" + echo " Name of the aligned layer containing log normalized counts in the output" + echo " query and reference datasets." + echo "" + echo " --output_var_gene_names" + echo " type: string" + echo " default: _gene_names" + echo " Name of the .var column in the output query and reference datasets" + echo " containing the gene names." + echo "" + echo " --output_obs_batch" + echo " type: string" + echo " default: _sample_id" + echo " Name of the .obs column in the output query and reference datasets" + echo " containing the batch information." + echo "" + echo " --output_obs_label" + echo " type: string" + echo " default: _cell_type" + echo " Name of the .obs column in the output query and reference datasets" + echo " containing the cell type labels." + echo "" + echo " --output_obs_id" + echo " type: string" + echo " default: _dataset" + echo " Name of the .obs column in the output query and reference datasets" + echo " containing the dataset id." + echo "" + echo " --output_var_index" + echo " type: string" + echo " default: _ori_var_index" + echo " Name of the .var column to which the .var index of the --input and" + echo " --reference datasets is stored. Only relevant if \"--preserve_var_index\"" + echo " is False." + echo "" + echo " --output_var_common_genes" + echo " type: string" + echo " default: _common_vars" + echo " Name of the .var column in the output query and reference datasets" + echo " containing the boolean array indicating the common variables." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " Arguments related to the alignment of the input and reference datasets." + echo "" + echo " --input_reference_gene_overlap" + echo " type: integer" + echo " default: 100" + echo " min: 1" + echo " The minimum number of genes present in both the reference and query" + echo " datasets." + echo "" + echo " --align_layers_raw_counts" + echo " type: boolean" + echo " default: true" + echo " Whether to align the query and reference layers containing raw counts." + echo "" + echo " --align_layers_lognormalized_counts" + echo " type: boolean_true" + echo " Whether to align the query and reference layers containing log" + echo " normalized counts." + echo "" + echo " --unkown_celltype_label" + echo " type: string" + echo " default: Unknown" + echo " The label to assign to cells with an unknown cell type." + echo "" + echo " --overwrite_existing_key" + echo " type: boolean_true" + echo " If set to true and the layer, obs or var key already exists in the" + echo " query/reference file, the key will be overwritten." + echo "" + echo " --preserve_var_index" + echo " type: boolean_true" + echo " If set to true, the .var index of the --input and --reference datasets" + echo " will be preserved." + echo " If set to false (default behavior), the original .var index will be" + echo " stored in the --output_var_index .var column and the .var index will be" + echo " replaced with the sanitized & aligned gene names." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "align_query_reference main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer_lognormalized) + [ -n "$VIASH_PAR_INPUT_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--input_layer_lognormalized\': \'$VIASH_PAR_INPUT_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER_LOGNORMALIZED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer_lognormalized. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer_lognormalized=*) + [ -n "$VIASH_PAR_INPUT_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--input_layer_lognormalized=*\': \'$VIASH_PAR_INPUT_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER_LOGNORMALIZED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_batch) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_batch=*) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch=*\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_label) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_label=*) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label=*\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_id) + [ -n "$VIASH_PAR_INPUT_ID" ] && ViashError Bad arguments for option \'--input_id\': \'$VIASH_PAR_INPUT_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id=*) + [ -n "$VIASH_PAR_INPUT_ID" ] && ViashError Bad arguments for option \'--input_id=*\': \'$VIASH_PAR_INPUT_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER" ] && ViashError Bad arguments for option \'--reference_layer=*\': \'$VIASH_PAR_REFERENCE_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_layer_lognormalized) + [ -n "$VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--reference_layer_lognormalized\': \'$VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_layer_lognormalized. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_layer_lognormalized=*) + [ -n "$VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--reference_layer_lognormalized=*\': \'$VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_var_gene_names) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_var_gene_names=*) + [ -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--reference_var_gene_names=*\': \'$VIASH_PAR_REFERENCE_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_batch) + [ -n "$VIASH_PAR_REFERENCE_OBS_BATCH" ] && ViashError Bad arguments for option \'--reference_obs_batch\': \'$VIASH_PAR_REFERENCE_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_batch=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_BATCH" ] && ViashError Bad arguments for option \'--reference_obs_batch=*\': \'$VIASH_PAR_REFERENCE_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_label) + [ -n "$VIASH_PAR_REFERENCE_OBS_LABEL" ] && ViashError Bad arguments for option \'--reference_obs_label\': \'$VIASH_PAR_REFERENCE_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_label=*) + [ -n "$VIASH_PAR_REFERENCE_OBS_LABEL" ] && ViashError Bad arguments for option \'--reference_obs_label=*\': \'$VIASH_PAR_REFERENCE_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_id) + [ -n "$VIASH_PAR_REFERENCE_ID" ] && ViashError Bad arguments for option \'--reference_id\': \'$VIASH_PAR_REFERENCE_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_id=*) + [ -n "$VIASH_PAR_REFERENCE_ID" ] && ViashError Bad arguments for option \'--reference_id=*\': \'$VIASH_PAR_REFERENCE_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_query) + [ -n "$VIASH_PAR_OUTPUT_QUERY" ] && ViashError Bad arguments for option \'--output_query\': \'$VIASH_PAR_OUTPUT_QUERY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_QUERY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_query. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_query=*) + [ -n "$VIASH_PAR_OUTPUT_QUERY" ] && ViashError Bad arguments for option \'--output_query=*\': \'$VIASH_PAR_OUTPUT_QUERY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_QUERY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_reference) + [ -n "$VIASH_PAR_OUTPUT_REFERENCE" ] && ViashError Bad arguments for option \'--output_reference\': \'$VIASH_PAR_OUTPUT_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_reference=*) + [ -n "$VIASH_PAR_OUTPUT_REFERENCE" ] && ViashError Bad arguments for option \'--output_reference=*\': \'$VIASH_PAR_OUTPUT_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer_lognormalized) + [ -n "$VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--output_layer_lognormalized\': \'$VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer_lognormalized. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer_lognormalized=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED" ] && ViashError Bad arguments for option \'--output_layer_lognormalized=*\': \'$VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_gene_names) + [ -n "$VIASH_PAR_OUTPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--output_var_gene_names\': \'$VIASH_PAR_OUTPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_gene_names=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--output_var_gene_names=*\': \'$VIASH_PAR_OUTPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_batch) + [ -n "$VIASH_PAR_OUTPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--output_obs_batch\': \'$VIASH_PAR_OUTPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_batch=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--output_obs_batch=*\': \'$VIASH_PAR_OUTPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_label) + [ -n "$VIASH_PAR_OUTPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--output_obs_label\': \'$VIASH_PAR_OUTPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_label=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--output_obs_label=*\': \'$VIASH_PAR_OUTPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_id) + [ -n "$VIASH_PAR_OUTPUT_OBS_ID" ] && ViashError Bad arguments for option \'--output_obs_id\': \'$VIASH_PAR_OUTPUT_OBS_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_id=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_ID" ] && ViashError Bad arguments for option \'--output_obs_id=*\': \'$VIASH_PAR_OUTPUT_OBS_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_index) + [ -n "$VIASH_PAR_OUTPUT_VAR_INDEX" ] && ViashError Bad arguments for option \'--output_var_index\': \'$VIASH_PAR_OUTPUT_VAR_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_index=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_INDEX" ] && ViashError Bad arguments for option \'--output_var_index=*\': \'$VIASH_PAR_OUTPUT_VAR_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_common_genes) + [ -n "$VIASH_PAR_OUTPUT_VAR_COMMON_GENES" ] && ViashError Bad arguments for option \'--output_var_common_genes\': \'$VIASH_PAR_OUTPUT_VAR_COMMON_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_COMMON_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_common_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_common_genes=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_COMMON_GENES" ] && ViashError Bad arguments for option \'--output_var_common_genes=*\': \'$VIASH_PAR_OUTPUT_VAR_COMMON_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_COMMON_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_reference_gene_overlap) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_reference_gene_overlap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_reference_gene_overlap=*) + [ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ] && ViashError Bad arguments for option \'--input_reference_gene_overlap=*\': \'$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --align_layers_raw_counts) + [ -n "$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS" ] && ViashError Bad arguments for option \'--align_layers_raw_counts\': \'$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --align_layers_raw_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --align_layers_raw_counts=*) + [ -n "$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS" ] && ViashError Bad arguments for option \'--align_layers_raw_counts=*\': \'$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --align_layers_lognormalized_counts) + [ -n "$VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS" ] && ViashError Bad arguments for option \'--align_layers_lognormalized_counts\': \'$VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS=true + shift 1 + ;; + --unkown_celltype_label) + [ -n "$VIASH_PAR_UNKOWN_CELLTYPE_LABEL" ] && ViashError Bad arguments for option \'--unkown_celltype_label\': \'$VIASH_PAR_UNKOWN_CELLTYPE_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKOWN_CELLTYPE_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --unkown_celltype_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --unkown_celltype_label=*) + [ -n "$VIASH_PAR_UNKOWN_CELLTYPE_LABEL" ] && ViashError Bad arguments for option \'--unkown_celltype_label=*\': \'$VIASH_PAR_UNKOWN_CELLTYPE_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNKOWN_CELLTYPE_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --overwrite_existing_key) + [ -n "$VIASH_PAR_OVERWRITE_EXISTING_KEY" ] && ViashError Bad arguments for option \'--overwrite_existing_key\': \'$VIASH_PAR_OVERWRITE_EXISTING_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OVERWRITE_EXISTING_KEY=true + shift 1 + ;; + --preserve_var_index) + [ -n "$VIASH_PAR_PRESERVE_VAR_INDEX" ] && ViashError Bad arguments for option \'--preserve_var_index\': \'$VIASH_PAR_PRESERVE_VAR_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PRESERVE_VAR_INDEX=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/feature_annotation/align_query_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then + ViashError '--input_obs_batch' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then + ViashError '--reference_obs_batch' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_INPUT_ID+x} ]; then + VIASH_PAR_INPUT_ID="query" +fi +if [ -z ${VIASH_PAR_REFERENCE_ID+x} ]; then + VIASH_PAR_REFERENCE_ID="reference" +fi +if [ -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then + VIASH_PAR_OUTPUT_LAYER="_counts" +fi +if [ -z ${VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED+x} ]; then + VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED="_log_normalized" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_GENE_NAMES+x} ]; then + VIASH_PAR_OUTPUT_VAR_GENE_NAMES="_gene_names" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_BATCH+x} ]; then + VIASH_PAR_OUTPUT_OBS_BATCH="_sample_id" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_LABEL+x} ]; then + VIASH_PAR_OUTPUT_OBS_LABEL="_cell_type" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_ID+x} ]; then + VIASH_PAR_OUTPUT_OBS_ID="_dataset" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_INDEX+x} ]; then + VIASH_PAR_OUTPUT_VAR_INDEX="_ori_var_index" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_COMMON_GENES+x} ]; then + VIASH_PAR_OUTPUT_VAR_COMMON_GENES="_common_vars" +fi +if [ -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then + VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP="100" +fi +if [ -z ${VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS+x} ]; then + VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS="true" +fi +if [ -z ${VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS+x} ]; then + VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS="false" +fi +if [ -z ${VIASH_PAR_UNKOWN_CELLTYPE_LABEL+x} ]; then + VIASH_PAR_UNKOWN_CELLTYPE_LABEL="Unknown" +fi +if [ -z ${VIASH_PAR_OVERWRITE_EXISTING_KEY+x} ]; then + VIASH_PAR_OVERWRITE_EXISTING_KEY="false" +fi +if [ -z ${VIASH_PAR_PRESERVE_VAR_INDEX+x} ]; then + VIASH_PAR_PRESERVE_VAR_INDEX="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" ]]; then + if ! [[ "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--input_reference_gene_overlap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP -lt 1 ]]; then + ViashError '--input_reference_gene_overlap' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--align_layers_raw_counts' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--align_layers_lognormalized_counts' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OVERWRITE_EXISTING_KEY" ]]; then + if ! [[ "$VIASH_PAR_OVERWRITE_EXISTING_KEY" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--overwrite_existing_key' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PRESERVE_VAR_INDEX" ]]; then + if ! [[ "$VIASH_PAR_PRESERVE_VAR_INDEX" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--preserve_var_index' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_QUERY" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_QUERY")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_QUERY")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_REFERENCE" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_REFERENCE")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_REFERENCE")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_QUERY" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_QUERY")" ) + VIASH_PAR_OUTPUT_QUERY=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_QUERY") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_QUERY" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_REFERENCE")" ) + VIASH_PAR_OUTPUT_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_REFERENCE") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_REFERENCE" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-align_query_reference-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER_LOGNORMALIZED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_batch': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_label': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_id': $( if [ ! -z ${VIASH_PAR_REFERENCE_ID+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_query': $( if [ ! -z ${VIASH_PAR_OUTPUT_QUERY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_QUERY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_reference': $( if [ ! -z ${VIASH_PAR_OUTPUT_REFERENCE+x} ]; then echo "r'${VIASH_PAR_OUTPUT_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_gene_names': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_batch': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_label': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_id': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_ID+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_index': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_INDEX+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_common_genes': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_COMMON_GENES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_COMMON_GENES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'align_layers_raw_counts': $( if [ ! -z ${VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS+x} ]; then echo "r'${VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'align_layers_lognormalized_counts': $( if [ ! -z ${VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS+x} ]; then echo "r'${VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'unkown_celltype_label': $( if [ ! -z ${VIASH_PAR_UNKOWN_CELLTYPE_LABEL+x} ]; then echo "r'${VIASH_PAR_UNKOWN_CELLTYPE_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'overwrite_existing_key': $( if [ ! -z ${VIASH_PAR_OVERWRITE_EXISTING_KEY+x} ]; then echo "r'${VIASH_PAR_OVERWRITE_EXISTING_KEY//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'preserve_var_index': $( if [ ! -z ${VIASH_PAR_PRESERVE_VAR_INDEX+x} ]; then echo "r'${VIASH_PAR_PRESERVE_VAR_INDEX//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from cross_check_genes import cross_check_genes + +logger = setup_logger() + + +def copy_layer(adata, input_layer, output_layer, overwrite=False): + if output_layer not in adata.layers.keys() and input_layer: + logger.info(f"Copying layer from \`{input_layer}\` to \`{output_layer}\`") + adata.layers[output_layer] = adata.layers[input_layer].copy() + + elif output_layer not in adata.layers.keys() and not input_layer: + logger.info(f"Copying layer from .X to \`{output_layer}\`") + adata.layers[output_layer] = adata.X.copy() + + else: + if not overwrite: + raise ValueError( + f"Layer \`{output_layer}\` already exists. Data can not be copied." + ) + logger.warning(f"Layer \`{output_layer}\` already exists. Overwriting data.") + adata.layers[output_layer] = ( + adata.layers[input_layer].copy() if input_layer else adata.X.copy() + ) + + return adata + + +def copy_obs(adata, input_obs_key, output_obs_key, overwrite=False, fill_value=None): + if output_obs_key not in adata.obs.keys() and input_obs_key: + logger.info(f"Copying .obs field from \`{input_obs_key}\` to \`{output_obs_key}\`") + adata.obs[output_obs_key] = adata.obs[input_obs_key].copy() + + elif output_obs_key not in adata.obs.keys() and not input_obs_key: + logger.info(f"Assigning fill value \`{fill_value}\` to .obs {output_obs_key}") + adata.obs[output_obs_key] = fill_value + + else: + if not overwrite: + raise ValueError( + f".obs key \`{output_obs_key}\` already exists. Data can not be copied." + ) + + logger.warning(f".obs key \`{output_obs_key}\` already exists. Overwriting data.") + adata.obs[output_obs_key] = ( + adata.obs[input_obs_key].copy() if input_obs_key else fill_value + ) + + return adata + + +def copy_and_sanitize_var_gene_names( + adata, + input_var_key, + output_var_key, + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field="ori_var_index", +): + if output_var_key not in adata.var.keys() and input_var_key: + logger.info(f"Copying .var field from \`{input_var_key}\` to \`{output_var_key}\`") + adata.var[output_var_key] = adata.var[input_var_key].str.replace( + "\\\\.[0-9]+\$", "", regex=True + ) + + elif output_var_key not in adata.var.keys() and not input_var_key: + logger.info(f"Copying .var index to \`{output_var_key}\`") + adata.var[output_var_key] = adata.var.index.str.replace( + "\\\\.[0-9]+\$", "", regex=True + ) + + else: + if not overwrite: + raise ValueError( + f".var key \`{output_var_key}\` already exists. Data can not be copied." + ) + + logger.warning(f".var key \`{output_var_key}\` already exists. Overwriting data.") + adata.var[output_var_key] = ( + adata.var[input_var_key].str.replace("\\\\.[0-9]+\$", "", regex=True) + if input_var_key + else adata.var.index.str.replace("\\\\.[0-9]+\$", "", regex=True) + ) + + if not preserve_index: + logger.info("Replacing .var index with sanitized gene names...") + adata.var[var_index_field] = adata.var.index + adata.var.index = list(adata.var[output_var_key]) + + return adata + + +def main(): + logger.info("Reading query and reference data") + input_adata = mu.read_h5ad(par["input"], mod=par["modality"]) + input_modality = input_adata.copy() + + reference_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + reference_modality = reference_adata.copy() + + # Aligning layers + logger.info("### Aligning layers") + + if par["align_layers_lognormalized_counts"] and par["align_layers_raw_counts"]: + if par["input_layer"] == par["input_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the query data can not be identical." + ) + + if par["reference_layer"] == par["reference_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the reference data can not be identical." + ) + + if par["align_layers_raw_counts"]: + logger.info("## Copying query layer raw counts...") + input_modality = copy_layer( + input_modality, + par["input_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layer raw counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + + if par["align_layers_lognormalized_counts"]: + logger.info("## Copying query layer lognormalized counts...") + input_modality = copy_layer( + input_modality, + par["input_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layerlognormalized counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + + # Aligning batch labels + logger.info("### Aligning batch labels") + logger.info("## Copying query .obs batch field...") + input_modality = copy_obs( + input_modality, + par["input_obs_batch"], + par["output_obs_batch"], + par["overwrite_existing_key"], + fill_value=None, + ) + logger.info("## Copying reference .obs batch field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_batch"], + par["output_obs_batch"], + overwrite=par["overwrite_existing_key"], + fill_value=None, + ) + + # Aligning cell type labels + logger.info("### Aligning cell type labels") + logger.info("## Copying query .obs cell type label field...") + input_modality = copy_obs( + input_modality, + par["input_obs_label"], + par["output_obs_label"], + par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + logger.info("## Copying reference .obs cell type label field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_label"], + par["output_obs_label"], + overwrite=par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + + # Aligning and sanitizing gene names + logger.info("### Aligning and sanitizing gene names") + logger.info("## Copying query .var gene names field...") + input_modality = copy_and_sanitize_var_gene_names( + input_modality, + par["input_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + logger.info("## Copying reference .var gene names field...") + reference_modality = copy_and_sanitize_var_gene_names( + reference_modality, + par["reference_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + + # Cross check genes + logger.info("### Cross checking genes") + common_vars = cross_check_genes( + input_modality.var[par["output_var_gene_names"]], + reference_modality.var[par["output_var_gene_names"]], + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + # Add common vars to the output + input_modality.var[par["output_var_common_genes"]] = input_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + reference_modality.var[par["output_var_common_genes"]] = reference_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + + # Adding an id to the query and reference datasets + logger.info("### Adding an id to the query and reference datasets") + logger.info( + f"## Adding \`{par['input_id']}\` to the .obs \`{par['output_obs_id']}\` field of the query dataset..." + ) + input_modality.obs[par["output_obs_id"]] = par["input_id"] + logger.info( + f"## Adding \`{par['reference_id']}\` to the .obs \`{par['output_obs_id']}\` field of the reference dataset..." + ) + reference_modality.obs[par["output_obs_id"]] = par["reference_id"] + + # Writing output data + logger.info("Writing output data") + write_h5ad_to_h5mu_with_compression( + par["output_query"], + par["input"], + par["modality"], + input_modality, + par["output_compression"], + ) + + write_h5ad_to_h5mu_with_compression( + par["output_reference"], + par["reference"], + par["modality"], + reference_modality, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_QUERY" ]; then + VIASH_PAR_OUTPUT_QUERY=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_QUERY") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_REFERENCE" ]; then + VIASH_PAR_OUTPUT_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_REFERENCE") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_QUERY" ] && [ ! -e "$VIASH_PAR_OUTPUT_QUERY" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_QUERY' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_REFERENCE" ] && [ ! -e "$VIASH_PAR_OUTPUT_REFERENCE" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_REFERENCE' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/feature_annotation/align_query_reference/compress_h5mu.py b/target/executable/feature_annotation/align_query_reference/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/feature_annotation/align_query_reference/cross_check_genes.py b/target/executable/feature_annotation/align_query_reference/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/executable/feature_annotation/align_query_reference/nextflow_labels.config b/target/executable/feature_annotation/align_query_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/feature_annotation/align_query_reference/setup_logger.py b/target/executable/feature_annotation/align_query_reference/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/feature_annotation/align_query_reference/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml b/target/executable/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml new file mode 100644 index 00000000..c07dc1fc --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "highly_variable_features_scanpy" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use adata.layers[layer] for expression values instead of adata.X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "If specified, use boolean array in adata.var[var_input] to calculate\ + \ hvg on subset of vars.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ observations should be filtered out." + info: null + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_name" + description: "In which .varm slot to store additional metadata." + info: null + default: + - "hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--flavor" + description: "Choose the flavor for identifying highly variable features. For\ + \ the dispersion based methods\nin their default workflows, Seurat passes the\ + \ cutoffs whereas Cell Ranger passes n_top_features.\n" + info: null + default: + - "seurat" + required: false + choices: + - "seurat" + - "cell_ranger" + - "seurat_v3" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_top_features" + description: "Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_mean" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 0.0125 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_mean" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 3.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_disp" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_disp" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.\ + \ Default is +inf." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--span" + description: "The fraction of the data (cells) used when estimating the variance\ + \ in the loess model fit if flavor='seurat_v3'." + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_bins" + description: "Number of bins for binning the mean feature expression. Normalization\ + \ is done with respect to each bin. If just a single feature falls into a bin,\ + \ the normalized dispersion is artificially set to 1." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_key" + description: "If specified, highly-variable features are selected within each\ + \ batch separately and merged. This simple \nprocess avoids the selection of\ + \ batch-specific features and acts as a lightweight batch correction method.\ + \ \nFor all flavors, features are first sorted by how many batches they are\ + \ a HVG. For dispersion-based flavors \nties are broken by normalized dispersion.\ + \ If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank\ + \ based on within-batch normalized variance.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\ + \nExpects logarithmized data, except when flavor='seurat_v3' in which count data\ + \ is expected.\n\nDepending on flavor, this reproduces the R-implementations of\ + \ Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the\ + \ dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion\ + \ is obtained by scaling with the mean and standard deviation of the dispersions\ + \ for features falling into a given bin for mean expression of features. This means\ + \ that for each bin of mean expression, highly variable features are selected.\n\ + \nFor [Stuart19], a normalized variance for each feature is computed. First, the\ + \ data are standardized (i.e., z-score normalization per feature) with a regularized\ + \ standard deviation. Next, the normalized variance is computed as the variance\ + \ of each feature after the transformation. Features are ranked by the normalized\ + \ variance.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scikit-misc" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/feature_annotation/highly_variable_features_scanpy" + executable: "target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py b/target/executable/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy b/target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy new file mode 100755 index 00000000..f8b8d34c --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy @@ -0,0 +1,1643 @@ +#!/usr/bin/env bash + +# highly_variable_features_scanpy main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (contributor) +# * Robrecht Cannoodt (maintainer, contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="highly_variable_features_scanpy" +VIASH_META_FUNCTIONALITY_NAME="highly_variable_features_scanpy" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "scikit-misc" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component feature_annotation highly_variable_features_scanpy" +LABEL org.opencontainers.image.created="2025-08-22T07:23:08Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "highly_variable_features_scanpy main" + echo "" + echo "Annotate highly variable features [Satija15] [Zheng17] [Stuart19]." + echo "" + echo "Expects logarithmized data, except when flavor='seurat_v3' in which count data" + echo "is expected." + echo "" + echo "Depending on flavor, this reproduces the R-implementations of Seurat [Satija15]," + echo "Cell Ranger [Zheng17], and Seurat v3 [Stuart19]." + echo "" + echo "For the dispersion-based methods ([Satija15] and [Zheng17]), the normalized" + echo "dispersion is obtained by scaling with the mean and standard deviation of the" + echo "dispersions for features falling into a given bin for mean expression of" + echo "features. This means that for each bin of mean expression, highly variable" + echo "features are selected." + echo "" + echo "For [Stuart19], a normalized variance for each feature is computed. First, the" + echo "data are standardized (i.e., z-score normalization per feature) with a" + echo "regularized standard deviation. Next, the normalized variance is computed as the" + echo "variance of each feature after the transformation. Features are ranked by the" + echo "normalized variance." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " use adata.layers[layer] for expression values instead of adata.X." + echo "" + echo " --var_input" + echo " type: string" + echo " If specified, use boolean array in adata.var[var_input] to calculate hvg" + echo " on subset of vars." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --var_name_filter" + echo " type: string" + echo " default: filter_with_hvg" + echo " In which .var slot to store a boolean array corresponding to which" + echo " observations should be filtered out." + echo "" + echo " --varm_name" + echo " type: string" + echo " default: hvg" + echo " In which .varm slot to store additional metadata." + echo "" + echo " --flavor" + echo " type: string" + echo " default: seurat" + echo " choices: [ seurat, cell_ranger, seurat_v3 ]" + echo " Choose the flavor for identifying highly variable features. For the" + echo " dispersion based methods" + echo " in their default workflows, Seurat passes the cutoffs whereas Cell" + echo " Ranger passes n_top_features." + echo "" + echo " --n_top_features" + echo " type: integer" + echo " Number of highly-variable features to keep. Mandatory if" + echo " flavor='seurat_v3'." + echo "" + echo " --min_mean" + echo " type: double" + echo " default: 0.0125" + echo " If n_top_features is defined, this and all other cutoffs for the means" + echo " and the normalized dispersions are ignored. Ignored if" + echo " flavor='seurat_v3'." + echo "" + echo " --max_mean" + echo " type: double" + echo " default: 3.0" + echo " If n_top_features is defined, this and all other cutoffs for the means" + echo " and the normalized dispersions are ignored. Ignored if" + echo " flavor='seurat_v3'." + echo "" + echo " --min_disp" + echo " type: double" + echo " default: 0.5" + echo " If n_top_features is defined, this and all other cutoffs for the means" + echo " and the normalized dispersions are ignored. Ignored if" + echo " flavor='seurat_v3'." + echo "" + echo " --max_disp" + echo " type: double" + echo " If n_top_features is defined, this and all other cutoffs for the means" + echo " and the normalized dispersions are ignored. Ignored if" + echo " flavor='seurat_v3'. Default is +inf." + echo "" + echo " --span" + echo " type: double" + echo " default: 0.3" + echo " The fraction of the data (cells) used when estimating the variance in" + echo " the loess model fit if flavor='seurat_v3'." + echo "" + echo " --n_bins" + echo " type: integer" + echo " default: 20" + echo " Number of bins for binning the mean feature expression. Normalization is" + echo " done with respect to each bin. If just a single feature falls into a" + echo " bin, the normalized dispersion is artificially set to 1." + echo "" + echo " --obs_batch_key" + echo " type: string" + echo " If specified, highly-variable features are selected within each batch" + echo " separately and merged. This simple" + echo " process avoids the selection of batch-specific features and acts as a" + echo " lightweight batch correction method." + echo " For all flavors, features are first sorted by how many batches they are" + echo " a HVG. For dispersion-based flavors" + echo " ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties" + echo " are broken by the median (across" + echo " batches) rank based on within-batch normalized variance." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "highly_variable_features_scanpy main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_name_filter) + [ -n "$VIASH_PAR_VAR_NAME_FILTER" ] && ViashError Bad arguments for option \'--var_name_filter\': \'$VIASH_PAR_VAR_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_NAME_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_name_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_name_filter=*) + [ -n "$VIASH_PAR_VAR_NAME_FILTER" ] && ViashError Bad arguments for option \'--var_name_filter=*\': \'$VIASH_PAR_VAR_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_NAME_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varm_name) + [ -n "$VIASH_PAR_VARM_NAME" ] && ViashError Bad arguments for option \'--varm_name\': \'$VIASH_PAR_VARM_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varm_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varm_name=*) + [ -n "$VIASH_PAR_VARM_NAME" ] && ViashError Bad arguments for option \'--varm_name=*\': \'$VIASH_PAR_VARM_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARM_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --flavor) + [ -n "$VIASH_PAR_FLAVOR" ] && ViashError Bad arguments for option \'--flavor\': \'$VIASH_PAR_FLAVOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FLAVOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --flavor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --flavor=*) + [ -n "$VIASH_PAR_FLAVOR" ] && ViashError Bad arguments for option \'--flavor=*\': \'$VIASH_PAR_FLAVOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FLAVOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_top_features) + [ -n "$VIASH_PAR_N_TOP_FEATURES" ] && ViashError Bad arguments for option \'--n_top_features\': \'$VIASH_PAR_N_TOP_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TOP_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_top_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_top_features=*) + [ -n "$VIASH_PAR_N_TOP_FEATURES" ] && ViashError Bad arguments for option \'--n_top_features=*\': \'$VIASH_PAR_N_TOP_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TOP_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_mean) + [ -n "$VIASH_PAR_MIN_MEAN" ] && ViashError Bad arguments for option \'--min_mean\': \'$VIASH_PAR_MIN_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MEAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mean. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mean=*) + [ -n "$VIASH_PAR_MIN_MEAN" ] && ViashError Bad arguments for option \'--min_mean=*\': \'$VIASH_PAR_MIN_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MEAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_mean) + [ -n "$VIASH_PAR_MAX_MEAN" ] && ViashError Bad arguments for option \'--max_mean\': \'$VIASH_PAR_MAX_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_MEAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_mean. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_mean=*) + [ -n "$VIASH_PAR_MAX_MEAN" ] && ViashError Bad arguments for option \'--max_mean=*\': \'$VIASH_PAR_MAX_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_MEAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_disp) + [ -n "$VIASH_PAR_MIN_DISP" ] && ViashError Bad arguments for option \'--min_disp\': \'$VIASH_PAR_MIN_DISP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DISP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_disp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_disp=*) + [ -n "$VIASH_PAR_MIN_DISP" ] && ViashError Bad arguments for option \'--min_disp=*\': \'$VIASH_PAR_MIN_DISP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_DISP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_disp) + [ -n "$VIASH_PAR_MAX_DISP" ] && ViashError Bad arguments for option \'--max_disp\': \'$VIASH_PAR_MAX_DISP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DISP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_disp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_disp=*) + [ -n "$VIASH_PAR_MAX_DISP" ] && ViashError Bad arguments for option \'--max_disp=*\': \'$VIASH_PAR_MAX_DISP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DISP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --span) + [ -n "$VIASH_PAR_SPAN" ] && ViashError Bad arguments for option \'--span\': \'$VIASH_PAR_SPAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --span. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --span=*) + [ -n "$VIASH_PAR_SPAN" ] && ViashError Bad arguments for option \'--span=*\': \'$VIASH_PAR_SPAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_bins) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_bins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_bins=*) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins=*\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch_key) + [ -n "$VIASH_PAR_OBS_BATCH_KEY" ] && ViashError Bad arguments for option \'--obs_batch_key\': \'$VIASH_PAR_OBS_BATCH_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch_key=*) + [ -n "$VIASH_PAR_OBS_BATCH_KEY" ] && ViashError Bad arguments for option \'--obs_batch_key=*\': \'$VIASH_PAR_OBS_BATCH_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/feature_annotation/highly_variable_features_scanpy:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then + VIASH_PAR_VAR_NAME_FILTER="filter_with_hvg" +fi +if [ -z ${VIASH_PAR_VARM_NAME+x} ]; then + VIASH_PAR_VARM_NAME="hvg" +fi +if [ -z ${VIASH_PAR_FLAVOR+x} ]; then + VIASH_PAR_FLAVOR="seurat" +fi +if [ -z ${VIASH_PAR_MIN_MEAN+x} ]; then + VIASH_PAR_MIN_MEAN="0.0125" +fi +if [ -z ${VIASH_PAR_MAX_MEAN+x} ]; then + VIASH_PAR_MAX_MEAN="3.0" +fi +if [ -z ${VIASH_PAR_MIN_DISP+x} ]; then + VIASH_PAR_MIN_DISP="0.5" +fi +if [ -z ${VIASH_PAR_SPAN+x} ]; then + VIASH_PAR_SPAN="0.3" +fi +if [ -z ${VIASH_PAR_N_BINS+x} ]; then + VIASH_PAR_N_BINS="20" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_TOP_FEATURES" ]]; then + if ! [[ "$VIASH_PAR_N_TOP_FEATURES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_top_features' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MEAN" ]]; then + if ! [[ "$VIASH_PAR_MIN_MEAN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_mean' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_MEAN" ]]; then + if ! [[ "$VIASH_PAR_MAX_MEAN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--max_mean' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_DISP" ]]; then + if ! [[ "$VIASH_PAR_MIN_DISP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_disp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_DISP" ]]; then + if ! [[ "$VIASH_PAR_MAX_DISP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--max_disp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SPAN" ]]; then + if ! [[ "$VIASH_PAR_SPAN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--span' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_BINS" ]]; then + if ! [[ "$VIASH_PAR_N_BINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_bins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_FLAVOR" ]; then + VIASH_PAR_FLAVOR_CHOICES=("seurat;cell_ranger;seurat_v3") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_FLAVOR_CHOICES[*]};" =~ ";$VIASH_PAR_FLAVOR;" ]]; then + ViashError '--flavor' specified value of \'$VIASH_PAR_FLAVOR\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-highly_variable_features_scanpy-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys +import re + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_name_filter': $( if [ ! -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_NAME_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varm_name': $( if [ ! -z ${VIASH_PAR_VARM_NAME+x} ]; then echo "r'${VIASH_PAR_VARM_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'flavor': $( if [ ! -z ${VIASH_PAR_FLAVOR+x} ]; then echo "r'${VIASH_PAR_FLAVOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_top_features': $( if [ ! -z ${VIASH_PAR_N_TOP_FEATURES+x} ]; then echo "int(r'${VIASH_PAR_N_TOP_FEATURES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_mean': $( if [ ! -z ${VIASH_PAR_MIN_MEAN+x} ]; then echo "float(r'${VIASH_PAR_MIN_MEAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_mean': $( if [ ! -z ${VIASH_PAR_MAX_MEAN+x} ]; then echo "float(r'${VIASH_PAR_MAX_MEAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_disp': $( if [ ! -z ${VIASH_PAR_MIN_DISP+x} ]; then echo "float(r'${VIASH_PAR_MIN_DISP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_disp': $( if [ ! -z ${VIASH_PAR_MAX_DISP+x} ]; then echo "float(r'${VIASH_PAR_MAX_DISP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'span': $( if [ ! -z ${VIASH_PAR_SPAN+x} ]; then echo "float(r'${VIASH_PAR_SPAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_bins': $( if [ ! -z ${VIASH_PAR_N_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_BINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'obs_batch_key': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_KEY+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() +modality_name = par["modality"] +data = mu.read_h5ad(par["input"], mod=modality_name) +assert data.var_names.is_unique, "Expected var_names of input modality to be unique." + +logger.info("Processing modality '%s'", modality_name) + +if par["layer"] and par["layer"] not in data.layers: + raise ValueError( + f"Layer '{par['layer']}' not found in layers for modality '{modality_name}'. " + f"Found layers are: {','.join(data.layers)}" + ) + +# input layer argument does not work when batch_key is specified because +# it still uses .X to filter out genes with 0 counts, even if .X might not exist. +# So create a custom anndata as input that always uses .X +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] +obs = pd.DataFrame(index=data.obs_names.copy()) +var = pd.DataFrame(index=data.var_names.copy()) +if par["obs_batch_key"]: + obs = data.obs.loc[:, par["obs_batch_key"]].to_frame() +input_anndata = ad.AnnData(X=input_layer.copy(), obs=obs, var=var) +if "log1p" in data.uns: + input_anndata.uns["log1p"] = data.uns["log1p"] + +# Workaround for issue +# https://github.com/scverse/scanpy/issues/2239 +# https://github.com/scverse/scanpy/issues/2181 +if par["flavor"] != "seurat_v3": + # This component requires log normalized data when flavor is not seurat_v3 + # We assume that the data is correctly normalized but scanpy will look at + # .uns to check the transformations performed on the data. + # To prevent scanpy from automatically tranforming the counts when they are + # already transformed, we set the appropriate values to .uns. + if "log1p" not in input_anndata.uns: + logger.warning( + "When flavor is not set to 'seurat_v3', " + "the input data for this component must be log-transformed. " + "However, the 'log1p' dictionairy in .uns has not been set. " + "This is fine if you did not log transform your data with scanpy." + "Otherwise, please check if you are providing log transformed " + "data using --layer." + ) + input_anndata.uns["log1p"] = {"base": None} + elif "log1p" in input_anndata.uns and "base" not in input_anndata.uns["log1p"]: + input_anndata.uns["log1p"]["base"] = None + +# Enable calculating the HVG only on a subset of vars +# e.g for cell type annotation, only calculate HVG on variables that are common between query and reference +if par["var_input"]: + input_anndata.var[par["var_input"]] = data.var[par["var_input"]] + input_anndata = subset_vars(input_anndata, par["var_input"]) + +logger.info("\\tUnfiltered data: %s", data) + +logger.info("\\tComputing hvg") +# construct arguments +hvg_args = { + "adata": input_anndata, + "n_top_genes": par["n_top_features"], + "min_mean": par["min_mean"], + "max_mean": par["max_mean"], + "min_disp": par["min_disp"], + "span": par["span"], + "n_bins": par["n_bins"], + "flavor": par["flavor"], + "subset": False, + "inplace": False, + "layer": None, # Always uses .X because the input layer was already handled +} + +optional_parameters = { + "max_disp": "max_disp", + "obs_batch_key": "batch_key", + "n_top_genes": "n_top_features", +} +# only add parameter if it's passed +for par_name, dest_name in optional_parameters.items(): + if par.get(par_name): + hvg_args[dest_name] = par[par_name] + +# scanpy does not do this check, although it is stated in the documentation +if par["flavor"] == "seurat_v3" and not par["n_top_features"]: + raise ValueError( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'." + ) + +# call function +try: + out = sc.pp.highly_variable_genes(**hvg_args) + if par["var_input"] is not None: + out.index = input_anndata.var.index + out = out.reindex(index=data.var.index, method=None) + out.highly_variable = out.highly_variable.fillna(False) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + elif par["obs_batch_key"] is not None: + out = out.reindex(index=data.var.index, method=None) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + +except ValueError as err: + if str(err) == "cannot specify integer \`bins\` when input data contains infinity": + err.args = ( + "Cannot specify integer \`bins\` when input data contains infinity. " + "Perhaps input data has not been log normalized?", + ) + if re.search("Bin edges must be unique:", str(err)): + raise RuntimeError( + "Scanpy failed to calculate hvg. The error " + "returned by scanpy (see above) could be the " + "result from trying to use this component on unfiltered data." + ) from err + raise err + +out.index = data.var.index +logger.info("\\tStoring output into .var") +if par.get("var_name_filter", None) is not None: + data.var[par["var_name_filter"]] = out["highly_variable"] + +if par.get("varm_name", None) is not None and "mean_bin" in out: + # drop mean_bin as mudata/anndata doesn't support tuples + data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) + +logger.info("Writing h5mu to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], modality_name, data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config b/target/executable/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/setup_logger.py b/target/executable/feature_annotation/highly_variable_features_scanpy/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/feature_annotation/highly_variable_features_scanpy/subset_vars.py b/target/executable/feature_annotation/highly_variable_features_scanpy/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/feature_annotation/highly_variable_features_scanpy/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml new file mode 100644 index 00000000..83280d8a --- /dev/null +++ b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml @@ -0,0 +1,446 @@ +name: "score_genes_cell_cycle_scanpy" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object containing normalized expression values.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the column in the var attribute of the adata object\ + \ that contains the gene names (symbols).\nIf not provided, the index of the\ + \ var attribute will be used.\n" + info: null + example: + - "gene_names" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Gene list inputs" + description: "The gene list inputs can be provided as a list of gene symbols or\ + \ as a file containing a list of gene symbols. The gene list file should be formatted\ + \ as a single column with gene symbols.\n\nMake sure that the gene list inputs\ + \ are consistent with the gene names in the adata object as provided by the --var_gene_names\ + \ argument.\n" + arguments: + - type: "string" + name: "--s_genes" + description: "List of gene symbols for scoring s phase genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--s_genes_file" + description: "Path to a .txt file containing the gene list of s phase genes to\ + \ be scored. \nThe gene list file should be formatted as a single column with\ + \ gene symbols.\n" + info: null + example: + - "s_gene_list.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--g2m_genes" + description: "List of gene symbols for scoring g2m phase genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--g2m_genes_file" + description: "Path to a .txt file containing the gene list of g2m phase genes\ + \ to be scored. \nThe gene list file should be formatted as a single column\ + \ with gene symbols.\n" + info: null + example: + - "g2m_gene_list.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_pool" + description: "List of gene symbols for sampling the reference set. Default is\ + \ all genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_pool_file" + description: "File with genes for sampling the reference set. Default is all genes.\ + \ \nThe gene pool file should be formatted as a single column with gene symbols.\n" + info: null + example: + - "gene_pool.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file\n" + info: null + example: + - "output_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_phase" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the cell cycle phase annotation.\n" + info: null + default: + - "phase" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_s_score" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the s phase score.\n" + info: null + default: + - "S_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_g2m_score" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the g2m phase score.\n" + info: null + default: + - "G2M_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--n_bins" + description: "Number of expression level bins for sampling.\n" + info: null + default: + - 25 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed for sampling.\n" + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--allow_missing_genes" + description: "If true, missing genes in the gene list will be ignored.\n" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Calculates the score associated to S phase and G2M phase and annotates\ + \ the cell cycle phase for each cell, as implemented by scanpy. \nThe score is the\ + \ average expression of a set of genes subtracted with the average expression of\ + \ a reference set of genes.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/feature_annotation/score_genes_cell_cycle_scanpy" + executable: "target/executable/feature_annotation/score_genes_cell_cycle_scanpy/score_genes_cell_cycle_scanpy" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/helper.py b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/helper.py new file mode 100644 index 00000000..fe559658 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/helper.py @@ -0,0 +1,42 @@ +from typing import List, Dict, Any, Optional + + +def read_gene_list( + par: Dict[str, Any], + gene_names: List[str], + list_key: str, + file_key: str, + required: bool = True, +) -> Optional[List[str]]: + """ + Reads a gene list from the parameters and returns it as a list of strings. + """ + + # check whether one or the other was provided, if required + if required and not par[list_key] and not par[file_key]: + raise ValueError(f"Either --{list_key} or --{file_key} must be set") + + # read gene list from parameters + list_of_genes = par[list_key] if par[list_key] else [] + + # read gene list from file + if par[file_key]: + with open(par[file_key]) as file: + file_genes = [x.strip() for x in file] + list_of_genes.extend(file_genes) + + # check for missing genes + if not par["allow_missing_genes"] and list_of_genes: + missing = set(list_of_genes).difference(gene_names) + if missing: + raise ValueError( + f"The follow genes are missing from the input dataset: {missing}" + ) + + # return gene list + if list_of_genes: + return list_of_genes + elif required: + raise ValueError(f"No genes detected in --{list_key} or --{file_key}") + else: + return None diff --git a/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/score_genes_cell_cycle_scanpy b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/score_genes_cell_cycle_scanpy new file mode 100755 index 00000000..24354f55 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_cell_cycle_scanpy/score_genes_cell_cycle_scanpy @@ -0,0 +1,1598 @@ +#!/usr/bin/env bash + +# score_genes_cell_cycle_scanpy main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author) +# * Dorien Roosen (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="score_genes_cell_cycle_scanpy" +VIASH_META_FUNCTIONALITY_NAME="score_genes_cell_cycle_scanpy" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scanpy~=1.10.4" "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Dorien Roosen, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component feature_annotation score_genes_cell_cycle_scanpy" +LABEL org.opencontainers.image.created="2025-08-22T07:23:09Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "score_genes_cell_cycle_scanpy main" + echo "" + echo "Calculates the score associated to S phase and G2M phase and annotates the cell" + echo "cycle phase for each cell, as implemented by scanpy." + echo "The score is the average expression of a set of genes subtracted with the" + echo "average expression of a reference set of genes." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input_file.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " example: log_normalized" + echo " The layer of the adata object containing normalized expression values." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " example: gene_names" + echo " The name of the column in the var attribute of the adata object that" + echo " contains the gene names (symbols)." + echo " If not provided, the index of the var attribute will be used." + echo "" + echo "Gene list inputs:" + echo " The gene list inputs can be provided as a list of gene symbols or as a file" + echo " containing a list of gene symbols. The gene list file should be formatted as" + echo " a single column with gene symbols." + echo " Make sure that the gene list inputs are consistent with the gene names in" + echo " the adata object as provided by the --var_gene_names argument." + echo "" + echo " --s_genes" + echo " type: string, multiple values allowed" + echo " example: gene1;gene2;gene3" + echo " List of gene symbols for scoring s phase genes." + echo "" + echo " --s_genes_file" + echo " type: file, file must exist" + echo " example: s_gene_list.txt" + echo " Path to a .txt file containing the gene list of s phase genes to be" + echo " scored." + echo " The gene list file should be formatted as a single column with gene" + echo " symbols." + echo "" + echo " --g2m_genes" + echo " type: string, multiple values allowed" + echo " example: gene1;gene2;gene3" + echo " List of gene symbols for scoring g2m phase genes." + echo "" + echo " --g2m_genes_file" + echo " type: file, file must exist" + echo " example: g2m_gene_list.txt" + echo " Path to a .txt file containing the gene list of g2m phase genes to be" + echo " scored." + echo " The gene list file should be formatted as a single column with gene" + echo " symbols." + echo "" + echo " --gene_pool" + echo " type: string, multiple values allowed" + echo " example: gene1;gene2;gene3" + echo " List of gene symbols for sampling the reference set. Default is all" + echo " genes." + echo "" + echo " --gene_pool_file" + echo " type: file, file must exist" + echo " example: gene_pool.txt" + echo " File with genes for sampling the reference set. Default is all genes." + echo " The gene pool file should be formatted as a single column with gene" + echo " symbols." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output_file.h5mu" + echo " Output h5mu file" + echo "" + echo " --obs_phase" + echo " type: string" + echo " default: phase" + echo " The name of the column in the obs attribute of the adata object that" + echo " will store the cell cycle phase annotation." + echo "" + echo " --obs_s_score" + echo " type: string" + echo " default: S_score" + echo " The name of the column in the obs attribute of the adata object that" + echo " will store the s phase score." + echo "" + echo " --obs_g2m_score" + echo " type: string" + echo " default: G2M_score" + echo " The name of the column in the obs attribute of the adata object that" + echo " will store the g2m phase score." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --n_bins" + echo " type: integer" + echo " default: 25" + echo " min: 0" + echo " Number of expression level bins for sampling." + echo "" + echo " --random_state" + echo " type: integer" + echo " default: 0" + echo " The random seed for sampling." + echo "" + echo " --allow_missing_genes" + echo " type: boolean" + echo " default: false" + echo " If true, missing genes in the gene list will be ignored." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "score_genes_cell_cycle_scanpy main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --s_genes) + if [ -z "$VIASH_PAR_S_GENES" ]; then + VIASH_PAR_S_GENES="$2" + else + VIASH_PAR_S_GENES="$VIASH_PAR_S_GENES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --s_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --s_genes=*) + if [ -z "$VIASH_PAR_S_GENES" ]; then + VIASH_PAR_S_GENES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_S_GENES="$VIASH_PAR_S_GENES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --s_genes_file) + [ -n "$VIASH_PAR_S_GENES_FILE" ] && ViashError Bad arguments for option \'--s_genes_file\': \'$VIASH_PAR_S_GENES_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_S_GENES_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --s_genes_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --s_genes_file=*) + [ -n "$VIASH_PAR_S_GENES_FILE" ] && ViashError Bad arguments for option \'--s_genes_file=*\': \'$VIASH_PAR_S_GENES_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_S_GENES_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --g2m_genes) + if [ -z "$VIASH_PAR_G2M_GENES" ]; then + VIASH_PAR_G2M_GENES="$2" + else + VIASH_PAR_G2M_GENES="$VIASH_PAR_G2M_GENES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --g2m_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --g2m_genes=*) + if [ -z "$VIASH_PAR_G2M_GENES" ]; then + VIASH_PAR_G2M_GENES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_G2M_GENES="$VIASH_PAR_G2M_GENES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --g2m_genes_file) + [ -n "$VIASH_PAR_G2M_GENES_FILE" ] && ViashError Bad arguments for option \'--g2m_genes_file\': \'$VIASH_PAR_G2M_GENES_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_G2M_GENES_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --g2m_genes_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --g2m_genes_file=*) + [ -n "$VIASH_PAR_G2M_GENES_FILE" ] && ViashError Bad arguments for option \'--g2m_genes_file=*\': \'$VIASH_PAR_G2M_GENES_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_G2M_GENES_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_pool) + if [ -z "$VIASH_PAR_GENE_POOL" ]; then + VIASH_PAR_GENE_POOL="$2" + else + VIASH_PAR_GENE_POOL="$VIASH_PAR_GENE_POOL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_pool. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_pool=*) + if [ -z "$VIASH_PAR_GENE_POOL" ]; then + VIASH_PAR_GENE_POOL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENE_POOL="$VIASH_PAR_GENE_POOL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gene_pool_file) + [ -n "$VIASH_PAR_GENE_POOL_FILE" ] && ViashError Bad arguments for option \'--gene_pool_file\': \'$VIASH_PAR_GENE_POOL_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_POOL_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_pool_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_pool_file=*) + [ -n "$VIASH_PAR_GENE_POOL_FILE" ] && ViashError Bad arguments for option \'--gene_pool_file=*\': \'$VIASH_PAR_GENE_POOL_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_POOL_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_phase) + [ -n "$VIASH_PAR_OBS_PHASE" ] && ViashError Bad arguments for option \'--obs_phase\': \'$VIASH_PAR_OBS_PHASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_PHASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_phase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_phase=*) + [ -n "$VIASH_PAR_OBS_PHASE" ] && ViashError Bad arguments for option \'--obs_phase=*\': \'$VIASH_PAR_OBS_PHASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_PHASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_s_score) + [ -n "$VIASH_PAR_OBS_S_SCORE" ] && ViashError Bad arguments for option \'--obs_s_score\': \'$VIASH_PAR_OBS_S_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_S_SCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_s_score. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_s_score=*) + [ -n "$VIASH_PAR_OBS_S_SCORE" ] && ViashError Bad arguments for option \'--obs_s_score=*\': \'$VIASH_PAR_OBS_S_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_S_SCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_g2m_score) + [ -n "$VIASH_PAR_OBS_G2M_SCORE" ] && ViashError Bad arguments for option \'--obs_g2m_score\': \'$VIASH_PAR_OBS_G2M_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_G2M_SCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_g2m_score. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_g2m_score=*) + [ -n "$VIASH_PAR_OBS_G2M_SCORE" ] && ViashError Bad arguments for option \'--obs_g2m_score=*\': \'$VIASH_PAR_OBS_G2M_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_G2M_SCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_bins) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_bins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_bins=*) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins=*\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --random_state) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --random_state. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --random_state=*) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state=*\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --allow_missing_genes) + [ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ] && ViashError Bad arguments for option \'--allow_missing_genes\': \'$VIASH_PAR_ALLOW_MISSING_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLOW_MISSING_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --allow_missing_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --allow_missing_genes=*) + [ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ] && ViashError Bad arguments for option \'--allow_missing_genes=*\': \'$VIASH_PAR_ALLOW_MISSING_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLOW_MISSING_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/feature_annotation/score_genes_cell_cycle_scanpy:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBS_PHASE+x} ]; then + VIASH_PAR_OBS_PHASE="phase" +fi +if [ -z ${VIASH_PAR_OBS_S_SCORE+x} ]; then + VIASH_PAR_OBS_S_SCORE="S_score" +fi +if [ -z ${VIASH_PAR_OBS_G2M_SCORE+x} ]; then + VIASH_PAR_OBS_G2M_SCORE="G2M_score" +fi +if [ -z ${VIASH_PAR_N_BINS+x} ]; then + VIASH_PAR_N_BINS="25" +fi +if [ -z ${VIASH_PAR_RANDOM_STATE+x} ]; then + VIASH_PAR_RANDOM_STATE="0" +fi +if [ -z ${VIASH_PAR_ALLOW_MISSING_GENES+x} ]; then + VIASH_PAR_ALLOW_MISSING_GENES="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_S_GENES_FILE" ] && [ ! -e "$VIASH_PAR_S_GENES_FILE" ]; then + ViashError "Input file '$VIASH_PAR_S_GENES_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_G2M_GENES_FILE" ] && [ ! -e "$VIASH_PAR_G2M_GENES_FILE" ]; then + ViashError "Input file '$VIASH_PAR_G2M_GENES_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ] && [ ! -e "$VIASH_PAR_GENE_POOL_FILE" ]; then + ViashError "Input file '$VIASH_PAR_GENE_POOL_FILE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_BINS" ]]; then + if ! [[ "$VIASH_PAR_N_BINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_bins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_N_BINS -lt 0 ]]; then + ViashError '--n_bins' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RANDOM_STATE" ]]; then + if ! [[ "$VIASH_PAR_RANDOM_STATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--random_state' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ]]; then + if ! [[ "$VIASH_PAR_ALLOW_MISSING_GENES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--allow_missing_genes' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_S_GENES_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_S_GENES_FILE")" ) + VIASH_PAR_S_GENES_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_S_GENES_FILE") +fi +if [ ! -z "$VIASH_PAR_G2M_GENES_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_G2M_GENES_FILE")" ) + VIASH_PAR_G2M_GENES_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_G2M_GENES_FILE") +fi +if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENE_POOL_FILE")" ) + VIASH_PAR_GENE_POOL_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_GENE_POOL_FILE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-score_genes_cell_cycle_scanpy-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 's_genes': $( if [ ! -z ${VIASH_PAR_S_GENES+x} ]; then echo "r'${VIASH_PAR_S_GENES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 's_genes_file': $( if [ ! -z ${VIASH_PAR_S_GENES_FILE+x} ]; then echo "r'${VIASH_PAR_S_GENES_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'g2m_genes': $( if [ ! -z ${VIASH_PAR_G2M_GENES+x} ]; then echo "r'${VIASH_PAR_G2M_GENES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'g2m_genes_file': $( if [ ! -z ${VIASH_PAR_G2M_GENES_FILE+x} ]; then echo "r'${VIASH_PAR_G2M_GENES_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gene_pool': $( if [ ! -z ${VIASH_PAR_GENE_POOL+x} ]; then echo "r'${VIASH_PAR_GENE_POOL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gene_pool_file': $( if [ ! -z ${VIASH_PAR_GENE_POOL_FILE+x} ]; then echo "r'${VIASH_PAR_GENE_POOL_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_phase': $( if [ ! -z ${VIASH_PAR_OBS_PHASE+x} ]; then echo "r'${VIASH_PAR_OBS_PHASE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_s_score': $( if [ ! -z ${VIASH_PAR_OBS_S_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_S_SCORE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_g2m_score': $( if [ ! -z ${VIASH_PAR_OBS_G2M_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_G2M_SCORE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_bins': $( if [ ! -z ${VIASH_PAR_N_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_BINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'allow_missing_genes': $( if [ ! -z ${VIASH_PAR_ALLOW_MISSING_GENES+x} ]; then echo "r'${VIASH_PAR_ALLOW_MISSING_GENES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# import helper functions +sys.path.append(meta["resources_dir"]) +from helper import read_gene_list + +# read data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] + +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) +gene_names = pd.Series(input_adata.var_names, index=gene_names_index) + +# check if var index is unique +# input.var[par["var_gene_names"]] is mapped to var index, but may not contain unique values +if not input_adata.var.index.is_unique: + raise ValueError("var index is not unique") + +# read gene lists +s_genes = read_gene_list(par, gene_names.index, "s_genes", "s_genes_file") +g2m_genes = read_gene_list(par, gene_names.index, "g2m_genes", "g2m_genes_file") +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) + +# find matching index names for given genes +g2m_index = gene_names.loc[g2m_genes].tolist() +s_index = gene_names.loc[s_genes].tolist() +gene_pool_index = gene_names.loc[gene_pool].tolist() if gene_pool else None + +# create input data for scanpy +if par["input_layer"]: + X_data = input_adata.layers[par["input_layer"]].copy() +else: + X_data = input_adata.X.copy() +adata_scanpy = ad.AnnData( + X=X_data, + obs=pd.DataFrame(index=input_adata.obs.index), + var=pd.DataFrame(index=input_adata.var.index), +) + +# run score_genes_cell_cycle +sc.tl.score_genes_cell_cycle( + adata_scanpy, + s_genes=s_index, + g2m_genes=g2m_index, + gene_pool=gene_pool_index, + n_bins=par["n_bins"], + random_state=par["random_state"], +) + +# copy results to mudata +output_slot_mapping = { + par["obs_s_score"]: "S_score", + par["obs_g2m_score"]: "G2M_score", + par["obs_phase"]: "phase", +} +assert all(adata_scanpy.obs.index == input_adata.obs.index), ( + "index mismatch between input adata and scanpy output adata" +) +for dest, orig in output_slot_mapping.items(): + input_adata.obs[dest] = adata_scanpy.obs[orig] + +# write output to mudata +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_S_GENES_FILE" ]; then + VIASH_PAR_S_GENES_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_S_GENES_FILE") + fi + if [ ! -z "$VIASH_PAR_G2M_GENES_FILE" ]; then + VIASH_PAR_G2M_GENES_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_G2M_GENES_FILE") + fi + if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ]; then + VIASH_PAR_GENE_POOL_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_GENE_POOL_FILE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/feature_annotation/score_genes_scanpy/.config.vsh.yaml b/target/executable/feature_annotation/score_genes_scanpy/.config.vsh.yaml new file mode 100644 index 00000000..1208964f --- /dev/null +++ b/target/executable/feature_annotation/score_genes_scanpy/.config.vsh.yaml @@ -0,0 +1,335 @@ +name: "score_genes_scanpy" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_list" + description: "List of gene symbols to be scored.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_list_file" + description: "Path to a .txt file containing the gene list to be scored.\nThe\ + \ gene list file should be formatted as a single column with gene symbols.\n" + info: null + example: + - "gene_list.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_pool" + description: "List of gene symbols for sampling the reference set. Default is\ + \ all genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_pool_file" + description: "File with genes for sampling the reference set. Default is all genes.\n\ + The gene pool file should be formatted as a single column with gene symbols.\n" + info: null + example: + - "gene_pool.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object containing normalized expression values.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: ".var column name to be used to detect mitochondrial genes instead\ + \ of .var_names (default if not set).\n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--allow_missing_genes" + description: "Whether to run score_genes when some genes in the gene_list or gene_list_file\ + \ are not present in the gene_pool\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file\n" + info: null + example: + - "output_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_score" + description: "Name of the score field to be added in .obs.\n" + info: null + default: + - "score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--ctrl_size" + description: "Number of reference genes to be sampled from each bin. \nIf len(gene_list)\ + \ is not too low, you can set ctrl_size=len(gene_list).\n" + info: null + default: + - 50 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_bins" + description: "Number of expression level bins for sampling.\n" + info: null + default: + - 25 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed for sampling.\n" + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Calculates the score of a set of genes for each cell, as implemented\ + \ by scanpy. \nThe score is the average expression of a set of genes subtracted\ + \ with the average expression of a reference set of genes.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/score_genes_scanpy/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/feature_annotation/score_genes_scanpy" + executable: "target/executable/feature_annotation/score_genes_scanpy/score_genes_scanpy" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/feature_annotation/score_genes_scanpy/helper.py b/target/executable/feature_annotation/score_genes_scanpy/helper.py new file mode 100644 index 00000000..fe559658 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_scanpy/helper.py @@ -0,0 +1,42 @@ +from typing import List, Dict, Any, Optional + + +def read_gene_list( + par: Dict[str, Any], + gene_names: List[str], + list_key: str, + file_key: str, + required: bool = True, +) -> Optional[List[str]]: + """ + Reads a gene list from the parameters and returns it as a list of strings. + """ + + # check whether one or the other was provided, if required + if required and not par[list_key] and not par[file_key]: + raise ValueError(f"Either --{list_key} or --{file_key} must be set") + + # read gene list from parameters + list_of_genes = par[list_key] if par[list_key] else [] + + # read gene list from file + if par[file_key]: + with open(par[file_key]) as file: + file_genes = [x.strip() for x in file] + list_of_genes.extend(file_genes) + + # check for missing genes + if not par["allow_missing_genes"] and list_of_genes: + missing = set(list_of_genes).difference(gene_names) + if missing: + raise ValueError( + f"The follow genes are missing from the input dataset: {missing}" + ) + + # return gene list + if list_of_genes: + return list_of_genes + elif required: + raise ValueError(f"No genes detected in --{list_key} or --{file_key}") + else: + return None diff --git a/target/executable/feature_annotation/score_genes_scanpy/nextflow_labels.config b/target/executable/feature_annotation/score_genes_scanpy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_scanpy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/feature_annotation/score_genes_scanpy/score_genes_scanpy b/target/executable/feature_annotation/score_genes_scanpy/score_genes_scanpy new file mode 100755 index 00000000..cf83cbd5 --- /dev/null +++ b/target/executable/feature_annotation/score_genes_scanpy/score_genes_scanpy @@ -0,0 +1,1511 @@ +#!/usr/bin/env bash + +# score_genes_scanpy main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author) +# * Dorien Roosen (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="score_genes_scanpy" +VIASH_META_FUNCTIONALITY_NAME="score_genes_scanpy" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scanpy~=1.10.4" "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Dorien Roosen, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component feature_annotation score_genes_scanpy" +LABEL org.opencontainers.image.created="2025-08-22T07:23:08Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "score_genes_scanpy main" + echo "" + echo "Calculates the score of a set of genes for each cell, as implemented by scanpy." + echo "The score is the average expression of a set of genes subtracted with the" + echo "average expression of a reference set of genes." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input_file.h5mu" + echo " Input h5mu file" + echo "" + echo " --gene_list" + echo " type: string, multiple values allowed" + echo " example: gene1;gene2;gene3" + echo " List of gene symbols to be scored." + echo "" + echo " --gene_list_file" + echo " type: file, file must exist" + echo " example: gene_list.txt" + echo " Path to a .txt file containing the gene list to be scored." + echo " The gene list file should be formatted as a single column with gene" + echo " symbols." + echo "" + echo " --gene_pool" + echo " type: string, multiple values allowed" + echo " example: gene1;gene2;gene3" + echo " List of gene symbols for sampling the reference set. Default is all" + echo " genes." + echo "" + echo " --gene_pool_file" + echo " type: file, file must exist" + echo " example: gene_pool.txt" + echo " File with genes for sampling the reference set. Default is all genes." + echo " The gene pool file should be formatted as a single column with gene" + echo " symbols." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " example: log_normalized" + echo " The layer of the adata object containing normalized expression values." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " example: gene_symbol" + echo " .var column name to be used to detect mitochondrial genes instead of" + echo " .var_names (default if not set)." + echo "" + echo " --allow_missing_genes" + echo " type: boolean" + echo " Whether to run score_genes when some genes in the gene_list or" + echo " gene_list_file are not present in the gene_pool" + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output_file.h5mu" + echo " Output h5mu file" + echo "" + echo " --obs_score" + echo " type: string" + echo " default: score" + echo " Name of the score field to be added in .obs." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --ctrl_size" + echo " type: integer" + echo " default: 50" + echo " min: 0" + echo " Number of reference genes to be sampled from each bin." + echo " If len(gene_list) is not too low, you can set ctrl_size=len(gene_list)." + echo "" + echo " --n_bins" + echo " type: integer" + echo " default: 25" + echo " min: 0" + echo " Number of expression level bins for sampling." + echo "" + echo " --random_state" + echo " type: integer" + echo " default: 0" + echo " The random seed for sampling." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "score_genes_scanpy main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_list) + if [ -z "$VIASH_PAR_GENE_LIST" ]; then + VIASH_PAR_GENE_LIST="$2" + else + VIASH_PAR_GENE_LIST="$VIASH_PAR_GENE_LIST;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_list=*) + if [ -z "$VIASH_PAR_GENE_LIST" ]; then + VIASH_PAR_GENE_LIST=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENE_LIST="$VIASH_PAR_GENE_LIST;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gene_list_file) + [ -n "$VIASH_PAR_GENE_LIST_FILE" ] && ViashError Bad arguments for option \'--gene_list_file\': \'$VIASH_PAR_GENE_LIST_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_LIST_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_list_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_list_file=*) + [ -n "$VIASH_PAR_GENE_LIST_FILE" ] && ViashError Bad arguments for option \'--gene_list_file=*\': \'$VIASH_PAR_GENE_LIST_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_LIST_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_pool) + if [ -z "$VIASH_PAR_GENE_POOL" ]; then + VIASH_PAR_GENE_POOL="$2" + else + VIASH_PAR_GENE_POOL="$VIASH_PAR_GENE_POOL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_pool. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_pool=*) + if [ -z "$VIASH_PAR_GENE_POOL" ]; then + VIASH_PAR_GENE_POOL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENE_POOL="$VIASH_PAR_GENE_POOL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gene_pool_file) + [ -n "$VIASH_PAR_GENE_POOL_FILE" ] && ViashError Bad arguments for option \'--gene_pool_file\': \'$VIASH_PAR_GENE_POOL_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_POOL_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_pool_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_pool_file=*) + [ -n "$VIASH_PAR_GENE_POOL_FILE" ] && ViashError Bad arguments for option \'--gene_pool_file=*\': \'$VIASH_PAR_GENE_POOL_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_POOL_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --allow_missing_genes) + [ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ] && ViashError Bad arguments for option \'--allow_missing_genes\': \'$VIASH_PAR_ALLOW_MISSING_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLOW_MISSING_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --allow_missing_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --allow_missing_genes=*) + [ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ] && ViashError Bad arguments for option \'--allow_missing_genes=*\': \'$VIASH_PAR_ALLOW_MISSING_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLOW_MISSING_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_score) + [ -n "$VIASH_PAR_OBS_SCORE" ] && ViashError Bad arguments for option \'--obs_score\': \'$VIASH_PAR_OBS_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_score. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_score=*) + [ -n "$VIASH_PAR_OBS_SCORE" ] && ViashError Bad arguments for option \'--obs_score=*\': \'$VIASH_PAR_OBS_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ctrl_size) + [ -n "$VIASH_PAR_CTRL_SIZE" ] && ViashError Bad arguments for option \'--ctrl_size\': \'$VIASH_PAR_CTRL_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CTRL_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ctrl_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ctrl_size=*) + [ -n "$VIASH_PAR_CTRL_SIZE" ] && ViashError Bad arguments for option \'--ctrl_size=*\': \'$VIASH_PAR_CTRL_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CTRL_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_bins) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_bins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_bins=*) + [ -n "$VIASH_PAR_N_BINS" ] && ViashError Bad arguments for option \'--n_bins=*\': \'$VIASH_PAR_N_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --random_state) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --random_state. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --random_state=*) + [ -n "$VIASH_PAR_RANDOM_STATE" ] && ViashError Bad arguments for option \'--random_state=*\': \'$VIASH_PAR_RANDOM_STATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOM_STATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/feature_annotation/score_genes_scanpy:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBS_SCORE+x} ]; then + VIASH_PAR_OBS_SCORE="score" +fi +if [ -z ${VIASH_PAR_CTRL_SIZE+x} ]; then + VIASH_PAR_CTRL_SIZE="50" +fi +if [ -z ${VIASH_PAR_N_BINS+x} ]; then + VIASH_PAR_N_BINS="25" +fi +if [ -z ${VIASH_PAR_RANDOM_STATE+x} ]; then + VIASH_PAR_RANDOM_STATE="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENE_LIST_FILE" ] && [ ! -e "$VIASH_PAR_GENE_LIST_FILE" ]; then + ViashError "Input file '$VIASH_PAR_GENE_LIST_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ] && [ ! -e "$VIASH_PAR_GENE_POOL_FILE" ]; then + ViashError "Input file '$VIASH_PAR_GENE_POOL_FILE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_ALLOW_MISSING_GENES" ]]; then + if ! [[ "$VIASH_PAR_ALLOW_MISSING_GENES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--allow_missing_genes' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CTRL_SIZE" ]]; then + if ! [[ "$VIASH_PAR_CTRL_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--ctrl_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_CTRL_SIZE -lt 0 ]]; then + ViashError '--ctrl_size' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_BINS" ]]; then + if ! [[ "$VIASH_PAR_N_BINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_bins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_N_BINS -lt 0 ]]; then + ViashError '--n_bins' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RANDOM_STATE" ]]; then + if ! [[ "$VIASH_PAR_RANDOM_STATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--random_state' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_GENE_LIST_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENE_LIST_FILE")" ) + VIASH_PAR_GENE_LIST_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_GENE_LIST_FILE") +fi +if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENE_POOL_FILE")" ) + VIASH_PAR_GENE_POOL_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_GENE_POOL_FILE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-score_genes_scanpy-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import anndata as ad +import pandas as pd +import mudata as mu +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gene_list': $( if [ ! -z ${VIASH_PAR_GENE_LIST+x} ]; then echo "r'${VIASH_PAR_GENE_LIST//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gene_list_file': $( if [ ! -z ${VIASH_PAR_GENE_LIST_FILE+x} ]; then echo "r'${VIASH_PAR_GENE_LIST_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gene_pool': $( if [ ! -z ${VIASH_PAR_GENE_POOL+x} ]; then echo "r'${VIASH_PAR_GENE_POOL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gene_pool_file': $( if [ ! -z ${VIASH_PAR_GENE_POOL_FILE+x} ]; then echo "r'${VIASH_PAR_GENE_POOL_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'allow_missing_genes': $( if [ ! -z ${VIASH_PAR_ALLOW_MISSING_GENES+x} ]; then echo "r'${VIASH_PAR_ALLOW_MISSING_GENES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_score': $( if [ ! -z ${VIASH_PAR_OBS_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_SCORE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'ctrl_size': $( if [ ! -z ${VIASH_PAR_CTRL_SIZE+x} ]; then echo "int(r'${VIASH_PAR_CTRL_SIZE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_bins': $( if [ ! -z ${VIASH_PAR_N_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_BINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import read_gene_list + +# read data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] + +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) +gene_names = pd.Series(input_adata.var_names, index=gene_names_index) + +# check if var index is unique +# input.var[par["var_gene_names"]] is mapped to var index, but may not contain unique values +if not input_adata.var.index.is_unique: + raise ValueError("var index is not unique") + +# read gene list +gene_list = read_gene_list(par, gene_names.index, "gene_list", "gene_list_file") +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) + +# find matching index names for given genes +gene_list_index = gene_names.loc[gene_list].tolist() +gene_pool_index = gene_names.loc[gene_pool].tolist() if gene_pool else None + +# create input data for scanpy +if par["input_layer"]: + layer_data = input_adata.layers[par["input_layer"]].copy() +else: + layer_data = input_adata.X.copy() +adata_scanpy = ad.AnnData( + X=layer_data, + obs=pd.DataFrame(index=input_adata.obs.index), + var=pd.DataFrame(index=input_adata.var.index), +) + +# run score_genes +sc.tl.score_genes( + adata_scanpy, + gene_list=gene_list_index, + gene_pool=gene_pool_index, + ctrl_size=par["ctrl_size"], + n_bins=par["n_bins"], + random_state=par["random_state"], +) + +# copy results to mudata +assert all(adata_scanpy.obs.index == input_adata.obs.index), ( + "index mismatch between input adata and scanpy output adata" +) +input_adata.obs[par["obs_score"]] = adata_scanpy.obs["score"] + +# write output to mudata +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_GENE_LIST_FILE" ]; then + VIASH_PAR_GENE_LIST_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_GENE_LIST_FILE") + fi + if [ ! -z "$VIASH_PAR_GENE_POOL_FILE" ]; then + VIASH_PAR_GENE_POOL_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_GENE_POOL_FILE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/files/make_params/.config.vsh.yaml b/target/executable/files/make_params/.config.vsh.yaml new file mode 100644 index 00000000..000ad668 --- /dev/null +++ b/target/executable/files/make_params/.config.vsh.yaml @@ -0,0 +1,285 @@ +name: "make_params" +namespace: "files" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--base_dir" + description: "Base directory to search recursively" + info: null + example: + - "/path/to/dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pattern" + description: "An optional regular expression. Only file names which match the\ + \ regular expression will be matched." + info: null + example: + - "*.fastq.gz" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_dirname_drop" + description: "For every matched file, the parent directory will be traversed N\ + \ times." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_basename_id" + description: "The unique identifiers will consist of at least N dirnames." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_name" + description: "The name for storing the identifier field in the yaml." + info: null + default: + - "id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--path_name" + description: "The name for storing the path field in the yaml." + info: null + default: + - "path" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_name" + description: "Top level name for the group of entries." + info: null + example: + - "param_list" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output YAML file." + info: null + example: + - "params.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Looks for files in a directory and turn it in a params file." +test_resources: +- type: "bash_script" + path: "test_make_params.sh" + is_executable: true +- type: "file" + path: "src" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/randpy:r4.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/files/make_params/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/files/make_params" + executable: "target/executable/files/make_params/make_params" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/files/make_params/make_params b/target/executable/files/make_params/make_params new file mode 100755 index 00000000..a5fa5481 --- /dev/null +++ b/target/executable/files/make_params/make_params @@ -0,0 +1,1310 @@ +#!/usr/bin/env bash + +# make_params main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (maintainer, author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="make_params" +VIASH_META_FUNCTIONALITY_NAME="make_params" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/randpy:r4.0 +ENTRYPOINT [] +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component files make_params" +LABEL org.opencontainers.image.created="2025-08-22T07:23:03Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "make_params main" + echo "" + echo "Looks for files in a directory and turn it in a params file." + echo "" + echo "Arguments:" + echo " --base_dir" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/dir" + echo " Base directory to search recursively" + echo "" + echo " --pattern" + echo " type: string, required parameter" + echo " example: *.fastq.gz" + echo " An optional regular expression. Only file names which match the regular" + echo " expression will be matched." + echo "" + echo " --n_dirname_drop" + echo " type: integer" + echo " default: 0" + echo " For every matched file, the parent directory will be traversed N times." + echo "" + echo " --n_basename_id" + echo " type: integer" + echo " default: 0" + echo " The unique identifiers will consist of at least N dirnames." + echo "" + echo " --id_name" + echo " type: string" + echo " default: id" + echo " The name for storing the identifier field in the yaml." + echo "" + echo " --path_name" + echo " type: string" + echo " default: path" + echo " The name for storing the path field in the yaml." + echo "" + echo " --group_name" + echo " type: string" + echo " example: param_list" + echo " Top level name for the group of entries." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: params.yaml" + echo " Output YAML file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "make_params main" + exit + ;; + --base_dir) + [ -n "$VIASH_PAR_BASE_DIR" ] && ViashError Bad arguments for option \'--base_dir\': \'$VIASH_PAR_BASE_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --base_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --base_dir=*) + [ -n "$VIASH_PAR_BASE_DIR" ] && ViashError Bad arguments for option \'--base_dir=*\': \'$VIASH_PAR_BASE_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pattern) + [ -n "$VIASH_PAR_PATTERN" ] && ViashError Bad arguments for option \'--pattern\': \'$VIASH_PAR_PATTERN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PATTERN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pattern. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pattern=*) + [ -n "$VIASH_PAR_PATTERN" ] && ViashError Bad arguments for option \'--pattern=*\': \'$VIASH_PAR_PATTERN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PATTERN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_dirname_drop) + [ -n "$VIASH_PAR_N_DIRNAME_DROP" ] && ViashError Bad arguments for option \'--n_dirname_drop\': \'$VIASH_PAR_N_DIRNAME_DROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DIRNAME_DROP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_dirname_drop. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_dirname_drop=*) + [ -n "$VIASH_PAR_N_DIRNAME_DROP" ] && ViashError Bad arguments for option \'--n_dirname_drop=*\': \'$VIASH_PAR_N_DIRNAME_DROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DIRNAME_DROP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_basename_id) + [ -n "$VIASH_PAR_N_BASENAME_ID" ] && ViashError Bad arguments for option \'--n_basename_id\': \'$VIASH_PAR_N_BASENAME_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BASENAME_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_basename_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_basename_id=*) + [ -n "$VIASH_PAR_N_BASENAME_ID" ] && ViashError Bad arguments for option \'--n_basename_id=*\': \'$VIASH_PAR_N_BASENAME_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_BASENAME_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --id_name) + [ -n "$VIASH_PAR_ID_NAME" ] && ViashError Bad arguments for option \'--id_name\': \'$VIASH_PAR_ID_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --id_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id_name=*) + [ -n "$VIASH_PAR_ID_NAME" ] && ViashError Bad arguments for option \'--id_name=*\': \'$VIASH_PAR_ID_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --path_name) + [ -n "$VIASH_PAR_PATH_NAME" ] && ViashError Bad arguments for option \'--path_name\': \'$VIASH_PAR_PATH_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PATH_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --path_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --path_name=*) + [ -n "$VIASH_PAR_PATH_NAME" ] && ViashError Bad arguments for option \'--path_name=*\': \'$VIASH_PAR_PATH_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PATH_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --group_name) + [ -n "$VIASH_PAR_GROUP_NAME" ] && ViashError Bad arguments for option \'--group_name\': \'$VIASH_PAR_GROUP_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --group_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --group_name=*) + [ -n "$VIASH_PAR_GROUP_NAME" ] && ViashError Bad arguments for option \'--group_name=*\': \'$VIASH_PAR_GROUP_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/files/make_params:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_BASE_DIR+x} ]; then + ViashError '--base_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_PATTERN+x} ]; then + ViashError '--pattern' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_N_DIRNAME_DROP+x} ]; then + VIASH_PAR_N_DIRNAME_DROP="0" +fi +if [ -z ${VIASH_PAR_N_BASENAME_ID+x} ]; then + VIASH_PAR_N_BASENAME_ID="0" +fi +if [ -z ${VIASH_PAR_ID_NAME+x} ]; then + VIASH_PAR_ID_NAME="id" +fi +if [ -z ${VIASH_PAR_PATH_NAME+x} ]; then + VIASH_PAR_PATH_NAME="path" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_BASE_DIR" ] && [ ! -e "$VIASH_PAR_BASE_DIR" ]; then + ViashError "Input file '$VIASH_PAR_BASE_DIR' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_DIRNAME_DROP" ]]; then + if ! [[ "$VIASH_PAR_N_DIRNAME_DROP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_dirname_drop' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_BASENAME_ID" ]]; then + if ! [[ "$VIASH_PAR_N_BASENAME_ID" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_basename_id' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_BASE_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BASE_DIR")" ) + VIASH_PAR_BASE_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_BASE_DIR") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-make_params-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(dplyr) +library(purrr) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "base_dir" = $( if [ ! -z ${VIASH_PAR_BASE_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_BASE_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "pattern" = $( if [ ! -z ${VIASH_PAR_PATTERN+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PATTERN" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "n_dirname_drop" = $( if [ ! -z ${VIASH_PAR_N_DIRNAME_DROP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_N_DIRNAME_DROP" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "n_basename_id" = $( if [ ! -z ${VIASH_PAR_N_BASENAME_ID+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_N_BASENAME_ID" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "id_name" = $( if [ ! -z ${VIASH_PAR_ID_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ID_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "path_name" = $( if [ ! -z ${VIASH_PAR_PATH_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PATH_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "group_name" = $( if [ ! -z ${VIASH_PAR_GROUP_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +cat("> Listing files of base dir ", par\$base_dir, "\\n", sep = "") +paths <- list.files( + normalizePath(par\$base_dir), + pattern = par\$pattern, + recursive = TRUE, + full.names = TRUE +) + +cat("> Traversing up ", par\$n_dirname_apply, " times\\n", sep = "") +for (i in seq_len(par\$n_dirname_drop)) { + paths <- dirname(paths) %>% unique() +} + +# removing /viash_automount in case we're inside a docker container +paths <- gsub("^/viash_automount", "", paths) + +cat("> Checking whether basenames are unique\\n") +i <- par\$n_basename_id +maxi <- strsplit(paths, "/") %>% + map_int(length) %>% + max() + +regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)\$") +ids <- gsub("/", "_", gsub(regex, "\\\\1", paths)) + +cat("> Printing first five rows\\n") +print(tibble(id = ids, path = paths) %>% head(5)) +cat("\\n") + +while (i < maxi && any(duplicated(ids))) { + i <- i + 1 + cat( + "Duplicated ids detected, combining with ", i, + " dirnames in an attempt to get unique ids.\\n" + ) + regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)\$") + ids <- gsub("/", "_", gsub(regex, "\\\\1", paths)) + + cat("> Printing first five rows\\n") + print(tibble(id = ids, path = paths) %>% head(5)) + cat("\\n") +} + +cat("> Transforming into list of items\\n") +par_list <- map2( + ids, paths, + function(id, input) { + setNames(list(id, input), c(par\$id_name, par\$path_name)) + } +) + +if (!is.null(par\$group_name)) { + par_list <- setNames(list(par_list), par\$group_name) +} + +cat("> Writing as YAML\\n") +yaml::write_yaml(par_list, par\$output) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_BASE_DIR" ]; then + VIASH_PAR_BASE_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_BASE_DIR") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/files/make_params/nextflow_labels.config b/target/executable/files/make_params/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/files/make_params/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/delimit_fraction/.config.vsh.yaml b/target/executable/filter/delimit_fraction/.config.vsh.yaml new file mode 100644 index 00000000..530aeff2 --- /dev/null +++ b/target/executable/filter/delimit_fraction/.config.vsh.yaml @@ -0,0 +1,307 @@ +name: "delimit_fraction" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_fraction_column" + description: "Name of column from .var dataframe selecting\na column that contains\ + \ floating point values between 0 and 1.\n" + info: null + example: + - "fraction_mitochondrial" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be removed." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "double" + name: "--min_fraction" + description: "Min fraction for an observation to be retained (True in output)." + info: null + default: + - 0.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_fraction" + description: "Max fraction for an observation to be retained (True in output)." + info: null + default: + - 1.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Turns a column containing values between 0 and 1 into a boolean column\ + \ based on thresholds.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/delimit_fraction/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/delimit_fraction" + executable: "target/executable/filter/delimit_fraction/delimit_fraction" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/delimit_fraction/compress_h5mu.py b/target/executable/filter/delimit_fraction/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/delimit_fraction/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/delimit_fraction/delimit_fraction b/target/executable/filter/delimit_fraction/delimit_fraction new file mode 100755 index 00000000..c346372c --- /dev/null +++ b/target/executable/filter/delimit_fraction/delimit_fraction @@ -0,0 +1,1402 @@ +#!/usr/bin/env bash + +# delimit_fraction main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="delimit_fraction" +VIASH_META_FUNCTIONALITY_NAME="delimit_fraction" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component filter delimit_fraction" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "delimit_fraction main" + echo "" + echo "Turns a column containing values between 0 and 1 into a boolean column based on" + echo "thresholds." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obs_fraction_column" + echo " type: string, required parameter" + echo " example: fraction_mitochondrial" + echo " Name of column from .var dataframe selecting" + echo " a column that contains floating point values between 0 and 1." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obs_name_filter" + echo " type: string, required parameter" + echo " In which .obs slot to store a boolean array corresponding to which" + echo " observations should be removed." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --min_fraction" + echo " type: double" + echo " default: 0.0" + echo " min: 0.0" + echo " max: 1.0" + echo " Min fraction for an observation to be retained (True in output)." + echo "" + echo " --max_fraction" + echo " type: double" + echo " default: 1.0" + echo " min: 0.0" + echo " max: 1.0" + echo " Max fraction for an observation to be retained (True in output)." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "delimit_fraction main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_fraction_column) + [ -n "$VIASH_PAR_OBS_FRACTION_COLUMN" ] && ViashError Bad arguments for option \'--obs_fraction_column\': \'$VIASH_PAR_OBS_FRACTION_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_FRACTION_COLUMN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_fraction_column. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_fraction_column=*) + [ -n "$VIASH_PAR_OBS_FRACTION_COLUMN" ] && ViashError Bad arguments for option \'--obs_fraction_column=*\': \'$VIASH_PAR_OBS_FRACTION_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_FRACTION_COLUMN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_name_filter) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_name_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_name_filter=*) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter=*\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_fraction) + [ -n "$VIASH_PAR_MIN_FRACTION" ] && ViashError Bad arguments for option \'--min_fraction\': \'$VIASH_PAR_MIN_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_fraction=*) + [ -n "$VIASH_PAR_MIN_FRACTION" ] && ViashError Bad arguments for option \'--min_fraction=*\': \'$VIASH_PAR_MIN_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_fraction) + [ -n "$VIASH_PAR_MAX_FRACTION" ] && ViashError Bad arguments for option \'--max_fraction\': \'$VIASH_PAR_MAX_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_fraction=*) + [ -n "$VIASH_PAR_MAX_FRACTION" ] && ViashError Bad arguments for option \'--max_fraction=*\': \'$VIASH_PAR_MAX_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/delimit_fraction:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_FRACTION_COLUMN+x} ]; then + ViashError '--obs_fraction_column' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then + ViashError '--obs_name_filter' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_MIN_FRACTION+x} ]; then + VIASH_PAR_MIN_FRACTION="0.0" +fi +if [ -z ${VIASH_PAR_MAX_FRACTION+x} ]; then + VIASH_PAR_MAX_FRACTION="1.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_MIN_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MIN_FRACTION '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--min_fraction' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MIN_FRACTION -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--min_fraction' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--min_fraction' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MIN_FRACTION '<=' 1.0 | bc` -eq 1 ]]; then + ViashError '--min_fraction' has to be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MIN_FRACTION -v n2=1.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--min_fraction' has be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--min_fraction' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_MAX_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_MAX_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--max_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MAX_FRACTION '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--max_fraction' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MAX_FRACTION -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--max_fraction' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--max_fraction' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_MAX_FRACTION '<=' 1.0 | bc` -eq 1 ]]; then + ViashError '--max_fraction' has to be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_MAX_FRACTION -v n2=1.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--max_fraction' has be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--max_fraction' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-delimit_fraction-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import numpy as np +import sys +from operator import le, ge +from pandas.api.types import is_float_dtype + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_fraction_column': $( if [ ! -z ${VIASH_PAR_OBS_FRACTION_COLUMN+x} ]; then echo "r'${VIASH_PAR_OBS_FRACTION_COLUMN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_fraction': $( if [ ! -z ${VIASH_PAR_MIN_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_MIN_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_fraction': $( if [ ! -z ${VIASH_PAR_MAX_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_MAX_FRACTION//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +assert data.var_names.is_unique, ( + "The var_names of the input modality must be be unique." +) + +logger.info("\\tUnfiltered data: %s", data) +logger.info("\\tComputing aggregations.") + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +try: + fraction = data.obs[par["obs_fraction_column"]] +except KeyError: + raise ValueError(f"Could not find column '{par['obs_fraction_column']}'") +if not is_float_dtype(fraction): + raise ValueError( + f"Column '{par['obs_fraction_column']}' does not contain float datatype." + ) +if fraction.max() > 1: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values > 1.") +if fraction.min() < 0: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values < 0.") + + +# Filter cells +filters = ( + ( + "min_fraction", + fraction, + ge, + "\\tRemoving %s cells with <%s percentage mitochondrial reads.", + ), + ( + "max_fraction", + fraction, + le, + "\\tRemoving %s cells with >%s percentage mitochondrial reads.", + ), +) + +keep_cells = np.repeat(True, data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +data.obs[par["obs_name_filter"]] = keep_cells + +logger.info("\\tFiltered data: %s", data) +logger.info("Writing output data to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/delimit_fraction/nextflow_labels.config b/target/executable/filter/delimit_fraction/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/delimit_fraction/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/delimit_fraction/setup_logger.py b/target/executable/filter/delimit_fraction/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/delimit_fraction/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/do_filter/.config.vsh.yaml b/target/executable/filter/do_filter/.config.vsh.yaml new file mode 100644 index 00000000..525606d8 --- /dev/null +++ b/target/executable/filter/do_filter/.config.vsh.yaml @@ -0,0 +1,275 @@ +name: "do_filter" +namespace: "filter" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_filter" + description: "Which .obs columns to use to filter the observations by." + info: null + example: + - "filter_with_x" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--var_filter" + description: "Which .var columns to use to filter the observations by." + info: null + example: + - "filter_with_x" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Remove observations and variables based on specified .obs and .var columns.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/do_filter/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/do_filter" + executable: "target/executable/filter/do_filter/do_filter" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/do_filter/compress_h5mu.py b/target/executable/filter/do_filter/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/do_filter/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/do_filter/do_filter b/target/executable/filter/do_filter/do_filter new file mode 100755 index 00000000..bd5bab45 --- /dev/null +++ b/target/executable/filter/do_filter/do_filter @@ -0,0 +1,1259 @@ +#!/usr/bin/env bash + +# do_filter main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer, contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="do_filter" +VIASH_META_FUNCTIONALITY_NAME="do_filter" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component filter do_filter" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "do_filter main" + echo "" + echo "Remove observations and variables based on specified .obs and .var columns." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obs_filter" + echo " type: string, multiple values allowed" + echo " example: filter_with_x" + echo " Which .obs columns to use to filter the observations by." + echo "" + echo " --var_filter" + echo " type: string, multiple values allowed" + echo " example: filter_with_x" + echo " Which .var columns to use to filter the observations by." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "do_filter main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_filter) + if [ -z "$VIASH_PAR_OBS_FILTER" ]; then + VIASH_PAR_OBS_FILTER="$2" + else + VIASH_PAR_OBS_FILTER="$VIASH_PAR_OBS_FILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_filter=*) + if [ -z "$VIASH_PAR_OBS_FILTER" ]; then + VIASH_PAR_OBS_FILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_FILTER="$VIASH_PAR_OBS_FILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --var_filter) + if [ -z "$VIASH_PAR_VAR_FILTER" ]; then + VIASH_PAR_VAR_FILTER="$2" + else + VIASH_PAR_VAR_FILTER="$VIASH_PAR_VAR_FILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_filter=*) + if [ -z "$VIASH_PAR_VAR_FILTER" ]; then + VIASH_PAR_VAR_FILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VAR_FILTER="$VIASH_PAR_VAR_FILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/do_filter:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-do_filter-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import numpy as np +import sys + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_filter': $( if [ ! -z ${VIASH_PAR_OBS_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_FILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'var_filter': $( if [ ! -z ${VIASH_PAR_VAR_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_FILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality: %s", par["input"], par["modality"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +obs_filt = np.repeat(True, adata.n_obs) +var_filt = np.repeat(True, adata.n_vars) + +par["obs_filter"] = par["obs_filter"] if par["obs_filter"] else [] +par["var_filter"] = par["var_filter"] if par["var_filter"] else [] + +for obs_name in par["obs_filter"]: + logger.info( + "Filtering modality '%s' observations by .obs['%s']", par["modality"], obs_name + ) + if obs_name not in adata.obs: + raise ValueError(f".mod[{par['modality']}].obs[{obs_name}] does not exist.") + if obs_name in adata.obs: + obs_filt &= adata.obs[obs_name] + +for var_name in par["var_filter"]: + logger.info( + "Filtering modality '%s' variables by .var['%s']", par["modality"], var_name + ) + if var_name not in adata.var: + raise ValueError(f".mod[{par['modality']}].var[{var_name}] does not exist.") + if var_name in adata.var: + var_filt &= adata.var[var_name] + +adata_filtered = adata[obs_filt, var_filt] + +logger.info("Writing h5mu to file %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + adata_filtered, + par["output_compression"], +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/do_filter/nextflow_labels.config b/target/executable/filter/do_filter/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/do_filter/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/do_filter/setup_logger.py b/target/executable/filter/do_filter/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/do_filter/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/filter_with_counts/.config.vsh.yaml b/target/executable/filter/filter_with_counts/.config.vsh.yaml new file mode 100644 index 00000000..4f798b98 --- /dev/null +++ b/target/executable/filter/filter_with_counts/.config.vsh.yaml @@ -0,0 +1,362 @@ +name: "filter_with_counts" +namespace: "filter" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Location of the count matrix. If specified, will be used to select\ + \ a key from .layers,\notherwise .X is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--do_subset" + description: "Whether to subset before storing the output." + info: null + direction: "input" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be removed." + info: null + default: + - "filter_with_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ variables should be removed." + info: null + default: + - "filter_with_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_genes_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_genes_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_gene" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter scRNA-seq data based on the primary QC metrics. \nThis is based\ + \ on both the UMI counts, the gene counts \nand the mitochondrial genes (genes starting\ + \ with mt/MT).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/filter_with_counts/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/filter_with_counts" + executable: "target/executable/filter/filter_with_counts/filter_with_counts" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/filter_with_counts/compress_h5mu.py b/target/executable/filter/filter_with_counts/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/filter_with_counts/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/filter_with_counts/filter_with_counts b/target/executable/filter/filter_with_counts/filter_with_counts new file mode 100755 index 00000000..41cba357 --- /dev/null +++ b/target/executable/filter/filter_with_counts/filter_with_counts @@ -0,0 +1,1472 @@ +#!/usr/bin/env bash + +# filter_with_counts main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (author) +# * Robrecht Cannoodt (maintainer, author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="filter_with_counts" +VIASH_META_FUNCTIONALITY_NAME="filter_with_counts" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component filter filter_with_counts" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "filter_with_counts main" + echo "" + echo "Filter scRNA-seq data based on the primary QC metrics." + echo "This is based on both the UMI counts, the gene counts" + echo "and the mitochondrial genes (genes starting with mt/MT)." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " example: raw_counts" + echo " Location of the count matrix. If specified, will be used to select a key" + echo " from .layers," + echo " otherwise .X is used." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --do_subset" + echo " type: boolean_true" + echo " Whether to subset before storing the output." + echo "" + echo " --obs_name_filter" + echo " type: string" + echo " default: filter_with_counts" + echo " In which .obs slot to store a boolean array corresponding to which" + echo " observations should be removed." + echo "" + echo " --var_name_filter" + echo " type: string" + echo " default: filter_with_counts" + echo " In which .var slot to store a boolean array corresponding to which" + echo " variables should be removed." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --min_counts" + echo " type: integer" + echo " example: 200" + echo " Minimum number of counts captured per cell." + echo "" + echo " --max_counts" + echo " type: integer" + echo " example: 5000000" + echo " Maximum number of counts captured per cell." + echo "" + echo " --min_genes_per_cell" + echo " type: integer" + echo " example: 200" + echo " Minimum of non-zero values per cell." + echo "" + echo " --max_genes_per_cell" + echo " type: integer" + echo " example: 1500000" + echo " Maximum of non-zero values per cell." + echo "" + echo " --min_cells_per_gene" + echo " type: integer" + echo " example: 3" + echo " Minimum of non-zero values per gene." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "filter_with_counts main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --do_subset) + [ -n "$VIASH_PAR_DO_SUBSET" ] && ViashError Bad arguments for option \'--do_subset\': \'$VIASH_PAR_DO_SUBSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DO_SUBSET=true + shift 1 + ;; + --obs_name_filter) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_name_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_name_filter=*) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter=*\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_name_filter) + [ -n "$VIASH_PAR_VAR_NAME_FILTER" ] && ViashError Bad arguments for option \'--var_name_filter\': \'$VIASH_PAR_VAR_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_NAME_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_name_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_name_filter=*) + [ -n "$VIASH_PAR_VAR_NAME_FILTER" ] && ViashError Bad arguments for option \'--var_name_filter=*\': \'$VIASH_PAR_VAR_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_NAME_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_counts) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_counts=*) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts=*\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_counts) + [ -n "$VIASH_PAR_MAX_COUNTS" ] && ViashError Bad arguments for option \'--max_counts\': \'$VIASH_PAR_MAX_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_counts=*) + [ -n "$VIASH_PAR_MAX_COUNTS" ] && ViashError Bad arguments for option \'--max_counts=*\': \'$VIASH_PAR_MAX_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_genes_per_cell) + [ -n "$VIASH_PAR_MIN_GENES_PER_CELL" ] && ViashError Bad arguments for option \'--min_genes_per_cell\': \'$VIASH_PAR_MIN_GENES_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENES_PER_CELL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_genes_per_cell. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_genes_per_cell=*) + [ -n "$VIASH_PAR_MIN_GENES_PER_CELL" ] && ViashError Bad arguments for option \'--min_genes_per_cell=*\': \'$VIASH_PAR_MIN_GENES_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENES_PER_CELL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_genes_per_cell) + [ -n "$VIASH_PAR_MAX_GENES_PER_CELL" ] && ViashError Bad arguments for option \'--max_genes_per_cell\': \'$VIASH_PAR_MAX_GENES_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_GENES_PER_CELL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_genes_per_cell. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_genes_per_cell=*) + [ -n "$VIASH_PAR_MAX_GENES_PER_CELL" ] && ViashError Bad arguments for option \'--max_genes_per_cell=*\': \'$VIASH_PAR_MAX_GENES_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_GENES_PER_CELL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells_per_gene) + [ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ] && ViashError Bad arguments for option \'--min_cells_per_gene\': \'$VIASH_PAR_MIN_CELLS_PER_GENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_PER_GENE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells_per_gene. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells_per_gene=*) + [ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ] && ViashError Bad arguments for option \'--min_cells_per_gene=*\': \'$VIASH_PAR_MIN_CELLS_PER_GENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_PER_GENE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/filter_with_counts:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_DO_SUBSET+x} ]; then + VIASH_PAR_DO_SUBSET="false" +fi +if [ -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then + VIASH_PAR_OBS_NAME_FILTER="filter_with_counts" +fi +if [ -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then + VIASH_PAR_VAR_NAME_FILTER="filter_with_counts" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_DO_SUBSET" ]]; then + if ! [[ "$VIASH_PAR_DO_SUBSET" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--do_subset' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MAX_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_GENES_PER_CELL" ]]; then + if ! [[ "$VIASH_PAR_MIN_GENES_PER_CELL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_genes_per_cell' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_GENES_PER_CELL" ]]; then + if ! [[ "$VIASH_PAR_MAX_GENES_PER_CELL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_genes_per_cell' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS_PER_GENE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells_per_gene' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-filter_with_counts-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import numpy as np +import sys +from operator import le, ge, gt + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'do_subset': $( if [ ! -z ${VIASH_PAR_DO_SUBSET+x} ]; then echo "r'${VIASH_PAR_DO_SUBSET//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_name_filter': $( if [ ! -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_NAME_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_counts': $( if [ ! -z ${VIASH_PAR_MAX_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MAX_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_genes_per_cell': $( if [ ! -z ${VIASH_PAR_MIN_GENES_PER_CELL+x} ]; then echo "int(r'${VIASH_PAR_MIN_GENES_PER_CELL//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_genes_per_cell': $( if [ ! -z ${VIASH_PAR_MAX_GENES_PER_CELL+x} ]; then echo "int(r'${VIASH_PAR_MAX_GENES_PER_CELL//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_cells_per_gene': $( if [ ! -z ${VIASH_PAR_MIN_CELLS_PER_GENE+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS_PER_GENE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input data from %s, modality %s", par["input"], par["modality"]) +modality_data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("\\tUnfiltered data: %s", modality_data) + +logger.info("Selecting input layer %s", "X" if par["layer"] else par["layer"]) +input_layer = ( + modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] +) + +logger.info("\\tComputing aggregations.") +n_counts_per_cell = np.ravel(np.sum(input_layer, axis=1)) +n_cells_per_gene = np.sum(input_layer > 0, axis=0) +n_genes_per_cell = np.sum(input_layer > 0, axis=1) + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +# Filter genes +keep_genes = np.repeat(True, modality_data.n_vars) +if par["min_cells_per_gene"] is not None: + num_removed, keep_genes = apply_filter_to_mask( + keep_genes, n_cells_per_gene, par["min_cells_per_gene"], ge + ) + logger.info( + "\\tRemoving %s genes with non-zero values in <%s cells.", + num_removed, + par["min_cells_per_gene"], + ) + +# Filter cells +filters = ( + ( + "min_genes_per_cell", + n_genes_per_cell, + ge, + "\\tRemoving %s cells with non-zero values in <%s genes.", + ), + ( + "max_genes_per_cell", + n_genes_per_cell, + le, + "\\tRemoving %s cells with non-zero values in >%s genes.", + ), + ("min_counts", n_counts_per_cell, ge, "\\tRemoving %s cells with <%s total counts."), + ("max_counts", n_counts_per_cell, le, "\\tRemoving %s cells with >%s total counts."), + ( + 0, + np.sum(input_layer[:, keep_genes], axis=1), + gt, + "\\tRemoving %s cells with %s counts", + ), +) + +keep_cells = np.repeat(True, modality_data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +if par["obs_name_filter"] is not None: + modality_data.obs[par["obs_name_filter"]] = keep_cells +if par["var_name_filter"] is not None: + modality_data.var[par["var_name_filter"]] = keep_genes + +if par["do_subset"]: + modality_data = modality_data[keep_cells, keep_genes] + +logger.info("\\tFiltered data: %s", modality_data) +logger.info( + "Writing output data to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], +) + + +logger.info("Finished") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/filter_with_counts/nextflow_labels.config b/target/executable/filter/filter_with_counts/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/filter_with_counts/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/filter_with_counts/setup_logger.py b/target/executable/filter/filter_with_counts/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/filter_with_counts/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/filter_with_scrublet/.config.vsh.yaml b/target/executable/filter/filter_with_scrublet/.config.vsh.yaml new file mode 100644 index 00000000..d84b016d --- /dev/null +++ b/target/executable/filter/filter_with_scrublet/.config.vsh.yaml @@ -0,0 +1,425 @@ +name: "filter_with_scrublet" +namespace: "filter" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to use as data for calculating doublets. .X is used\ + \ not specified." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be filtered out." + info: null + default: + - "filter_with_scrublet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--do_subset" + description: "Whether to subset before storing the output." + info: null + direction: "input" + - type: "string" + name: "--obs_name_doublet_score" + description: "Name of the doublet scores column in the obs slot of the returned\ + \ object." + info: null + default: + - "scrublet_doublet_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--expected_doublet_rate" + description: "The estimated fraction of doublets as from the experimental setup.\n" + info: null + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--stdev_doublet_rate" + description: "Uncertainty in the expected doublet rate." + info: null + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "Number of neighbors used to construct the KNN classifier of observed\ + \ transcriptomes\nand simulated doublets.\n" + info: null + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--sim_doublet_ratio" + description: "Number of doublets to simulate relative to the number of observed\n\ + transcriptomes.\n" + info: null + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts" + description: "The number of minimal UMI counts per cell that have to be present\ + \ for initial cell detection." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "The number of cells in which UMIs for a gene were detected." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_gene_variablity_percent" + description: "Used for gene filtering prior to PCA. Keep the most highly variable\ + \ genes (in the top min_gene_variability_pctl percentile), as measured by the\ + \ v-statistic [Klein et al., Cell 2015]." + info: null + default: + - 85.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_pca_components" + description: "Number of principal components to use during PCA dimensionality\ + \ reduction." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--distance_metric" + description: "The distance metric used for computing similarities." + info: null + default: + - "euclidean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--allow_automatic_threshold_detection_fail" + description: "When scrublet fails to automatically determine the double score\ + \ threshold, \nallow the component to continue and set the output columns to\ + \ NA.\n" + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Doublet detection using the Scrublet method (Wolock, Lopez and Klein,\ + \ 2019).\nThe method tests for potential doublets by using the expression profiles\ + \ of\ncells to generate synthetic potential doubles which are tested against cells.\ + \ \nThe method returns a \"doublet score\" on which it calls for potential doublets.\n\ + \nFor the source code please visit https://github.com/AllonKleinLab/scrublet.\n\n\ + For 10x we expect the doublet rates to be:\n Multiplet Rate (%) - # of Cells Loaded\ + \ - # of Cells Recovered\n ~0.4% ~800 ~500\n ~0.8% ~1,600 ~1,000\n ~1.6% ~3,200\ + \ ~2,000\n ~2.3% ~4,800 ~3,000\n ~3.1% ~6,400 ~4,000\n ~3.9% ~8,000 ~5,000\n\ + \ ~4.6% ~9,600 ~6,000\n ~5.4% ~11,200 ~7,000\n ~6.1% ~12,800 ~8,000\n ~6.9%\ + \ ~14,400 ~9,000\n ~7.6% ~16,000 ~10,000\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" + docker_run_args: + - "--env NUMBA_CACHE_DIR=/tmp" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scrublet" + - "annoy==1.17.3" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/filter_with_scrublet/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/filter_with_scrublet" + executable: "target/executable/filter/filter_with_scrublet/filter_with_scrublet" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/filter_with_scrublet/compress_h5mu.py b/target/executable/filter/filter_with_scrublet/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/filter_with_scrublet/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/filter_with_scrublet/filter_with_scrublet b/target/executable/filter/filter_with_scrublet/filter_with_scrublet new file mode 100755 index 00000000..cceed8a4 --- /dev/null +++ b/target/executable/filter/filter_with_scrublet/filter_with_scrublet @@ -0,0 +1,1653 @@ +#!/usr/bin/env bash + +# filter_with_scrublet main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (contributor) +# * Robrecht Cannoodt (maintainer, contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="filter_with_scrublet" +VIASH_META_FUNCTIONALITY_NAME="filter_with_scrublet" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps build-essential && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "scrublet" "annoy==1.17.3" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component filter filter_with_scrublet" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm --env NUMBA_CACHE_DIR=/tmp) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "filter_with_scrublet main" + echo "" + echo "Doublet detection using the Scrublet method (Wolock, Lopez and Klein, 2019)." + echo "The method tests for potential doublets by using the expression profiles of" + echo "cells to generate synthetic potential doubles which are tested against cells." + echo "The method returns a \"doublet score\" on which it calls for potential doublets." + echo "" + echo "For the source code please visit https://github.com/AllonKleinLab/scrublet." + echo "" + echo "For 10x we expect the doublet rates to be:" + echo " Multiplet Rate (%) - # of Cells Loaded - # of Cells Recovered" + echo " ~0.4% ~800 ~500" + echo " ~0.8% ~1,600 ~1,000" + echo " ~1.6% ~3,200 ~2,000" + echo " ~2.3% ~4,800 ~3,000" + echo " ~3.1% ~6,400 ~4,000" + echo " ~3.9% ~8,000 ~5,000" + echo " ~4.6% ~9,600 ~6,000" + echo " ~5.4% ~11,200 ~7,000" + echo " ~6.1% ~12,800 ~8,000" + echo " ~6.9% ~14,400 ~9,000" + echo " ~7.6% ~16,000 ~10,000" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " Input layer to use as data for calculating doublets. .X is used not" + echo " specified." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --obs_name_filter" + echo " type: string" + echo " default: filter_with_scrublet" + echo " In which .obs slot to store a boolean array corresponding to which" + echo " observations should be filtered out." + echo "" + echo " --do_subset" + echo " type: boolean_true" + echo " Whether to subset before storing the output." + echo "" + echo " --obs_name_doublet_score" + echo " type: string" + echo " default: scrublet_doublet_score" + echo " Name of the doublet scores column in the obs slot of the returned" + echo " object." + echo "" + echo " --expected_doublet_rate" + echo " type: double" + echo " min: 0.0" + echo " max: 1.0" + echo " The estimated fraction of doublets as from the experimental setup." + echo "" + echo " --stdev_doublet_rate" + echo " type: double" + echo " min: 0.0" + echo " Uncertainty in the expected doublet rate." + echo "" + echo " --n_neighbors" + echo " type: integer" + echo " min: 0" + echo " Number of neighbors used to construct the KNN classifier of observed" + echo " transcriptomes" + echo " and simulated doublets." + echo "" + echo " --sim_doublet_ratio" + echo " type: double" + echo " min: 0.0" + echo " Number of doublets to simulate relative to the number of observed" + echo " transcriptomes." + echo "" + echo " --min_counts" + echo " type: integer" + echo " default: 2" + echo " The number of minimal UMI counts per cell that have to be present for" + echo " initial cell detection." + echo "" + echo " --min_cells" + echo " type: integer" + echo " default: 3" + echo " The number of cells in which UMIs for a gene were detected." + echo "" + echo " --min_gene_variablity_percent" + echo " type: double" + echo " default: 85.0" + echo " Used for gene filtering prior to PCA. Keep the most highly variable" + echo " genes (in the top min_gene_variability_pctl percentile), as measured by" + echo " the v-statistic [Klein et al., Cell 2015]." + echo "" + echo " --num_pca_components" + echo " type: integer" + echo " default: 30" + echo " Number of principal components to use during PCA dimensionality" + echo " reduction." + echo "" + echo " --distance_metric" + echo " type: string" + echo " default: euclidean" + echo " The distance metric used for computing similarities." + echo "" + echo " --allow_automatic_threshold_detection_fail" + echo " type: boolean_true" + echo " When scrublet fails to automatically determine the double score" + echo " threshold," + echo " allow the component to continue and set the output columns to NA." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "filter_with_scrublet main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_name_filter) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_name_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_name_filter=*) + [ -n "$VIASH_PAR_OBS_NAME_FILTER" ] && ViashError Bad arguments for option \'--obs_name_filter=*\': \'$VIASH_PAR_OBS_NAME_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --do_subset) + [ -n "$VIASH_PAR_DO_SUBSET" ] && ViashError Bad arguments for option \'--do_subset\': \'$VIASH_PAR_DO_SUBSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DO_SUBSET=true + shift 1 + ;; + --obs_name_doublet_score) + [ -n "$VIASH_PAR_OBS_NAME_DOUBLET_SCORE" ] && ViashError Bad arguments for option \'--obs_name_doublet_score\': \'$VIASH_PAR_OBS_NAME_DOUBLET_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_DOUBLET_SCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_name_doublet_score. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_name_doublet_score=*) + [ -n "$VIASH_PAR_OBS_NAME_DOUBLET_SCORE" ] && ViashError Bad arguments for option \'--obs_name_doublet_score=*\': \'$VIASH_PAR_OBS_NAME_DOUBLET_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_NAME_DOUBLET_SCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_doublet_rate) + [ -n "$VIASH_PAR_EXPECTED_DOUBLET_RATE" ] && ViashError Bad arguments for option \'--expected_doublet_rate\': \'$VIASH_PAR_EXPECTED_DOUBLET_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_DOUBLET_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_doublet_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_doublet_rate=*) + [ -n "$VIASH_PAR_EXPECTED_DOUBLET_RATE" ] && ViashError Bad arguments for option \'--expected_doublet_rate=*\': \'$VIASH_PAR_EXPECTED_DOUBLET_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_DOUBLET_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --stdev_doublet_rate) + [ -n "$VIASH_PAR_STDEV_DOUBLET_RATE" ] && ViashError Bad arguments for option \'--stdev_doublet_rate\': \'$VIASH_PAR_STDEV_DOUBLET_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STDEV_DOUBLET_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --stdev_doublet_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --stdev_doublet_rate=*) + [ -n "$VIASH_PAR_STDEV_DOUBLET_RATE" ] && ViashError Bad arguments for option \'--stdev_doublet_rate=*\': \'$VIASH_PAR_STDEV_DOUBLET_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STDEV_DOUBLET_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_neighbors) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_neighbors=*) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors=*\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sim_doublet_ratio) + [ -n "$VIASH_PAR_SIM_DOUBLET_RATIO" ] && ViashError Bad arguments for option \'--sim_doublet_ratio\': \'$VIASH_PAR_SIM_DOUBLET_RATIO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SIM_DOUBLET_RATIO="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sim_doublet_ratio. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sim_doublet_ratio=*) + [ -n "$VIASH_PAR_SIM_DOUBLET_RATIO" ] && ViashError Bad arguments for option \'--sim_doublet_ratio=*\': \'$VIASH_PAR_SIM_DOUBLET_RATIO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SIM_DOUBLET_RATIO=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_counts) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_counts=*) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts=*\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells=*) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells=*\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_gene_variablity_percent) + [ -n "$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT" ] && ViashError Bad arguments for option \'--min_gene_variablity_percent\': \'$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_gene_variablity_percent. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_gene_variablity_percent=*) + [ -n "$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT" ] && ViashError Bad arguments for option \'--min_gene_variablity_percent=*\': \'$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_pca_components) + [ -n "$VIASH_PAR_NUM_PCA_COMPONENTS" ] && ViashError Bad arguments for option \'--num_pca_components\': \'$VIASH_PAR_NUM_PCA_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_PCA_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_pca_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_pca_components=*) + [ -n "$VIASH_PAR_NUM_PCA_COMPONENTS" ] && ViashError Bad arguments for option \'--num_pca_components=*\': \'$VIASH_PAR_NUM_PCA_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_PCA_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --distance_metric) + [ -n "$VIASH_PAR_DISTANCE_METRIC" ] && ViashError Bad arguments for option \'--distance_metric\': \'$VIASH_PAR_DISTANCE_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DISTANCE_METRIC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --distance_metric. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --distance_metric=*) + [ -n "$VIASH_PAR_DISTANCE_METRIC" ] && ViashError Bad arguments for option \'--distance_metric=*\': \'$VIASH_PAR_DISTANCE_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DISTANCE_METRIC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --allow_automatic_threshold_detection_fail) + [ -n "$VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL" ] && ViashError Bad arguments for option \'--allow_automatic_threshold_detection_fail\': \'$VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/filter_with_scrublet:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then + VIASH_PAR_OBS_NAME_FILTER="filter_with_scrublet" +fi +if [ -z ${VIASH_PAR_DO_SUBSET+x} ]; then + VIASH_PAR_DO_SUBSET="false" +fi +if [ -z ${VIASH_PAR_OBS_NAME_DOUBLET_SCORE+x} ]; then + VIASH_PAR_OBS_NAME_DOUBLET_SCORE="scrublet_doublet_score" +fi +if [ -z ${VIASH_PAR_MIN_COUNTS+x} ]; then + VIASH_PAR_MIN_COUNTS="2" +fi +if [ -z ${VIASH_PAR_MIN_CELLS+x} ]; then + VIASH_PAR_MIN_CELLS="3" +fi +if [ -z ${VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT+x} ]; then + VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT="85.0" +fi +if [ -z ${VIASH_PAR_NUM_PCA_COMPONENTS+x} ]; then + VIASH_PAR_NUM_PCA_COMPONENTS="30" +fi +if [ -z ${VIASH_PAR_DISTANCE_METRIC+x} ]; then + VIASH_PAR_DISTANCE_METRIC="euclidean" +fi +if [ -z ${VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL+x} ]; then + VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_DO_SUBSET" ]]; then + if ! [[ "$VIASH_PAR_DO_SUBSET" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--do_subset' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXPECTED_DOUBLET_RATE" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_DOUBLET_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--expected_doublet_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_EXPECTED_DOUBLET_RATE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--expected_doublet_rate' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_EXPECTED_DOUBLET_RATE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--expected_doublet_rate' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--expected_doublet_rate' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_EXPECTED_DOUBLET_RATE '<=' 1.0 | bc` -eq 1 ]]; then + ViashError '--expected_doublet_rate' has to be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_EXPECTED_DOUBLET_RATE -v n2=1.0 'BEGIN { print (n1 <= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--expected_doublet_rate' has be less than or equal to 1.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--expected_doublet_rate' specifies a maximum value but the value was not verified as neither \'bc\' or \'awk\' are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_STDEV_DOUBLET_RATE" ]]; then + if ! [[ "$VIASH_PAR_STDEV_DOUBLET_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--stdev_doublet_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_STDEV_DOUBLET_RATE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--stdev_doublet_rate' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_STDEV_DOUBLET_RATE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--stdev_doublet_rate' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--stdev_doublet_rate' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_N_NEIGHBORS" ]]; then + if ! [[ "$VIASH_PAR_N_NEIGHBORS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_neighbors' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_N_NEIGHBORS -lt 0 ]]; then + ViashError '--n_neighbors' has be more than or equal to 0. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SIM_DOUBLET_RATIO" ]]; then + if ! [[ "$VIASH_PAR_SIM_DOUBLET_RATIO" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--sim_doublet_ratio' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_SIM_DOUBLET_RATIO '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--sim_doublet_ratio' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_SIM_DOUBLET_RATIO -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--sim_doublet_ratio' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--sim_doublet_ratio' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT" ]]; then + if ! [[ "$VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_gene_variablity_percent' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NUM_PCA_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_NUM_PCA_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_pca_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL" ]]; then + if ! [[ "$VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--allow_automatic_threshold_detection_fail' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-filter_with_scrublet-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scrublet as scr +import mudata as mu +import numpy as np +import sys +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'do_subset': $( if [ ! -z ${VIASH_PAR_DO_SUBSET+x} ]; then echo "r'${VIASH_PAR_DO_SUBSET//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'obs_name_doublet_score': $( if [ ! -z ${VIASH_PAR_OBS_NAME_DOUBLET_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_DOUBLET_SCORE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'expected_doublet_rate': $( if [ ! -z ${VIASH_PAR_EXPECTED_DOUBLET_RATE+x} ]; then echo "float(r'${VIASH_PAR_EXPECTED_DOUBLET_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'stdev_doublet_rate': $( if [ ! -z ${VIASH_PAR_STDEV_DOUBLET_RATE+x} ]; then echo "float(r'${VIASH_PAR_STDEV_DOUBLET_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sim_doublet_ratio': $( if [ ! -z ${VIASH_PAR_SIM_DOUBLET_RATIO+x} ]; then echo "float(r'${VIASH_PAR_SIM_DOUBLET_RATIO//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_gene_variablity_percent': $( if [ ! -z ${VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT+x} ]; then echo "float(r'${VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'num_pca_components': $( if [ ! -z ${VIASH_PAR_NUM_PCA_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_PCA_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'distance_metric': $( if [ ! -z ${VIASH_PAR_DISTANCE_METRIC+x} ]; then echo "r'${VIASH_PAR_DISTANCE_METRIC//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'allow_automatic_threshold_detection_fail': $( if [ ! -z ${VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL+x} ]; then echo "r'${VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"]) +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] + +if 0 in input_layer.shape: + raise ValueError( + f"Modality {par['modality']} of input Mudata {par['input']} appears " + f"to be empty (shape: {input_layer.shape})." + ) + +logger.info("\\tRunning scrublet") +initializer_args = { + arg: par[arg] + for arg in ( + "expected_doublet_rate", + "stdev_doublet_rate", + "n_neighbors", + "sim_doublet_ratio", + ) + if par[arg] is not None +} +scrub = scr.Scrublet(input_layer, **initializer_args) + +doublet_scores, predicted_doublets = scrub.scrub_doublets( + min_counts=par["min_counts"], + min_cells=par["min_cells"], + min_gene_variability_pctl=par["min_gene_variablity_percent"], + n_prin_comps=par["num_pca_components"], + distance_metric=par["distance_metric"], + use_approx_neighbors=False, +) + +try: + keep_cells = np.invert(predicted_doublets) +except TypeError: + if par["allow_automatic_threshold_detection_fail"]: + # Scrublet might not throw an error and return None if it fails to detect doublets... + logger.info( + "\\tScrublet could not automatically detect the doublet score threshold. Setting output columns to NA." + ) + keep_cells = np.nan + doublet_scores = np.nan + else: + raise RuntimeError( + "Scrublet could not automatically detect the doublet score threshold. " + "--allow_automatic_threshold_detection_fail can be used to ignore this failure " + "and set the corresponding output columns to NA." + ) + +logger.info("\\tStoring output into .obs") +if par["obs_name_doublet_score"] is not None: + data.obs[par["obs_name_doublet_score"]] = doublet_scores + data.obs[par["obs_name_doublet_score"]] = data.obs[ + par["obs_name_doublet_score"] + ].astype("float64") +if par["obs_name_filter"] is not None: + data.obs[par["obs_name_filter"]] = keep_cells + data.obs[par["obs_name_filter"]] = data.obs[par["obs_name_filter"]].astype( + pd.BooleanDtype() + ) + +if par["do_subset"]: + if pd.api.types.is_scalar(keep_cells) and pd.isna(keep_cells): + logger.warning("Not subsetting beacuse doublets were not predicted") + else: + data = data[keep_cells, :] + +logger.info( + "Writing h5mu to %s, with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/filter_with_scrublet/nextflow_labels.config b/target/executable/filter/filter_with_scrublet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/filter_with_scrublet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/filter_with_scrublet/setup_logger.py b/target/executable/filter/filter_with_scrublet/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/filter_with_scrublet/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/intersect_obs/.config.vsh.yaml b/target/executable/filter/intersect_obs/.config.vsh.yaml new file mode 100644 index 00000000..cd6b360d --- /dev/null +++ b/target/executable/filter/intersect_obs/.config.vsh.yaml @@ -0,0 +1,265 @@ +name: "intersect_obs" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Isabelle Bergiers" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "Isabelle-b" + orcid: "0000-0001-9622-7960" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Scientist OMICS Technology" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modalities" + description: "Which modalities from the input MuData file to process.\n" + info: null + example: + - "rna" + - "prot" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create an intersection between two or more modalities.\n\nThis component\ + \ removes any observations which are not present in all modalities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/intersect_obs/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/intersect_obs" + executable: "target/executable/filter/intersect_obs/intersect_obs" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/intersect_obs/compress_h5mu.py b/target/executable/filter/intersect_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/intersect_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/intersect_obs/intersect_obs b/target/executable/filter/intersect_obs/intersect_obs new file mode 100755 index 00000000..9fc4e23c --- /dev/null +++ b/target/executable/filter/intersect_obs/intersect_obs @@ -0,0 +1,1237 @@ +#!/usr/bin/env bash + +# intersect_obs main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Isabelle Bergiers (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="intersect_obs" +VIASH_META_FUNCTIONALITY_NAME="intersect_obs" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Isabelle Bergiers" +LABEL org.opencontainers.image.description="Companion container for running component filter intersect_obs" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "intersect_obs main" + echo "" + echo "Create an intersection between two or more modalities." + echo "" + echo "This component removes any observations which are not present in all modalities." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modalities" + echo " type: string, required parameter, multiple values allowed" + echo " example: rna;prot" + echo " Which modalities from the input MuData file to process." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "intersect_obs main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modalities) + if [ -z "$VIASH_PAR_MODALITIES" ]; then + VIASH_PAR_MODALITIES="$2" + else + VIASH_PAR_MODALITIES="$VIASH_PAR_MODALITIES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modalities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modalities=*) + if [ -z "$VIASH_PAR_MODALITIES" ]; then + VIASH_PAR_MODALITIES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MODALITIES="$VIASH_PAR_MODALITIES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/intersect_obs:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODALITIES+x} ]; then + ViashError '--modalities' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-intersect_obs-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import anndata as ad +import sys +from pathlib import Path +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modalities': $( if [ ! -z ${VIASH_PAR_MODALITIES+x} ]; then echo "r'${VIASH_PAR_MODALITIES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + + +def main(): + modality_names = par["modalities"] + + if len(modality_names) < 2: + raise ValueError("Please provide two more more modalities.") + + obs_names = {} + for mod_name in par["modalities"]: + try: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + except KeyError: + raise ValueError( + f"Modality {mod_name} does not exist for file {par['input']}." + ) + + obs_names[mod_name] = modality.obs_names.copy() + del modality + + intersected_index = None + for mod_name, mod_index in obs_names.items(): + if intersected_index is None: + intersected_index = mod_index + continue + intersected_index = intersected_index.intersection(mod_index) + + output_file = Path(par["output"]) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + + mdata = mu.MuData({modality: ad.AnnData() for modality in modality_names}) + mdata.write(output_file_uncompressed, compression=par["output_compression"]) + + for mod_name in modality_names: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + intersected_modality = modality[intersected_index] + mu.write_h5ad(output_file_uncompressed, data=intersected_modality, mod=mod_name) + + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, output_file, compression=par["output_compression"] + ) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/intersect_obs/nextflow_labels.config b/target/executable/filter/intersect_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/intersect_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/intersect_obs/setup_logger.py b/target/executable/filter/intersect_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/intersect_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/remove_modality/.config.vsh.yaml b/target/executable/filter/remove_modality/.config.vsh.yaml new file mode 100644 index 00000000..60155822 --- /dev/null +++ b/target/executable/filter/remove_modality/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "remove_modality" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Name(s) of the modality to remove\n" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Remove a modality from a .h5mu file\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/remove_modality/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/remove_modality" + executable: "target/executable/filter/remove_modality/remove_modality" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/remove_modality/nextflow_labels.config b/target/executable/filter/remove_modality/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/remove_modality/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/remove_modality/remove_modality b/target/executable/filter/remove_modality/remove_modality new file mode 100755 index 00000000..ceb0105a --- /dev/null +++ b/target/executable/filter/remove_modality/remove_modality @@ -0,0 +1,1183 @@ +#!/usr/bin/env bash + +# remove_modality main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="remove_modality" +VIASH_META_FUNCTIONALITY_NAME="remove_modality" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component filter remove_modality" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "remove_modality main" + echo "" + echo "Remove a modality from a .h5mu file" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string, required parameter, multiple values allowed" + echo " Name(s) of the modality to remove" + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "remove_modality main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY="$2" + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + if [ -z "$VIASH_PAR_MODALITY" ]; then + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MODALITY="$VIASH_PAR_MODALITY;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/remove_modality:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + ViashError '--modality' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-remove_modality-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from mudata import read_h5mu, MuData + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + + +input_mudata = read_h5mu(par["input"]) +new_mods = { + mod_name: mod + for mod_name, mod in input_mudata.mod.items() + if mod_name not in par["modality"] +} + +new_mudata = MuData(new_mods) +new_mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/subset_h5mu/.config.vsh.yaml b/target/executable/filter/subset_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..d4c053fe --- /dev/null +++ b/target/executable/filter/subset_h5mu/.config.vsh.yaml @@ -0,0 +1,259 @@ +name: "subset_h5mu" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--number_of_observations" + description: "Number of observations to be selected from the h5mu file." + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a subset of a mudata file by selecting the first number of observations\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/subset_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/subset_h5mu" + executable: "target/executable/filter/subset_h5mu/subset_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/subset_h5mu/nextflow_labels.config b/target/executable/filter/subset_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/subset_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/subset_h5mu/setup_logger.py b/target/executable/filter/subset_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/subset_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/subset_h5mu/subset_h5mu b/target/executable/filter/subset_h5mu/subset_h5mu new file mode 100755 index 00000000..4d0f7eaf --- /dev/null +++ b/target/executable/filter/subset_h5mu/subset_h5mu @@ -0,0 +1,1205 @@ +#!/usr/bin/env bash + +# subset_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="subset_h5mu" +VIASH_META_FUNCTIONALITY_NAME="subset_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component filter subset_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "subset_h5mu main" + echo "" + echo "Create a subset of a mudata file by selecting the first number of observations" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --number_of_observations" + echo " type: integer" + echo " example: 5" + echo " Number of observations to be selected from the h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "subset_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --number_of_observations) + [ -n "$VIASH_PAR_NUMBER_OF_OBSERVATIONS" ] && ViashError Bad arguments for option \'--number_of_observations\': \'$VIASH_PAR_NUMBER_OF_OBSERVATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUMBER_OF_OBSERVATIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --number_of_observations. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --number_of_observations=*) + [ -n "$VIASH_PAR_NUMBER_OF_OBSERVATIONS" ] && ViashError Bad arguments for option \'--number_of_observations=*\': \'$VIASH_PAR_NUMBER_OF_OBSERVATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUMBER_OF_OBSERVATIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/subset_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NUMBER_OF_OBSERVATIONS" ]]; then + if ! [[ "$VIASH_PAR_NUMBER_OF_OBSERVATIONS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--number_of_observations' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-subset_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'number_of_observations': $( if [ ! -z ${VIASH_PAR_NUMBER_OF_OBSERVATIONS+x} ]; then echo "int(r'${VIASH_PAR_NUMBER_OF_OBSERVATIONS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +if __name__ == "__main__": + # read data + data = mudata.read(par["input"]) + + # subset data + if par["modality"]: + data.mod[par["modality"]] = data.mod[par["modality"]][ + : par["number_of_observations"] + ] + else: + data = data[: par["number_of_observations"]] + + # write data + data.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/filter/subset_obsp/.config.vsh.yaml b/target/executable/filter/subset_obsp/.config.vsh.yaml new file mode 100644 index 00000000..6001d85d --- /dev/null +++ b/target/executable/filter/subset_obsp/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "subset_obsp" +namespace: "filter" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsp_key" + description: "The .obsp field to be filtered." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_key" + description: "The .obs column to filter on." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_value" + description: "The value to filter on in the .obs column." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_key" + description: "The .obsm key to store the subset in." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a subset of an .obsp field in a mudata file, by filtering the\ + \ columns based on the values of an .obs column. The resulting subset is moved to\ + \ an .obsm slot.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/subset_obsp/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/filter/subset_obsp" + executable: "target/executable/filter/subset_obsp/subset_obsp" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/filter/subset_obsp/compress_h5mu.py b/target/executable/filter/subset_obsp/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/filter/subset_obsp/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/filter/subset_obsp/nextflow_labels.config b/target/executable/filter/subset_obsp/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/filter/subset_obsp/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/filter/subset_obsp/setup_logger.py b/target/executable/filter/subset_obsp/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/filter/subset_obsp/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/filter/subset_obsp/subset_obsp b/target/executable/filter/subset_obsp/subset_obsp new file mode 100755 index 00000000..3d225c5f --- /dev/null +++ b/target/executable/filter/subset_obsp/subset_obsp @@ -0,0 +1,1290 @@ +#!/usr/bin/env bash + +# subset_obsp main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="subset_obsp" +VIASH_META_FUNCTIONALITY_NAME="subset_obsp" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component filter subset_obsp" +LABEL org.opencontainers.image.created="2025-08-22T07:23:19Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "subset_obsp main" + echo "" + echo "Create a subset of an .obsp field in a mudata file, by filtering the columns" + echo "based on the values of an .obs column. The resulting subset is moved to an .obsm" + echo "slot." + echo "" + echo "Arguments:" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Input:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_obsp_key" + echo " type: string, required parameter" + echo " The .obsp field to be filtered." + echo "" + echo " --input_obs_key" + echo " type: string, required parameter" + echo " The .obs column to filter on." + echo "" + echo " --input_obs_value" + echo " type: string, required parameter" + echo " The value to filter on in the .obs column." + echo "" + echo "Output:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_obsm_key" + echo " type: string, required parameter" + echo " The .obsm key to store the subset in." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "subset_obsp main" + exit + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obsp_key) + [ -n "$VIASH_PAR_INPUT_OBSP_KEY" ] && ViashError Bad arguments for option \'--input_obsp_key\': \'$VIASH_PAR_INPUT_OBSP_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSP_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obsp_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obsp_key=*) + [ -n "$VIASH_PAR_INPUT_OBSP_KEY" ] && ViashError Bad arguments for option \'--input_obsp_key=*\': \'$VIASH_PAR_INPUT_OBSP_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSP_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_key) + [ -n "$VIASH_PAR_INPUT_OBS_KEY" ] && ViashError Bad arguments for option \'--input_obs_key\': \'$VIASH_PAR_INPUT_OBS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_key=*) + [ -n "$VIASH_PAR_INPUT_OBS_KEY" ] && ViashError Bad arguments for option \'--input_obs_key=*\': \'$VIASH_PAR_INPUT_OBS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_value) + [ -n "$VIASH_PAR_INPUT_OBS_VALUE" ] && ViashError Bad arguments for option \'--input_obs_value\': \'$VIASH_PAR_INPUT_OBS_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_value=*) + [ -n "$VIASH_PAR_INPUT_OBS_VALUE" ] && ViashError Bad arguments for option \'--input_obs_value=*\': \'$VIASH_PAR_INPUT_OBS_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obsm_key) + [ -n "$VIASH_PAR_OUTPUT_OBSM_KEY" ] && ViashError Bad arguments for option \'--output_obsm_key\': \'$VIASH_PAR_OUTPUT_OBSM_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obsm_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obsm_key=*) + [ -n "$VIASH_PAR_OUTPUT_OBSM_KEY" ] && ViashError Bad arguments for option \'--output_obsm_key=*\': \'$VIASH_PAR_OUTPUT_OBSM_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/filter/subset_obsp:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_OBSP_KEY+x} ]; then + ViashError '--input_obsp_key' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_OBS_KEY+x} ]; then + ViashError '--input_obs_key' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_OBS_VALUE+x} ]; then + ViashError '--input_obs_value' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_OBSM_KEY+x} ]; then + ViashError '--output_obsm_key' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-subset_obsp-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obsp_key': $( if [ ! -z ${VIASH_PAR_INPUT_OBSP_KEY+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSP_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_key': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_KEY+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_value': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_VALUE+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_VALUE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obsm_key': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_KEY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBSM_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s, modality", par["input"], par["modality"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info( + f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}" + ) + # .obsp, .obs and .obsm index and .obsp columns all have a dimension length of \`n_obs\` + # the index dimensions remain unaltered, but .obsp columns will be subset + obsp = adata.obsp[par["input_obsp_key"]] + idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"] + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + idx = idx.to_numpy(dtype="bool", na_value=False) + obsm_subset = obsp[:, idx] + + logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}") + adata.obsm[par["output_obsm_key"]] = obsm_subset + + logger.info( + "Writing output to %s, modality %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/bcftools/.config.vsh.yaml b/target/executable/genetic_demux/bcftools/.config.vsh.yaml new file mode 100644 index 00000000..26d974f6 --- /dev/null +++ b/target/executable/genetic_demux/bcftools/.config.vsh.yaml @@ -0,0 +1,244 @@ +name: "bcftools" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--vcf" + description: "VCF files, must have the same sample columns appearing in the same\ + \ order." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--concat" + description: "Concatenate or combine VCFs and sort them." + info: null + direction: "input" + - type: "boolean_true" + name: "--filter" + description: "Filter VCFs." + info: null + direction: "input" + - type: "integer" + name: "--filter_qual" + description: "Filter VCFs with specified QUAL threshold." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "bcftools output directory" + info: null + example: + - "bcftools_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter the variants called by freebayes or cellSNP" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "bzip2" + - "gcc" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "libncursesw5-dev" + - "liblzma-dev" + - "autoconf" + - "automake" + - "perl" + - "libcurl4-gnutls-dev" + - "libssl-dev" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/bcftools/releases/download/1.16/bcftools-1.16.tar.bz2\ + \ -O bcftools.tar.bz2 && tar -xjvf bcftools.tar.bz2 && cd bcftools-1.16 && make\ + \ prefix=/usr/local install" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/bcftools/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/bcftools" + executable: "target/executable/genetic_demux/bcftools/bcftools" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/bcftools/bcftools b/target/executable/genetic_demux/bcftools/bcftools new file mode 100755 index 00000000..15eddae6 --- /dev/null +++ b/target/executable/genetic_demux/bcftools/bcftools @@ -0,0 +1,1218 @@ +#!/usr/bin/env bash + +# bcftools main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bcftools" +VIASH_META_FUNCTIONALITY_NAME="bcftools" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:latest +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y wget bzip2 gcc make libbz2-dev zlib1g-dev libncurses5-dev libncursesw5-dev liblzma-dev autoconf automake perl libcurl4-gnutls-dev libssl-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://github.com/samtools/bcftools/releases/download/1.16/bcftools-1.16.tar.bz2 -O bcftools.tar.bz2 && tar -xjvf bcftools.tar.bz2 && cd bcftools-1.16 && make prefix=/usr/local install +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux bcftools" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bcftools main" + echo "" + echo "Filter the variants called by freebayes or cellSNP" + echo "" + echo "Arguments:" + echo " --vcf" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " VCF files, must have the same sample columns appearing in the same" + echo " order." + echo "" + echo " --concat" + echo " type: boolean_true" + echo " Concatenate or combine VCFs and sort them." + echo "" + echo " --filter" + echo " type: boolean_true" + echo " Filter VCFs." + echo "" + echo " --filter_qual" + echo " type: integer" + echo " default: 30" + echo " Filter VCFs with specified QUAL threshold." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: bcftools_out" + echo " bcftools output directory" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bcftools main" + exit + ;; + --vcf) + if [ -z "$VIASH_PAR_VCF" ]; then + VIASH_PAR_VCF="$2" + else + VIASH_PAR_VCF="$VIASH_PAR_VCF;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf=*) + if [ -z "$VIASH_PAR_VCF" ]; then + VIASH_PAR_VCF=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VCF="$VIASH_PAR_VCF;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --concat) + [ -n "$VIASH_PAR_CONCAT" ] && ViashError Bad arguments for option \'--concat\': \'$VIASH_PAR_CONCAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CONCAT=true + shift 1 + ;; + --filter) + [ -n "$VIASH_PAR_FILTER" ] && ViashError Bad arguments for option \'--filter\': \'$VIASH_PAR_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTER=true + shift 1 + ;; + --filter_qual) + [ -n "$VIASH_PAR_FILTER_QUAL" ] && ViashError Bad arguments for option \'--filter_qual\': \'$VIASH_PAR_FILTER_QUAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTER_QUAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --filter_qual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --filter_qual=*) + [ -n "$VIASH_PAR_FILTER_QUAL" ] && ViashError Bad arguments for option \'--filter_qual=*\': \'$VIASH_PAR_FILTER_QUAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTER_QUAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/bcftools:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_VCF+x} ]; then + ViashError '--vcf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_CONCAT+x} ]; then + VIASH_PAR_CONCAT="false" +fi +if [ -z ${VIASH_PAR_FILTER+x} ]; then + VIASH_PAR_FILTER="false" +fi +if [ -z ${VIASH_PAR_FILTER_QUAL+x} ]; then + VIASH_PAR_FILTER_QUAL="30" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_VCF" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_VCF; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_CONCAT" ]]; then + if ! [[ "$VIASH_PAR_CONCAT" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--concat' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FILTER" ]]; then + if ! [[ "$VIASH_PAR_FILTER" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--filter' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FILTER_QUAL" ]]; then + if ! [[ "$VIASH_PAR_FILTER_QUAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--filter_qual' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_TEST_VCF=() + IFS=';' + for var in $VIASH_PAR_VCF; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_VCF+=( "$var" ) + done + VIASH_PAR_VCF=$(IFS=';' ; echo "${VIASH_TEST_VCF[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bcftools-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_CONCAT+x} ]; then echo "${VIASH_PAR_CONCAT}" | sed "s#'#'\"'\"'#g;s#.*#par_concat='&'#" ; else echo "# par_concat="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTER+x} ]; then echo "${VIASH_PAR_FILTER}" | sed "s#'#'\"'\"'#g;s#.*#par_filter='&'#" ; else echo "# par_filter="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTER_QUAL+x} ]; then echo "${VIASH_PAR_FILTER_QUAL}" | sed "s#'#'\"'\"'#g;s#.*#par_filter_qual='&'#" ; else echo "# par_filter_qual="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +if [ ! -d "\$par_output" ]; then + mkdir -p \$par_output +fi + +IFS=";" read -a vcf_list <<< \$par_vcf + + +if [ "\$par_concat" = true ] && [ "\$par_filter" = true ] ; then + bcftools concat -o "\$par_output/concated_chroms.vcf" \${vcf_list[@]} + bcftools sort "\$par_output/concated_chroms.vcf" -o "\$par_output/sorted_concated_chroms.vcf" + bcftools filter -i "QUAL>\$par_filter_qual" "\$par_output/sorted_concated_chroms.vcf" -o "\$par_output/filtered_sorted_concated_chroms.vcf" + +elif [ "\$par_filter" = true ] ; then + bcftools filter -i "QUAL>\$par_filter_qual" \${vcf_list[@]} -o "\$par_output/filtered.vcf" + +else + bcftools concat -o "\$par_output/concated_chroms.vcf" \${vcf_list[@]} + bcftools sort "\$par_output/concated_chroms.vcf" -o "\$par_output/sorted_concated_chroms.vcf" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_VCF" ]; then + unset VIASH_TEST_VCF + IFS=';' + for var in $VIASH_PAR_VCF; do + unset IFS + if [ -z "$VIASH_TEST_VCF" ]; then + VIASH_TEST_VCF="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_VCF="$VIASH_TEST_VCF;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_VCF="$VIASH_TEST_VCF" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/bcftools/nextflow_labels.config b/target/executable/genetic_demux/bcftools/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/bcftools/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/cellsnp/.config.vsh.yaml b/target/executable/genetic_demux/cellsnp/.config.vsh.yaml new file mode 100644 index 00000000..f5296b71 --- /dev/null +++ b/target/executable/genetic_demux/cellsnp/.config.vsh.yaml @@ -0,0 +1,418 @@ +name: "cellsnp" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam_file" + description: "Indexed sam/bam file(s), comma separated multiple samples. Mode\ + \ 1a & 2a: one sam/bam file with single cell. Mode 1b & 2b: one or multiple\ + \ bulk sam/bam files, no barcodes needed, but sample ids and regionsVCF." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sam_index_file" + description: "Input SAM/BAM Index file, problem with samFileList." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sam_fileList" + description: "A list file containing bam files, each per line, for Mode 1b & 2b." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--regions_vcf" + description: "A vcf file listing all candidate SNPs, for fetch each variants.\ + \ If None, pileup the genome. Needed for bulk samples." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targets_vcf" + description: "Similar as --regions_vcf, but the next position is accessed by streaming\ + \ rather than indexing/jumping (like -T in samtools/bcftools mpileup)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode_file" + description: "A plain file listing all effective cell barcode." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_list" + description: "A list file containing sample IDs, each per line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sample_ids" + description: "Comma separated sample ids." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--genotype" + description: "If use, do genotyping in addition to counting." + info: null + direction: "input" + - type: "boolean_true" + name: "--gzip" + description: "If use, the output files will be zipped into BGZF format." + info: null + direction: "input" + - type: "boolean_true" + name: "--print_skip_snps" + description: "If use, the SNPs skipped when loading VCF will be printed." + info: null + direction: "input" + - type: "string" + name: "--chrom" + description: "The chromosomes to use in integer format 1-22, comma separated" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_tag" + description: "Tag for cell barcodes, turn off with None." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--umi_tag" + description: "Tag for UMI: UR, Auto, None. For Auto mode, use UR if barcodes is\ + \ inputted, otherwise use None. None mode means no UMI but read counts." + info: null + default: + - "Auto" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_count" + description: "Minimum aggragated count." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_maf" + description: "Minimum minor allele frequency." + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--doublet_gl" + description: "If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5." + info: null + direction: "input" + - type: "string" + name: "--incl_flag" + description: "Required flags: skip reads with all mask bits unset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--excl_flag" + description: "Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL\ + \ (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--count_orphan" + description: "If use, do not skip anomalous read pairs." + info: null + direction: "input" + - type: "integer" + name: "--min_mapq" + description: "Minimum MAPQ for read filtering." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_len" + description: "Minimum mapped length for read filtering." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "--outDir" + description: "Output directory for VCF and sparse matrices." + info: null + example: + - "cellsnp_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "cellSNP aims to pileup the expressed alleles in single-cell or bulk\ + \ RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell\ + \ RNA-seq data, particularly with vireo." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "gcc" + - "zlib1g" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "liblzma-dev" + - "autoconf" + - "automake" + - "perl" + - "libcurl4-gnutls-dev" + - "libssl-dev" + - "git" + - "bzip2" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2\ + \ -O htslib.tar.bz2 && tar -xjvf htslib.tar.bz2 && cd htslib-1.16 && make &&\ + \ make install" + - type: "docker" + run: + - "git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite\ + \ && autoreconf -iv && ./configure --with-htslib=/htslib-1.16 && make && make\ + \ install" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/cellsnp/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/cellsnp" + executable: "target/executable/genetic_demux/cellsnp/cellsnp" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/cellsnp/cellsnp b/target/executable/genetic_demux/cellsnp/cellsnp new file mode 100755 index 00000000..87badb60 --- /dev/null +++ b/target/executable/genetic_demux/cellsnp/cellsnp @@ -0,0 +1,1614 @@ +#!/usr/bin/env bash + +# cellsnp main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellsnp" +VIASH_META_FUNCTIONALITY_NAME="cellsnp" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:latest +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y wget gcc zlib1g make libbz2-dev zlib1g-dev libncurses5-dev liblzma-dev autoconf automake perl libcurl4-gnutls-dev libssl-dev git bzip2 && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2 -O htslib.tar.bz2 && tar -xjvf htslib.tar.bz2 && cd htslib-1.16 && make && make install +RUN git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite && autoreconf -iv && ./configure --with-htslib=/htslib-1.16 && make && make install +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux cellsnp" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellsnp main" + echo "" + echo "cellSNP aims to pileup the expressed alleles in single-cell or bulk RNA-seq" + echo "data. It can be directly used for donor deconvolution in multiplexed single-cell" + echo "RNA-seq data, particularly with vireo." + echo "" + echo "Input:" + echo " --sam_file" + echo " type: file, file must exist" + echo " Indexed sam/bam file(s), comma separated multiple samples. Mode 1a & 2a:" + echo " one sam/bam file with single cell. Mode 1b & 2b: one or multiple bulk" + echo " sam/bam files, no barcodes needed, but sample ids and regionsVCF." + echo "" + echo " --sam_index_file" + echo " type: file, file must exist" + echo " Input SAM/BAM Index file, problem with samFileList." + echo "" + echo " --sam_fileList" + echo " type: file, file must exist" + echo " A list file containing bam files, each per line, for Mode 1b & 2b." + echo "" + echo " --regions_vcf" + echo " type: file, file must exist" + echo " A vcf file listing all candidate SNPs, for fetch each variants. If None," + echo " pileup the genome. Needed for bulk samples." + echo "" + echo " --targets_vcf" + echo " type: file, file must exist" + echo " Similar as --regions_vcf, but the next position is accessed by streaming" + echo " rather than indexing/jumping (like -T in samtools/bcftools mpileup)." + echo "" + echo " --barcode_file" + echo " type: file, file must exist" + echo " A plain file listing all effective cell barcode." + echo "" + echo " --sample_list" + echo " type: file, file must exist" + echo " A list file containing sample IDs, each per line." + echo "" + echo " --sample_ids" + echo " type: string" + echo " Comma separated sample ids." + echo "" + echo " --genotype" + echo " type: boolean_true" + echo " If use, do genotyping in addition to counting." + echo "" + echo " --gzip" + echo " type: boolean_true" + echo " If use, the output files will be zipped into BGZF format." + echo "" + echo " --print_skip_snps" + echo " type: boolean_true" + echo " If use, the SNPs skipped when loading VCF will be printed." + echo "" + echo " --chrom" + echo " type: string" + echo " The chromosomes to use in integer format 1-22, comma separated" + echo "" + echo " --cell_tag" + echo " type: string" + echo " default: CB" + echo " Tag for cell barcodes, turn off with None." + echo "" + echo " --umi_tag" + echo " type: string" + echo " default: Auto" + echo " Tag for UMI: UR, Auto, None. For Auto mode, use UR if barcodes is" + echo " inputted, otherwise use None. None mode means no UMI but read counts." + echo "" + echo " --min_count" + echo " type: integer" + echo " default: 20" + echo " Minimum aggragated count." + echo "" + echo " --min_maf" + echo " type: double" + echo " default: 0.0" + echo " Minimum minor allele frequency." + echo "" + echo " --doublet_gl" + echo " type: boolean_true" + echo " If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5." + echo "" + echo " --incl_flag" + echo " type: string" + echo " Required flags: skip reads with all mask bits unset." + echo "" + echo " --excl_flag" + echo " type: string" + echo " Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL" + echo " (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]" + echo "" + echo " --count_orphan" + echo " type: boolean_true" + echo " If use, do not skip anomalous read pairs." + echo "" + echo " --min_mapq" + echo " type: integer" + echo " default: 20" + echo " Minimum MAPQ for read filtering." + echo "" + echo " --min_len" + echo " type: integer" + echo " default: 30" + echo " Minimum mapped length for read filtering." + echo "" + echo "Output:" + echo " --outDir, --output" + echo " type: file, output, file must exist" + echo " example: cellsnp_out" + echo " Output directory for VCF and sparse matrices." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellsnp main" + exit + ;; + --sam_file) + [ -n "$VIASH_PAR_SAM_FILE" ] && ViashError Bad arguments for option \'--sam_file\': \'$VIASH_PAR_SAM_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam_file=*) + [ -n "$VIASH_PAR_SAM_FILE" ] && ViashError Bad arguments for option \'--sam_file=*\': \'$VIASH_PAR_SAM_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sam_index_file) + [ -n "$VIASH_PAR_SAM_INDEX_FILE" ] && ViashError Bad arguments for option \'--sam_index_file\': \'$VIASH_PAR_SAM_INDEX_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_INDEX_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam_index_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam_index_file=*) + [ -n "$VIASH_PAR_SAM_INDEX_FILE" ] && ViashError Bad arguments for option \'--sam_index_file=*\': \'$VIASH_PAR_SAM_INDEX_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_INDEX_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sam_fileList) + [ -n "$VIASH_PAR_SAM_FILELIST" ] && ViashError Bad arguments for option \'--sam_fileList\': \'$VIASH_PAR_SAM_FILELIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_FILELIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam_fileList. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam_fileList=*) + [ -n "$VIASH_PAR_SAM_FILELIST" ] && ViashError Bad arguments for option \'--sam_fileList=*\': \'$VIASH_PAR_SAM_FILELIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_FILELIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --regions_vcf) + [ -n "$VIASH_PAR_REGIONS_VCF" ] && ViashError Bad arguments for option \'--regions_vcf\': \'$VIASH_PAR_REGIONS_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGIONS_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --regions_vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --regions_vcf=*) + [ -n "$VIASH_PAR_REGIONS_VCF" ] && ViashError Bad arguments for option \'--regions_vcf=*\': \'$VIASH_PAR_REGIONS_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGIONS_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --targets_vcf) + [ -n "$VIASH_PAR_TARGETS_VCF" ] && ViashError Bad arguments for option \'--targets_vcf\': \'$VIASH_PAR_TARGETS_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGETS_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --targets_vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --targets_vcf=*) + [ -n "$VIASH_PAR_TARGETS_VCF" ] && ViashError Bad arguments for option \'--targets_vcf=*\': \'$VIASH_PAR_TARGETS_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGETS_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --barcode_file) + [ -n "$VIASH_PAR_BARCODE_FILE" ] && ViashError Bad arguments for option \'--barcode_file\': \'$VIASH_PAR_BARCODE_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcode_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode_file=*) + [ -n "$VIASH_PAR_BARCODE_FILE" ] && ViashError Bad arguments for option \'--barcode_file=*\': \'$VIASH_PAR_BARCODE_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_list) + [ -n "$VIASH_PAR_SAMPLE_LIST" ] && ViashError Bad arguments for option \'--sample_list\': \'$VIASH_PAR_SAMPLE_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_list=*) + [ -n "$VIASH_PAR_SAMPLE_LIST" ] && ViashError Bad arguments for option \'--sample_list=*\': \'$VIASH_PAR_SAMPLE_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_ids) + [ -n "$VIASH_PAR_SAMPLE_IDS" ] && ViashError Bad arguments for option \'--sample_ids\': \'$VIASH_PAR_SAMPLE_IDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_IDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_ids=*) + [ -n "$VIASH_PAR_SAMPLE_IDS" ] && ViashError Bad arguments for option \'--sample_ids=*\': \'$VIASH_PAR_SAMPLE_IDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_IDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genotype) + [ -n "$VIASH_PAR_GENOTYPE" ] && ViashError Bad arguments for option \'--genotype\': \'$VIASH_PAR_GENOTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPE=true + shift 1 + ;; + --gzip) + [ -n "$VIASH_PAR_GZIP" ] && ViashError Bad arguments for option \'--gzip\': \'$VIASH_PAR_GZIP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GZIP=true + shift 1 + ;; + --print_skip_snps) + [ -n "$VIASH_PAR_PRINT_SKIP_SNPS" ] && ViashError Bad arguments for option \'--print_skip_snps\': \'$VIASH_PAR_PRINT_SKIP_SNPS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PRINT_SKIP_SNPS=true + shift 1 + ;; + --chrom) + [ -n "$VIASH_PAR_CHROM" ] && ViashError Bad arguments for option \'--chrom\': \'$VIASH_PAR_CHROM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHROM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chrom. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chrom=*) + [ -n "$VIASH_PAR_CHROM" ] && ViashError Bad arguments for option \'--chrom=*\': \'$VIASH_PAR_CHROM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHROM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_tag) + [ -n "$VIASH_PAR_CELL_TAG" ] && ViashError Bad arguments for option \'--cell_tag\': \'$VIASH_PAR_CELL_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_TAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_tag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_tag=*) + [ -n "$VIASH_PAR_CELL_TAG" ] && ViashError Bad arguments for option \'--cell_tag=*\': \'$VIASH_PAR_CELL_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_TAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --umi_tag) + [ -n "$VIASH_PAR_UMI_TAG" ] && ViashError Bad arguments for option \'--umi_tag\': \'$VIASH_PAR_UMI_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMI_TAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --umi_tag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --umi_tag=*) + [ -n "$VIASH_PAR_UMI_TAG" ] && ViashError Bad arguments for option \'--umi_tag=*\': \'$VIASH_PAR_UMI_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMI_TAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_count) + [ -n "$VIASH_PAR_MIN_COUNT" ] && ViashError Bad arguments for option \'--min_count\': \'$VIASH_PAR_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_count=*) + [ -n "$VIASH_PAR_MIN_COUNT" ] && ViashError Bad arguments for option \'--min_count=*\': \'$VIASH_PAR_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_maf) + [ -n "$VIASH_PAR_MIN_MAF" ] && ViashError Bad arguments for option \'--min_maf\': \'$VIASH_PAR_MIN_MAF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_maf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_maf=*) + [ -n "$VIASH_PAR_MIN_MAF" ] && ViashError Bad arguments for option \'--min_maf=*\': \'$VIASH_PAR_MIN_MAF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --doublet_gl) + [ -n "$VIASH_PAR_DOUBLET_GL" ] && ViashError Bad arguments for option \'--doublet_gl\': \'$VIASH_PAR_DOUBLET_GL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DOUBLET_GL=true + shift 1 + ;; + --incl_flag) + [ -n "$VIASH_PAR_INCL_FLAG" ] && ViashError Bad arguments for option \'--incl_flag\': \'$VIASH_PAR_INCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INCL_FLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --incl_flag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --incl_flag=*) + [ -n "$VIASH_PAR_INCL_FLAG" ] && ViashError Bad arguments for option \'--incl_flag=*\': \'$VIASH_PAR_INCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INCL_FLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --excl_flag) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --excl_flag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --excl_flag=*) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag=*\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --count_orphan) + [ -n "$VIASH_PAR_COUNT_ORPHAN" ] && ViashError Bad arguments for option \'--count_orphan\': \'$VIASH_PAR_COUNT_ORPHAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COUNT_ORPHAN=true + shift 1 + ;; + --min_mapq) + [ -n "$VIASH_PAR_MIN_MAPQ" ] && ViashError Bad arguments for option \'--min_mapq\': \'$VIASH_PAR_MIN_MAPQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAPQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mapq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mapq=*) + [ -n "$VIASH_PAR_MIN_MAPQ" ] && ViashError Bad arguments for option \'--min_mapq=*\': \'$VIASH_PAR_MIN_MAPQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAPQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_len) + [ -n "$VIASH_PAR_MIN_LEN" ] && ViashError Bad arguments for option \'--min_len\': \'$VIASH_PAR_MIN_LEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_LEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_len. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_len=*) + [ -n "$VIASH_PAR_MIN_LEN" ] && ViashError Bad arguments for option \'--min_len=*\': \'$VIASH_PAR_MIN_LEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_LEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outDir) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--outDir\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/cellsnp:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_GENOTYPE+x} ]; then + VIASH_PAR_GENOTYPE="false" +fi +if [ -z ${VIASH_PAR_GZIP+x} ]; then + VIASH_PAR_GZIP="false" +fi +if [ -z ${VIASH_PAR_PRINT_SKIP_SNPS+x} ]; then + VIASH_PAR_PRINT_SKIP_SNPS="false" +fi +if [ -z ${VIASH_PAR_CELL_TAG+x} ]; then + VIASH_PAR_CELL_TAG="CB" +fi +if [ -z ${VIASH_PAR_UMI_TAG+x} ]; then + VIASH_PAR_UMI_TAG="Auto" +fi +if [ -z ${VIASH_PAR_MIN_COUNT+x} ]; then + VIASH_PAR_MIN_COUNT="20" +fi +if [ -z ${VIASH_PAR_MIN_MAF+x} ]; then + VIASH_PAR_MIN_MAF="0.0" +fi +if [ -z ${VIASH_PAR_DOUBLET_GL+x} ]; then + VIASH_PAR_DOUBLET_GL="false" +fi +if [ -z ${VIASH_PAR_COUNT_ORPHAN+x} ]; then + VIASH_PAR_COUNT_ORPHAN="false" +fi +if [ -z ${VIASH_PAR_MIN_MAPQ+x} ]; then + VIASH_PAR_MIN_MAPQ="20" +fi +if [ -z ${VIASH_PAR_MIN_LEN+x} ]; then + VIASH_PAR_MIN_LEN="30" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_SAM_FILE" ] && [ ! -e "$VIASH_PAR_SAM_FILE" ]; then + ViashError "Input file '$VIASH_PAR_SAM_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAM_INDEX_FILE" ] && [ ! -e "$VIASH_PAR_SAM_INDEX_FILE" ]; then + ViashError "Input file '$VIASH_PAR_SAM_INDEX_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAM_FILELIST" ] && [ ! -e "$VIASH_PAR_SAM_FILELIST" ]; then + ViashError "Input file '$VIASH_PAR_SAM_FILELIST' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REGIONS_VCF" ] && [ ! -e "$VIASH_PAR_REGIONS_VCF" ]; then + ViashError "Input file '$VIASH_PAR_REGIONS_VCF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TARGETS_VCF" ] && [ ! -e "$VIASH_PAR_TARGETS_VCF" ]; then + ViashError "Input file '$VIASH_PAR_TARGETS_VCF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BARCODE_FILE" ] && [ ! -e "$VIASH_PAR_BARCODE_FILE" ]; then + ViashError "Input file '$VIASH_PAR_BARCODE_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_LIST" ] && [ ! -e "$VIASH_PAR_SAMPLE_LIST" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLE_LIST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_GENOTYPE" ]]; then + if ! [[ "$VIASH_PAR_GENOTYPE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--genotype' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GZIP" ]]; then + if ! [[ "$VIASH_PAR_GZIP" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--gzip' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PRINT_SKIP_SNPS" ]]; then + if ! [[ "$VIASH_PAR_PRINT_SKIP_SNPS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--print_skip_snps' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_COUNT" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MAF" ]]; then + if ! [[ "$VIASH_PAR_MIN_MAF" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_maf' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DOUBLET_GL" ]]; then + if ! [[ "$VIASH_PAR_DOUBLET_GL" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--doublet_gl' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COUNT_ORPHAN" ]]; then + if ! [[ "$VIASH_PAR_COUNT_ORPHAN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--count_orphan' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MAPQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_MAPQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_mapq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_LEN" ]]; then + if ! [[ "$VIASH_PAR_MIN_LEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_len' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_SAM_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAM_FILE")" ) + VIASH_PAR_SAM_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_SAM_FILE") +fi +if [ ! -z "$VIASH_PAR_SAM_INDEX_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAM_INDEX_FILE")" ) + VIASH_PAR_SAM_INDEX_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_SAM_INDEX_FILE") +fi +if [ ! -z "$VIASH_PAR_SAM_FILELIST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAM_FILELIST")" ) + VIASH_PAR_SAM_FILELIST=$(ViashDockerAutodetectMount "$VIASH_PAR_SAM_FILELIST") +fi +if [ ! -z "$VIASH_PAR_REGIONS_VCF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REGIONS_VCF")" ) + VIASH_PAR_REGIONS_VCF=$(ViashDockerAutodetectMount "$VIASH_PAR_REGIONS_VCF") +fi +if [ ! -z "$VIASH_PAR_TARGETS_VCF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TARGETS_VCF")" ) + VIASH_PAR_TARGETS_VCF=$(ViashDockerAutodetectMount "$VIASH_PAR_TARGETS_VCF") +fi +if [ ! -z "$VIASH_PAR_BARCODE_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODE_FILE")" ) + VIASH_PAR_BARCODE_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODE_FILE") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_LIST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_LIST")" ) + VIASH_PAR_SAMPLE_LIST=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_LIST") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellsnp-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_SAM_FILE+x} ]; then echo "${VIASH_PAR_SAM_FILE}" | sed "s#'#'\"'\"'#g;s#.*#par_sam_file='&'#" ; else echo "# par_sam_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_INDEX_FILE+x} ]; then echo "${VIASH_PAR_SAM_INDEX_FILE}" | sed "s#'#'\"'\"'#g;s#.*#par_sam_index_file='&'#" ; else echo "# par_sam_index_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_FILELIST+x} ]; then echo "${VIASH_PAR_SAM_FILELIST}" | sed "s#'#'\"'\"'#g;s#.*#par_sam_fileList='&'#" ; else echo "# par_sam_fileList="; fi ) +$( if [ ! -z ${VIASH_PAR_REGIONS_VCF+x} ]; then echo "${VIASH_PAR_REGIONS_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_regions_vcf='&'#" ; else echo "# par_regions_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_TARGETS_VCF+x} ]; then echo "${VIASH_PAR_TARGETS_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_targets_vcf='&'#" ; else echo "# par_targets_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE_FILE+x} ]; then echo "${VIASH_PAR_BARCODE_FILE}" | sed "s#'#'\"'\"'#g;s#.*#par_barcode_file='&'#" ; else echo "# par_barcode_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_LIST+x} ]; then echo "${VIASH_PAR_SAMPLE_LIST}" | sed "s#'#'\"'\"'#g;s#.*#par_sample_list='&'#" ; else echo "# par_sample_list="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_IDS+x} ]; then echo "${VIASH_PAR_SAMPLE_IDS}" | sed "s#'#'\"'\"'#g;s#.*#par_sample_ids='&'#" ; else echo "# par_sample_ids="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE+x} ]; then echo "${VIASH_PAR_GENOTYPE}" | sed "s#'#'\"'\"'#g;s#.*#par_genotype='&'#" ; else echo "# par_genotype="; fi ) +$( if [ ! -z ${VIASH_PAR_GZIP+x} ]; then echo "${VIASH_PAR_GZIP}" | sed "s#'#'\"'\"'#g;s#.*#par_gzip='&'#" ; else echo "# par_gzip="; fi ) +$( if [ ! -z ${VIASH_PAR_PRINT_SKIP_SNPS+x} ]; then echo "${VIASH_PAR_PRINT_SKIP_SNPS}" | sed "s#'#'\"'\"'#g;s#.*#par_print_skip_snps='&'#" ; else echo "# par_print_skip_snps="; fi ) +$( if [ ! -z ${VIASH_PAR_CHROM+x} ]; then echo "${VIASH_PAR_CHROM}" | sed "s#'#'\"'\"'#g;s#.*#par_chrom='&'#" ; else echo "# par_chrom="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_TAG+x} ]; then echo "${VIASH_PAR_CELL_TAG}" | sed "s#'#'\"'\"'#g;s#.*#par_cell_tag='&'#" ; else echo "# par_cell_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_UMI_TAG+x} ]; then echo "${VIASH_PAR_UMI_TAG}" | sed "s#'#'\"'\"'#g;s#.*#par_umi_tag='&'#" ; else echo "# par_umi_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_COUNT+x} ]; then echo "${VIASH_PAR_MIN_COUNT}" | sed "s#'#'\"'\"'#g;s#.*#par_min_count='&'#" ; else echo "# par_min_count="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAF+x} ]; then echo "${VIASH_PAR_MIN_MAF}" | sed "s#'#'\"'\"'#g;s#.*#par_min_maf='&'#" ; else echo "# par_min_maf="; fi ) +$( if [ ! -z ${VIASH_PAR_DOUBLET_GL+x} ]; then echo "${VIASH_PAR_DOUBLET_GL}" | sed "s#'#'\"'\"'#g;s#.*#par_doublet_gl='&'#" ; else echo "# par_doublet_gl="; fi ) +$( if [ ! -z ${VIASH_PAR_INCL_FLAG+x} ]; then echo "${VIASH_PAR_INCL_FLAG}" | sed "s#'#'\"'\"'#g;s#.*#par_incl_flag='&'#" ; else echo "# par_incl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo "${VIASH_PAR_EXCL_FLAG}" | sed "s#'#'\"'\"'#g;s#.*#par_excl_flag='&'#" ; else echo "# par_excl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_COUNT_ORPHAN+x} ]; then echo "${VIASH_PAR_COUNT_ORPHAN}" | sed "s#'#'\"'\"'#g;s#.*#par_count_orphan='&'#" ; else echo "# par_count_orphan="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAPQ+x} ]; then echo "${VIASH_PAR_MIN_MAPQ}" | sed "s#'#'\"'\"'#g;s#.*#par_min_mapq='&'#" ; else echo "# par_min_mapq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_LEN+x} ]; then echo "${VIASH_PAR_MIN_LEN}" | sed "s#'#'\"'\"'#g;s#.*#par_min_len='&'#" ; else echo "# par_min_len="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\$par_genotype" == "false" ]] && unset par_genotype +[[ "\$par_gzip" == "false" ]] && unset par_gzip +[[ "\$par_print_skip_snps" == "false" ]] && unset par_print_skip_snps +[[ "\$par_doublet_gl" == "false" ]] && unset par_doublet_gl +[[ "\$par_count_orphan" == "false" ]] && unset par_count_orphan + +cellsnp-lite \\ + \${meta_cpus:+--nproc \$meta_cpus} \\ + --cellTAG \$par_cell_tag \\ + --UMItag \$par_umi_tag \\ + --minCOUNT \$par_min_count \\ + --minMAF \$par_min_maf \\ + --minLEN \$par_min_len \\ + --minMAPQ \$par_min_mapq \\ + --outDir \$par_output \\ + \${par_sam_file:+--samFile \$par_sam_file} \\ + \${par_sam_fileList:+--samFileList \$par_sam_fileList} \\ + \${par_regions_vcf:+--regionsVCF \$par_regions_vcf} \\ + \${par_targets_vcf:+--targetsVCF \$par_targets_vcf} \\ + \${par_barcode_file:+--barcodeFile \$par_barcode_file} \\ + \${par_sample_list:+--sampleList \$par_sample_list} \\ + \${par_sample_ids:+--sampleIDs \$par_sample_ids} \\ + \${par_genotype:+--genotype} \\ + \${par_gzip:+--gzip} \\ + \${par_print_skip_snps:+--printSkipSNPs} \\ + \${par_chrom:+--chrom \$par_chrom} \\ + \${par_doublet_gl:+--doubletGL} \\ + \${par_incl_flag:+--inclFLAG \$par_incl_flag} \\ + \${par_excl_flag:+--exclFLAG \$par_excl_flag} \\ + \${par_count_orphan:+--countORPHAN} +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_SAM_FILE" ]; then + VIASH_PAR_SAM_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_SAM_FILE") + fi + if [ ! -z "$VIASH_PAR_SAM_INDEX_FILE" ]; then + VIASH_PAR_SAM_INDEX_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_SAM_INDEX_FILE") + fi + if [ ! -z "$VIASH_PAR_SAM_FILELIST" ]; then + VIASH_PAR_SAM_FILELIST=$(ViashDockerStripAutomount "$VIASH_PAR_SAM_FILELIST") + fi + if [ ! -z "$VIASH_PAR_REGIONS_VCF" ]; then + VIASH_PAR_REGIONS_VCF=$(ViashDockerStripAutomount "$VIASH_PAR_REGIONS_VCF") + fi + if [ ! -z "$VIASH_PAR_TARGETS_VCF" ]; then + VIASH_PAR_TARGETS_VCF=$(ViashDockerStripAutomount "$VIASH_PAR_TARGETS_VCF") + fi + if [ ! -z "$VIASH_PAR_BARCODE_FILE" ]; then + VIASH_PAR_BARCODE_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODE_FILE") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_LIST" ]; then + VIASH_PAR_SAMPLE_LIST=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_LIST") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/cellsnp/nextflow_labels.config b/target/executable/genetic_demux/cellsnp/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/cellsnp/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/demuxlet/.config.vsh.yaml b/target/executable/genetic_demux/demuxlet/.config.vsh.yaml new file mode 100644 index 00000000..33b1719d --- /dev/null +++ b/target/executable/genetic_demux/demuxlet/.config.vsh.yaml @@ -0,0 +1,504 @@ +name: "demuxlet" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam" + description: "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_group" + description: "Tag representing readgroup or cell barcodes, in the case to partition\ + \ the BAM file into multiple groups. For 10x genomics, use CB." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_umi" + description: "Tag representing UMIs. For 10x genomiucs, use UB." + info: null + default: + - "UB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--plp" + description: "Input pileup format. If the value is a string, it will be considered\ + \ as the path of the plp file. If the value is boolean true, it will perform\ + \ dscpileup." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf" + description: "Input VCF/BCF file, containing the individual genotypes (GT), posterior\ + \ probability (GP), or genotype likelihood (PL)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--field" + description: "FORMAT field to extract the genotype, likelihood, or posterior from" + info: null + default: + - "GT" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error_offset" + description: "Offset of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error_coeff" + description: "Slope of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--r2_info" + description: "INFO field name representing R2 value. Used for representing imputation\ + \ quality." + info: null + default: + - "R2" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mac" + description: "Minimum minor allele frequency." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_call_rate" + description: "Minimum call rate." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alpha" + description: "Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5)" + info: null + default: + - "0.5" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--doublet_prior" + description: "Prior of doublet" + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm" + description: "List of sample IDs to compare to (default: use all)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm_list" + description: "File containing the list of sample IDs to compare." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sam_verbose" + description: "Verbose message frequency for SAM/BAM/CRAM." + info: null + default: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vcf_verbose" + description: "Verbose message frequency for VCF/BCF." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mq" + description: "Minimum mapping quality to consider (lower MQ will be ignored)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_td" + description: "Minimum distance to the tail (lower will be ignored)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--excl_flag" + description: "SAM/BAM FLAGs to be excluded." + info: null + default: + - 3844 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_umi" + description: "Minimum number of UMIs for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "demux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "demuxlet output file prefix" + info: null + example: + - "demuxlet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "demuxlet.patch" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demuxlet is a software tool to deconvolute sample identity and identify\ + \ multiplets when\nmultiple samples are pooled by barcoded single cell sequencing.\ + \ If external genotyping data\nfor each sample is available (e.g. from SNP arrays),\ + \ demuxlet would be recommended. Be careful\nthat the parameters on the github is\ + \ not in line with the newest help version.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + copy: + - "demuxlet.patch /opt/demuxlet.patch" + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build\ + \ && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle\ + \ /usr/local/bin" + - type: "r" + cran: + - "readr" + - "processx" + - "dplyr" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/demuxlet/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/demuxlet" + executable: "target/executable/genetic_demux/demuxlet/demuxlet" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/demuxlet/demuxlet b/target/executable/genetic_demux/demuxlet/demuxlet new file mode 100755 index 00000000..5f0c1c95 --- /dev/null +++ b/target/executable/genetic_demux/demuxlet/demuxlet @@ -0,0 +1,1823 @@ +#!/usr/bin/env bash + +# demuxlet main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="demuxlet" +VIASH_META_FUNCTIONALITY_NAME="demuxlet" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:22.04 +ENTRYPOINT [] +COPY demuxlet.patch /opt/demuxlet.patch +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y autoconf wget git build-essential libcurl4-openssl-dev cmake libbz2-dev libssl-dev liblzma-dev zlib1g-dev r-base && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install +RUN git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("readr", "processx", "dplyr"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux demuxlet" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "demuxlet main" + echo "" + echo "Demuxlet is a software tool to deconvolute sample identity and identify" + echo "multiplets when" + echo "multiple samples are pooled by barcoded single cell sequencing. If external" + echo "genotyping data" + echo "for each sample is available (e.g. from SNP arrays), demuxlet would be" + echo "recommended. Be careful" + echo "that the parameters on the github is not in line with the newest help version." + echo "" + echo "Input:" + echo " --sam" + echo " type: file, file must exist" + echo " Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + echo "" + echo " --tag_group" + echo " type: string" + echo " default: CB" + echo " Tag representing readgroup or cell barcodes, in the case to partition" + echo " the BAM file into multiple groups. For 10x genomics, use CB." + echo "" + echo " --tag_umi" + echo " type: string" + echo " default: UB" + echo " Tag representing UMIs. For 10x genomiucs, use UB." + echo "" + echo " --plp" + echo " type: string" + echo " Input pileup format. If the value is a string, it will be considered as" + echo " the path of the plp file. If the value is boolean true, it will perform" + echo " dscpileup." + echo "" + echo " --vcf" + echo " type: file, file must exist" + echo " Input VCF/BCF file, containing the individual genotypes (GT), posterior" + echo " probability (GP), or genotype likelihood (PL)." + echo "" + echo " --field" + echo " type: string" + echo " default: GT" + echo " FORMAT field to extract the genotype, likelihood, or posterior from" + echo "" + echo " --geno_error_offset" + echo " type: double" + echo " default: 0.1" + echo " Offset of genotype error rate. [error] = [offset] +" + echo " [1-offset]*[coeff]*[1-r2]" + echo "" + echo " --geno_error_coeff" + echo " type: double" + echo " default: 0.0" + echo " Slope of genotype error rate. [error] = [offset] +" + echo " [1-offset]*[coeff]*[1-r2]" + echo "" + echo " --r2_info" + echo " type: string" + echo " default: R2" + echo " INFO field name representing R2 value. Used for representing imputation" + echo " quality." + echo "" + echo " --min_mac" + echo " type: integer" + echo " default: 1" + echo " Minimum minor allele frequency." + echo "" + echo " --min_call_rate" + echo " type: double" + echo " default: 0.5" + echo " Minimum call rate." + echo "" + echo " --alpha" + echo " type: string" + echo " default: 0.5" + echo " Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5)" + echo "" + echo " --doublet_prior" + echo " type: double" + echo " default: 0.5" + echo " Prior of doublet" + echo "" + echo " --sm" + echo " type: string" + echo " List of sample IDs to compare to (default: use all)." + echo "" + echo " --sm_list" + echo " type: string" + echo " File containing the list of sample IDs to compare." + echo "" + echo " --sam_verbose" + echo " type: integer" + echo " default: 1000000" + echo " Verbose message frequency for SAM/BAM/CRAM." + echo "" + echo " --vcf_verbose" + echo " type: integer" + echo " default: 1000" + echo " Verbose message frequency for VCF/BCF." + echo "" + echo " --cap_bq" + echo " type: integer" + echo " default: 20" + echo " Maximum base quality (higher BQ will be capped)." + echo "" + echo " --min_bq" + echo " type: integer" + echo " default: 13" + echo " Minimum base quality to consider (lower BQ will be skipped)." + echo "" + echo " --min_mq" + echo " type: integer" + echo " default: 20" + echo " Minimum mapping quality to consider (lower MQ will be ignored)." + echo "" + echo " --min_td" + echo " type: integer" + echo " default: 0" + echo " Minimum distance to the tail (lower will be ignored)." + echo "" + echo " --excl_flag" + echo " type: integer" + echo " default: 3844" + echo " SAM/BAM FLAGs to be excluded." + echo "" + echo " --group_list" + echo " type: string" + echo " List of tag readgroup/cell barcode to consider in this run. All other" + echo " barcodes will be ignored. This is useful for parallelized run." + echo "" + echo " --min_total" + echo " type: integer" + echo " default: 0" + echo " Minimum number of total reads for a droplet/cell to be considered." + echo "" + echo " --min_snp" + echo " type: integer" + echo " default: 0" + echo " Minimum number of SNPs with coverage for a droplet/cell to be" + echo " considered." + echo "" + echo " --min_umi" + echo " type: integer" + echo " default: 0" + echo " Minimum number of UMIs for a droplet/cell to be considered." + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: demux" + echo " Output directory" + echo "" + echo " --out" + echo " type: string" + echo " example: demuxlet" + echo " demuxlet output file prefix" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "demuxlet main" + exit + ;; + --sam) + [ -n "$VIASH_PAR_SAM" ] && ViashError Bad arguments for option \'--sam\': \'$VIASH_PAR_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam=*) + [ -n "$VIASH_PAR_SAM" ] && ViashError Bad arguments for option \'--sam=*\': \'$VIASH_PAR_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag_group) + [ -n "$VIASH_PAR_TAG_GROUP" ] && ViashError Bad arguments for option \'--tag_group\': \'$VIASH_PAR_TAG_GROUP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_GROUP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag_group. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag_group=*) + [ -n "$VIASH_PAR_TAG_GROUP" ] && ViashError Bad arguments for option \'--tag_group=*\': \'$VIASH_PAR_TAG_GROUP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_GROUP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag_umi) + [ -n "$VIASH_PAR_TAG_UMI" ] && ViashError Bad arguments for option \'--tag_umi\': \'$VIASH_PAR_TAG_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag_umi=*) + [ -n "$VIASH_PAR_TAG_UMI" ] && ViashError Bad arguments for option \'--tag_umi=*\': \'$VIASH_PAR_TAG_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --plp) + [ -n "$VIASH_PAR_PLP" ] && ViashError Bad arguments for option \'--plp\': \'$VIASH_PAR_PLP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --plp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --plp=*) + [ -n "$VIASH_PAR_PLP" ] && ViashError Bad arguments for option \'--plp=*\': \'$VIASH_PAR_PLP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vcf) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf=*) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf=*\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --field) + [ -n "$VIASH_PAR_FIELD" ] && ViashError Bad arguments for option \'--field\': \'$VIASH_PAR_FIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FIELD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --field. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --field=*) + [ -n "$VIASH_PAR_FIELD" ] && ViashError Bad arguments for option \'--field=*\': \'$VIASH_PAR_FIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FIELD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --geno_error_offset) + [ -n "$VIASH_PAR_GENO_ERROR_OFFSET" ] && ViashError Bad arguments for option \'--geno_error_offset\': \'$VIASH_PAR_GENO_ERROR_OFFSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR_OFFSET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --geno_error_offset. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --geno_error_offset=*) + [ -n "$VIASH_PAR_GENO_ERROR_OFFSET" ] && ViashError Bad arguments for option \'--geno_error_offset=*\': \'$VIASH_PAR_GENO_ERROR_OFFSET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR_OFFSET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --geno_error_coeff) + [ -n "$VIASH_PAR_GENO_ERROR_COEFF" ] && ViashError Bad arguments for option \'--geno_error_coeff\': \'$VIASH_PAR_GENO_ERROR_COEFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR_COEFF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --geno_error_coeff. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --geno_error_coeff=*) + [ -n "$VIASH_PAR_GENO_ERROR_COEFF" ] && ViashError Bad arguments for option \'--geno_error_coeff=*\': \'$VIASH_PAR_GENO_ERROR_COEFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR_COEFF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --r2_info) + [ -n "$VIASH_PAR_R2_INFO" ] && ViashError Bad arguments for option \'--r2_info\': \'$VIASH_PAR_R2_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R2_INFO="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --r2_info. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --r2_info=*) + [ -n "$VIASH_PAR_R2_INFO" ] && ViashError Bad arguments for option \'--r2_info=*\': \'$VIASH_PAR_R2_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R2_INFO=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_mac) + [ -n "$VIASH_PAR_MIN_MAC" ] && ViashError Bad arguments for option \'--min_mac\': \'$VIASH_PAR_MIN_MAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mac. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mac=*) + [ -n "$VIASH_PAR_MIN_MAC" ] && ViashError Bad arguments for option \'--min_mac=*\': \'$VIASH_PAR_MIN_MAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_call_rate) + [ -n "$VIASH_PAR_MIN_CALL_RATE" ] && ViashError Bad arguments for option \'--min_call_rate\': \'$VIASH_PAR_MIN_CALL_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CALL_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_call_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_call_rate=*) + [ -n "$VIASH_PAR_MIN_CALL_RATE" ] && ViashError Bad arguments for option \'--min_call_rate=*\': \'$VIASH_PAR_MIN_CALL_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CALL_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alpha=*) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha=*\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --doublet_prior) + [ -n "$VIASH_PAR_DOUBLET_PRIOR" ] && ViashError Bad arguments for option \'--doublet_prior\': \'$VIASH_PAR_DOUBLET_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DOUBLET_PRIOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --doublet_prior. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --doublet_prior=*) + [ -n "$VIASH_PAR_DOUBLET_PRIOR" ] && ViashError Bad arguments for option \'--doublet_prior=*\': \'$VIASH_PAR_DOUBLET_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DOUBLET_PRIOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sm) + [ -n "$VIASH_PAR_SM" ] && ViashError Bad arguments for option \'--sm\': \'$VIASH_PAR_SM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sm=*) + [ -n "$VIASH_PAR_SM" ] && ViashError Bad arguments for option \'--sm=*\': \'$VIASH_PAR_SM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sm_list) + [ -n "$VIASH_PAR_SM_LIST" ] && ViashError Bad arguments for option \'--sm_list\': \'$VIASH_PAR_SM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sm_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sm_list=*) + [ -n "$VIASH_PAR_SM_LIST" ] && ViashError Bad arguments for option \'--sm_list=*\': \'$VIASH_PAR_SM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sam_verbose) + [ -n "$VIASH_PAR_SAM_VERBOSE" ] && ViashError Bad arguments for option \'--sam_verbose\': \'$VIASH_PAR_SAM_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_VERBOSE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam_verbose. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam_verbose=*) + [ -n "$VIASH_PAR_SAM_VERBOSE" ] && ViashError Bad arguments for option \'--sam_verbose=*\': \'$VIASH_PAR_SAM_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_VERBOSE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vcf_verbose) + [ -n "$VIASH_PAR_VCF_VERBOSE" ] && ViashError Bad arguments for option \'--vcf_verbose\': \'$VIASH_PAR_VCF_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_VERBOSE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf_verbose. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf_verbose=*) + [ -n "$VIASH_PAR_VCF_VERBOSE" ] && ViashError Bad arguments for option \'--vcf_verbose=*\': \'$VIASH_PAR_VCF_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_VERBOSE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cap_bq) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cap_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cap_bq=*) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq=*\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_bq) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_bq=*) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq=*\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_mq) + [ -n "$VIASH_PAR_MIN_MQ" ] && ViashError Bad arguments for option \'--min_mq\': \'$VIASH_PAR_MIN_MQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mq=*) + [ -n "$VIASH_PAR_MIN_MQ" ] && ViashError Bad arguments for option \'--min_mq=*\': \'$VIASH_PAR_MIN_MQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_td) + [ -n "$VIASH_PAR_MIN_TD" ] && ViashError Bad arguments for option \'--min_td\': \'$VIASH_PAR_MIN_TD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_td. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_td=*) + [ -n "$VIASH_PAR_MIN_TD" ] && ViashError Bad arguments for option \'--min_td=*\': \'$VIASH_PAR_MIN_TD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --excl_flag) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --excl_flag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --excl_flag=*) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag=*\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --group_list) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --group_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --group_list=*) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list=*\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_total) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_total. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_total=*) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total=*\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_snp) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_snp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_snp=*) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp=*\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_umi) + [ -n "$VIASH_PAR_MIN_UMI" ] && ViashError Bad arguments for option \'--min_umi\': \'$VIASH_PAR_MIN_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_umi=*) + [ -n "$VIASH_PAR_MIN_UMI" ] && ViashError Bad arguments for option \'--min_umi=*\': \'$VIASH_PAR_MIN_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --out. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out=*) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out=*\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/demuxlet:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_TAG_GROUP+x} ]; then + VIASH_PAR_TAG_GROUP="CB" +fi +if [ -z ${VIASH_PAR_TAG_UMI+x} ]; then + VIASH_PAR_TAG_UMI="UB" +fi +if [ -z ${VIASH_PAR_FIELD+x} ]; then + VIASH_PAR_FIELD="GT" +fi +if [ -z ${VIASH_PAR_GENO_ERROR_OFFSET+x} ]; then + VIASH_PAR_GENO_ERROR_OFFSET="0.1" +fi +if [ -z ${VIASH_PAR_GENO_ERROR_COEFF+x} ]; then + VIASH_PAR_GENO_ERROR_COEFF="0.0" +fi +if [ -z ${VIASH_PAR_R2_INFO+x} ]; then + VIASH_PAR_R2_INFO="R2" +fi +if [ -z ${VIASH_PAR_MIN_MAC+x} ]; then + VIASH_PAR_MIN_MAC="1" +fi +if [ -z ${VIASH_PAR_MIN_CALL_RATE+x} ]; then + VIASH_PAR_MIN_CALL_RATE="0.5" +fi +if [ -z ${VIASH_PAR_ALPHA+x} ]; then + VIASH_PAR_ALPHA="0.5" +fi +if [ -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then + VIASH_PAR_DOUBLET_PRIOR="0.5" +fi +if [ -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then + VIASH_PAR_SAM_VERBOSE="1000000" +fi +if [ -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then + VIASH_PAR_VCF_VERBOSE="1000" +fi +if [ -z ${VIASH_PAR_CAP_BQ+x} ]; then + VIASH_PAR_CAP_BQ="20" +fi +if [ -z ${VIASH_PAR_MIN_BQ+x} ]; then + VIASH_PAR_MIN_BQ="13" +fi +if [ -z ${VIASH_PAR_MIN_MQ+x} ]; then + VIASH_PAR_MIN_MQ="20" +fi +if [ -z ${VIASH_PAR_MIN_TD+x} ]; then + VIASH_PAR_MIN_TD="0" +fi +if [ -z ${VIASH_PAR_EXCL_FLAG+x} ]; then + VIASH_PAR_EXCL_FLAG="3844" +fi +if [ -z ${VIASH_PAR_MIN_TOTAL+x} ]; then + VIASH_PAR_MIN_TOTAL="0" +fi +if [ -z ${VIASH_PAR_MIN_SNP+x} ]; then + VIASH_PAR_MIN_SNP="0" +fi +if [ -z ${VIASH_PAR_MIN_UMI+x} ]; then + VIASH_PAR_MIN_UMI="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_SAM" ] && [ ! -e "$VIASH_PAR_SAM" ]; then + ViashError "Input file '$VIASH_PAR_SAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VCF" ] && [ ! -e "$VIASH_PAR_VCF" ]; then + ViashError "Input file '$VIASH_PAR_VCF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_GENO_ERROR_OFFSET" ]]; then + if ! [[ "$VIASH_PAR_GENO_ERROR_OFFSET" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--geno_error_offset' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENO_ERROR_COEFF" ]]; then + if ! [[ "$VIASH_PAR_GENO_ERROR_COEFF" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--geno_error_coeff' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MAC" ]]; then + if ! [[ "$VIASH_PAR_MIN_MAC" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_mac' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CALL_RATE" ]]; then + if ! [[ "$VIASH_PAR_MIN_CALL_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_call_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DOUBLET_PRIOR" ]]; then + if ! [[ "$VIASH_PAR_DOUBLET_PRIOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--doublet_prior' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SAM_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_SAM_VERBOSE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sam_verbose' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VCF_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_VCF_VERBOSE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--vcf_verbose' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CAP_BQ" ]]; then + if ! [[ "$VIASH_PAR_CAP_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cap_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_BQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_MQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_mq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_TD" ]]; then + if ! [[ "$VIASH_PAR_MIN_TD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_td' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXCL_FLAG" ]]; then + if ! [[ "$VIASH_PAR_EXCL_FLAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--excl_flag' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_TOTAL" ]]; then + if ! [[ "$VIASH_PAR_MIN_TOTAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_total' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SNP" ]]; then + if ! [[ "$VIASH_PAR_MIN_SNP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_snp' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_UMI" ]]; then + if ! [[ "$VIASH_PAR_MIN_UMI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_umi' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_SAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAM")" ) + VIASH_PAR_SAM=$(ViashDockerAutodetectMount "$VIASH_PAR_SAM") +fi +if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VCF")" ) + VIASH_PAR_VCF=$(ViashDockerAutodetectMount "$VIASH_PAR_VCF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-demuxlet-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "sam" = $( if [ ! -z ${VIASH_PAR_SAM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SAM" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "tag_group" = $( if [ ! -z ${VIASH_PAR_TAG_GROUP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_TAG_GROUP" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "tag_umi" = $( if [ ! -z ${VIASH_PAR_TAG_UMI+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_TAG_UMI" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "plp" = $( if [ ! -z ${VIASH_PAR_PLP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PLP" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "vcf" = $( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_VCF" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "field" = $( if [ ! -z ${VIASH_PAR_FIELD+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_FIELD" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "geno_error_offset" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR_OFFSET+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR_OFFSET" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "geno_error_coeff" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR_COEFF+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR_COEFF" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "r2_info" = $( if [ ! -z ${VIASH_PAR_R2_INFO+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_R2_INFO" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_mac" = $( if [ ! -z ${VIASH_PAR_MIN_MAC+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_MAC" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_call_rate" = $( if [ ! -z ${VIASH_PAR_MIN_CALL_RATE+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_MIN_CALL_RATE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "alpha" = $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ALPHA" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "doublet_prior" = $( if [ ! -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_DOUBLET_PRIOR" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "sm" = $( if [ ! -z ${VIASH_PAR_SM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SM" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "sm_list" = $( if [ ! -z ${VIASH_PAR_SM_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SM_LIST" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "sam_verbose" = $( if [ ! -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_SAM_VERBOSE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "vcf_verbose" = $( if [ ! -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_VCF_VERBOSE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "cap_bq" = $( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_CAP_BQ" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_bq" = $( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_BQ" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_mq" = $( if [ ! -z ${VIASH_PAR_MIN_MQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_MQ" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_td" = $( if [ ! -z ${VIASH_PAR_MIN_TD+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TD" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "excl_flag" = $( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_EXCL_FLAG" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "group_list" = $( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_LIST" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_total" = $( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TOTAL" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_snp" = $( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_SNP" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_umi" = $( if [ ! -z ${VIASH_PAR_MIN_UMI+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_UMI" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "out" = $( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (!dir.exists(par\$output)) { + dir.create(par\$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "demuxlet", + "--out", paste0(par\$output, "/", par\$out) +) + +argmap <- c( + "tag_group" = "--tag-group", + "tag_umi" = "--tag-UMI", + "field" = "--field", + "geno_error_offset" = "--geno-error-offset", + "geno_error_coeff" = "--geno-error-coeff", + "r2_info" = "--r2-info", + "min_mac" = "--min-mac", + "min_call_rate" = "--min-callrate", + "alpha" = "--alpha", + "doublet_prior" = "--doublet-prior", + "sm" = "--sm", + "sm_list" = "--sm-list", + "sam_verbose" = "--sam-verbose", + "vcf_verbose" = "--vcf-verbose", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_mq" = "--min-MQ", + "min_td" = "--min-TD", + "excl_flag" = "--excl-flag", + "group_list" = "--group-list", + "min_total" = "--min-total", + "min_snp" = "--min-snp", + "min_umi" = "--min-umi", + "plp" = "--plp", + "vcf" = "--vcf", + "sam" = "--sam", + "sm" = "--sm", + "sm_list" = "--sm-list", + "group_list" = "--group-list" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz\$status != 0) { + stop("Command failed with status ", zzz\$status) +} + +out_file <- paste0(par\$output, "/", par\$out, ".best") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*),.*", "\\\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*),.*", "\\\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +demuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + demuxlet_assign, + paste0(par\$output, "/cell_annotation.csv") +) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_SAM" ]; then + VIASH_PAR_SAM=$(ViashDockerStripAutomount "$VIASH_PAR_SAM") + fi + if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_PAR_VCF=$(ViashDockerStripAutomount "$VIASH_PAR_VCF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/demuxlet/demuxlet.patch b/target/executable/genetic_demux/demuxlet/demuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/target/executable/genetic_demux/demuxlet/demuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/target/executable/genetic_demux/demuxlet/nextflow_labels.config b/target/executable/genetic_demux/demuxlet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/demuxlet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/dsc_pileup/.config.vsh.yaml b/target/executable/genetic_demux/dsc_pileup/.config.vsh.yaml new file mode 100644 index 00000000..30d56fd8 --- /dev/null +++ b/target/executable/genetic_demux/dsc_pileup/.config.vsh.yaml @@ -0,0 +1,416 @@ +name: "dsc_pileup" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam" + description: "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_group" + description: "Tag representing readgroup or cell barcodes, in the case to partition\ + \ the BAM file into multiple groups. For 10x genomics, use CB." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_umi" + description: "Tag representing UMIs. For 10x genomiucs, use UB." + info: null + default: + - "UB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--exclude_flag" + description: "SAM/BAM FLAGs to be excluded." + info: null + default: + - 1796 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf" + description: "Input VCF/BCF file for dsc-pileup, containing the AC and AN field." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm" + description: "List of sample IDs to compare to (default: use all)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm_list" + description: "File containing the list of sample IDs to compare." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sam_verbose" + description: "Verbose message frequency for SAM/BAM/CRAM." + info: null + default: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vcf_verbose" + description: "Verbose message frequency for VCF/BCF." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_umi" + description: "Do not generate [prefix].umi.gz file, which stores the regions covered\ + \ by each barcode/UMI pair." + info: null + direction: "input" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 40 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mq" + description: "Minimum mapping quality to consider (lower MQ will be ignored)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_td" + description: "Minimum distance to the tail (lower will be ignored)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--excl_flag" + description: "SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering\ + \ Options." + info: null + default: + - 3844 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_uniq" + description: "Minimum number of unique reads (determined by UMI/SNP pair) for\ + \ a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "demux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "dsc-pileup output file prefix" + info: null + example: + - "demuxlet_dsc" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Dsc-pileup is a software tool to pileup reads and corresponding base\ + \ quality \nfor each overlapping SNPs and each barcode. By using pileup files,\n\ + it would allow us to run demuxlet/freemuxlet pretty fast multiple times\nwithout\ + \ going over the BAM file again.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:20.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make &&\ + \ cp /tmp/popscle/bin/popscle /usr/local/bin" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/dsc_pileup/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/dsc_pileup" + executable: "target/executable/genetic_demux/dsc_pileup/dsc_pileup" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/dsc_pileup/dsc_pileup b/target/executable/genetic_demux/dsc_pileup/dsc_pileup new file mode 100755 index 00000000..1c113a3b --- /dev/null +++ b/target/executable/genetic_demux/dsc_pileup/dsc_pileup @@ -0,0 +1,1583 @@ +#!/usr/bin/env bash + +# dsc_pileup main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="dsc_pileup" +VIASH_META_FUNCTIONALITY_NAME="dsc_pileup" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:20.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y autoconf wget git build-essential libcurl4-openssl-dev cmake libbz2-dev libssl-dev liblzma-dev zlib1g-dev r-base && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install +RUN git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux dsc_pileup" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "dsc_pileup main" + echo "" + echo "Dsc-pileup is a software tool to pileup reads and corresponding base quality" + echo "for each overlapping SNPs and each barcode. By using pileup files," + echo "it would allow us to run demuxlet/freemuxlet pretty fast multiple times" + echo "without going over the BAM file again." + echo "" + echo "Input:" + echo " --sam" + echo " type: file, file must exist" + echo " Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + echo "" + echo " --tag_group" + echo " type: string" + echo " default: CB" + echo " Tag representing readgroup or cell barcodes, in the case to partition" + echo " the BAM file into multiple groups. For 10x genomics, use CB." + echo "" + echo " --tag_umi" + echo " type: string" + echo " default: UB" + echo " Tag representing UMIs. For 10x genomiucs, use UB." + echo "" + echo " --exclude_flag" + echo " type: integer" + echo " default: 1796" + echo " SAM/BAM FLAGs to be excluded." + echo "" + echo " --vcf" + echo " type: file, file must exist" + echo " Input VCF/BCF file for dsc-pileup, containing the AC and AN field." + echo "" + echo " --sm" + echo " type: string" + echo " List of sample IDs to compare to (default: use all)." + echo "" + echo " --sm_list" + echo " type: string" + echo " File containing the list of sample IDs to compare." + echo "" + echo " --sam_verbose" + echo " type: integer" + echo " default: 1000000" + echo " Verbose message frequency for SAM/BAM/CRAM." + echo "" + echo " --vcf_verbose" + echo " type: integer" + echo " default: 1000" + echo " Verbose message frequency for VCF/BCF." + echo "" + echo " --skip_umi" + echo " type: boolean_true" + echo " Do not generate [prefix].umi.gz file, which stores the regions covered" + echo " by each barcode/UMI pair." + echo "" + echo " --cap_bq" + echo " type: integer" + echo " default: 40" + echo " Maximum base quality (higher BQ will be capped)." + echo "" + echo " --min_bq" + echo " type: integer" + echo " default: 13" + echo " Minimum base quality to consider (lower BQ will be skipped)." + echo "" + echo " --min_mq" + echo " type: integer" + echo " default: 20" + echo " Minimum mapping quality to consider (lower MQ will be ignored)." + echo "" + echo " --min_td" + echo " type: integer" + echo " default: 0" + echo " Minimum distance to the tail (lower will be ignored)." + echo "" + echo " --excl_flag" + echo " type: integer" + echo " default: 3844" + echo " SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering Options." + echo "" + echo " --group_list" + echo " type: string" + echo " List of tag readgroup/cell barcode to consider in this run. All other" + echo " barcodes will be ignored. This is useful for parallelized run." + echo "" + echo " --min_total" + echo " type: integer" + echo " default: 0" + echo " Minimum number of total reads for a droplet/cell to be considered." + echo "" + echo " --min_uniq" + echo " type: integer" + echo " default: 0" + echo " Minimum number of unique reads (determined by UMI/SNP pair) for a" + echo " droplet/cell to be considered." + echo "" + echo " --min_snp" + echo " type: integer" + echo " default: 0" + echo " Minimum number of SNPs with coverage for a droplet/cell to be" + echo " considered." + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: demux" + echo " Output directory" + echo "" + echo " --out" + echo " type: string" + echo " example: demuxlet_dsc" + echo " dsc-pileup output file prefix" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "dsc_pileup main" + exit + ;; + --sam) + [ -n "$VIASH_PAR_SAM" ] && ViashError Bad arguments for option \'--sam\': \'$VIASH_PAR_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam=*) + [ -n "$VIASH_PAR_SAM" ] && ViashError Bad arguments for option \'--sam=*\': \'$VIASH_PAR_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag_group) + [ -n "$VIASH_PAR_TAG_GROUP" ] && ViashError Bad arguments for option \'--tag_group\': \'$VIASH_PAR_TAG_GROUP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_GROUP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag_group. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag_group=*) + [ -n "$VIASH_PAR_TAG_GROUP" ] && ViashError Bad arguments for option \'--tag_group=*\': \'$VIASH_PAR_TAG_GROUP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_GROUP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag_umi) + [ -n "$VIASH_PAR_TAG_UMI" ] && ViashError Bad arguments for option \'--tag_umi\': \'$VIASH_PAR_TAG_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag_umi=*) + [ -n "$VIASH_PAR_TAG_UMI" ] && ViashError Bad arguments for option \'--tag_umi=*\': \'$VIASH_PAR_TAG_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exclude_flag) + [ -n "$VIASH_PAR_EXCLUDE_FLAG" ] && ViashError Bad arguments for option \'--exclude_flag\': \'$VIASH_PAR_EXCLUDE_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_FLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude_flag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude_flag=*) + [ -n "$VIASH_PAR_EXCLUDE_FLAG" ] && ViashError Bad arguments for option \'--exclude_flag=*\': \'$VIASH_PAR_EXCLUDE_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_FLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vcf) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf=*) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf=*\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sm) + [ -n "$VIASH_PAR_SM" ] && ViashError Bad arguments for option \'--sm\': \'$VIASH_PAR_SM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sm=*) + [ -n "$VIASH_PAR_SM" ] && ViashError Bad arguments for option \'--sm=*\': \'$VIASH_PAR_SM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sm_list) + [ -n "$VIASH_PAR_SM_LIST" ] && ViashError Bad arguments for option \'--sm_list\': \'$VIASH_PAR_SM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sm_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sm_list=*) + [ -n "$VIASH_PAR_SM_LIST" ] && ViashError Bad arguments for option \'--sm_list=*\': \'$VIASH_PAR_SM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SM_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sam_verbose) + [ -n "$VIASH_PAR_SAM_VERBOSE" ] && ViashError Bad arguments for option \'--sam_verbose\': \'$VIASH_PAR_SAM_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_VERBOSE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sam_verbose. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sam_verbose=*) + [ -n "$VIASH_PAR_SAM_VERBOSE" ] && ViashError Bad arguments for option \'--sam_verbose=*\': \'$VIASH_PAR_SAM_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAM_VERBOSE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vcf_verbose) + [ -n "$VIASH_PAR_VCF_VERBOSE" ] && ViashError Bad arguments for option \'--vcf_verbose\': \'$VIASH_PAR_VCF_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_VERBOSE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf_verbose. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf_verbose=*) + [ -n "$VIASH_PAR_VCF_VERBOSE" ] && ViashError Bad arguments for option \'--vcf_verbose=*\': \'$VIASH_PAR_VCF_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_VERBOSE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --skip_umi) + [ -n "$VIASH_PAR_SKIP_UMI" ] && ViashError Bad arguments for option \'--skip_umi\': \'$VIASH_PAR_SKIP_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SKIP_UMI=true + shift 1 + ;; + --cap_bq) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cap_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cap_bq=*) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq=*\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_bq) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_bq=*) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq=*\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_mq) + [ -n "$VIASH_PAR_MIN_MQ" ] && ViashError Bad arguments for option \'--min_mq\': \'$VIASH_PAR_MIN_MQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mq=*) + [ -n "$VIASH_PAR_MIN_MQ" ] && ViashError Bad arguments for option \'--min_mq=*\': \'$VIASH_PAR_MIN_MQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_td) + [ -n "$VIASH_PAR_MIN_TD" ] && ViashError Bad arguments for option \'--min_td\': \'$VIASH_PAR_MIN_TD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_td. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_td=*) + [ -n "$VIASH_PAR_MIN_TD" ] && ViashError Bad arguments for option \'--min_td=*\': \'$VIASH_PAR_MIN_TD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --excl_flag) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --excl_flag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --excl_flag=*) + [ -n "$VIASH_PAR_EXCL_FLAG" ] && ViashError Bad arguments for option \'--excl_flag=*\': \'$VIASH_PAR_EXCL_FLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCL_FLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --group_list) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --group_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --group_list=*) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list=*\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_total) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_total. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_total=*) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total=*\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_uniq) + [ -n "$VIASH_PAR_MIN_UNIQ" ] && ViashError Bad arguments for option \'--min_uniq\': \'$VIASH_PAR_MIN_UNIQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UNIQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_uniq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_uniq=*) + [ -n "$VIASH_PAR_MIN_UNIQ" ] && ViashError Bad arguments for option \'--min_uniq=*\': \'$VIASH_PAR_MIN_UNIQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UNIQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_snp) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_snp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_snp=*) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp=*\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --out. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out=*) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out=*\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/dsc_pileup:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_TAG_GROUP+x} ]; then + VIASH_PAR_TAG_GROUP="CB" +fi +if [ -z ${VIASH_PAR_TAG_UMI+x} ]; then + VIASH_PAR_TAG_UMI="UB" +fi +if [ -z ${VIASH_PAR_EXCLUDE_FLAG+x} ]; then + VIASH_PAR_EXCLUDE_FLAG="1796" +fi +if [ -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then + VIASH_PAR_SAM_VERBOSE="1000000" +fi +if [ -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then + VIASH_PAR_VCF_VERBOSE="1000" +fi +if [ -z ${VIASH_PAR_SKIP_UMI+x} ]; then + VIASH_PAR_SKIP_UMI="false" +fi +if [ -z ${VIASH_PAR_CAP_BQ+x} ]; then + VIASH_PAR_CAP_BQ="40" +fi +if [ -z ${VIASH_PAR_MIN_BQ+x} ]; then + VIASH_PAR_MIN_BQ="13" +fi +if [ -z ${VIASH_PAR_MIN_MQ+x} ]; then + VIASH_PAR_MIN_MQ="20" +fi +if [ -z ${VIASH_PAR_MIN_TD+x} ]; then + VIASH_PAR_MIN_TD="0" +fi +if [ -z ${VIASH_PAR_EXCL_FLAG+x} ]; then + VIASH_PAR_EXCL_FLAG="3844" +fi +if [ -z ${VIASH_PAR_MIN_TOTAL+x} ]; then + VIASH_PAR_MIN_TOTAL="0" +fi +if [ -z ${VIASH_PAR_MIN_UNIQ+x} ]; then + VIASH_PAR_MIN_UNIQ="0" +fi +if [ -z ${VIASH_PAR_MIN_SNP+x} ]; then + VIASH_PAR_MIN_SNP="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_SAM" ] && [ ! -e "$VIASH_PAR_SAM" ]; then + ViashError "Input file '$VIASH_PAR_SAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VCF" ] && [ ! -e "$VIASH_PAR_VCF" ]; then + ViashError "Input file '$VIASH_PAR_VCF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EXCLUDE_FLAG" ]]; then + if ! [[ "$VIASH_PAR_EXCLUDE_FLAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--exclude_flag' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SAM_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_SAM_VERBOSE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sam_verbose' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VCF_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_VCF_VERBOSE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--vcf_verbose' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SKIP_UMI" ]]; then + if ! [[ "$VIASH_PAR_SKIP_UMI" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--skip_umi' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CAP_BQ" ]]; then + if ! [[ "$VIASH_PAR_CAP_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cap_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_BQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_MQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_mq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_TD" ]]; then + if ! [[ "$VIASH_PAR_MIN_TD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_td' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXCL_FLAG" ]]; then + if ! [[ "$VIASH_PAR_EXCL_FLAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--excl_flag' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_TOTAL" ]]; then + if ! [[ "$VIASH_PAR_MIN_TOTAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_total' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_UNIQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_UNIQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_uniq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SNP" ]]; then + if ! [[ "$VIASH_PAR_MIN_SNP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_snp' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_SAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAM")" ) + VIASH_PAR_SAM=$(ViashDockerAutodetectMount "$VIASH_PAR_SAM") +fi +if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VCF")" ) + VIASH_PAR_VCF=$(ViashDockerAutodetectMount "$VIASH_PAR_VCF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-dsc_pileup-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_SAM+x} ]; then echo "${VIASH_PAR_SAM}" | sed "s#'#'\"'\"'#g;s#.*#par_sam='&'#" ; else echo "# par_sam="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG_GROUP+x} ]; then echo "${VIASH_PAR_TAG_GROUP}" | sed "s#'#'\"'\"'#g;s#.*#par_tag_group='&'#" ; else echo "# par_tag_group="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG_UMI+x} ]; then echo "${VIASH_PAR_TAG_UMI}" | sed "s#'#'\"'\"'#g;s#.*#par_tag_umi='&'#" ; else echo "# par_tag_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_FLAG+x} ]; then echo "${VIASH_PAR_EXCLUDE_FLAG}" | sed "s#'#'\"'\"'#g;s#.*#par_exclude_flag='&'#" ; else echo "# par_exclude_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_SM+x} ]; then echo "${VIASH_PAR_SM}" | sed "s#'#'\"'\"'#g;s#.*#par_sm='&'#" ; else echo "# par_sm="; fi ) +$( if [ ! -z ${VIASH_PAR_SM_LIST+x} ]; then echo "${VIASH_PAR_SM_LIST}" | sed "s#'#'\"'\"'#g;s#.*#par_sm_list='&'#" ; else echo "# par_sm_list="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then echo "${VIASH_PAR_SAM_VERBOSE}" | sed "s#'#'\"'\"'#g;s#.*#par_sam_verbose='&'#" ; else echo "# par_sam_verbose="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then echo "${VIASH_PAR_VCF_VERBOSE}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf_verbose='&'#" ; else echo "# par_vcf_verbose="; fi ) +$( if [ ! -z ${VIASH_PAR_SKIP_UMI+x} ]; then echo "${VIASH_PAR_SKIP_UMI}" | sed "s#'#'\"'\"'#g;s#.*#par_skip_umi='&'#" ; else echo "# par_skip_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo "${VIASH_PAR_CAP_BQ}" | sed "s#'#'\"'\"'#g;s#.*#par_cap_bq='&'#" ; else echo "# par_cap_bq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo "${VIASH_PAR_MIN_BQ}" | sed "s#'#'\"'\"'#g;s#.*#par_min_bq='&'#" ; else echo "# par_min_bq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MQ+x} ]; then echo "${VIASH_PAR_MIN_MQ}" | sed "s#'#'\"'\"'#g;s#.*#par_min_mq='&'#" ; else echo "# par_min_mq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_TD+x} ]; then echo "${VIASH_PAR_MIN_TD}" | sed "s#'#'\"'\"'#g;s#.*#par_min_td='&'#" ; else echo "# par_min_td="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo "${VIASH_PAR_EXCL_FLAG}" | sed "s#'#'\"'\"'#g;s#.*#par_excl_flag='&'#" ; else echo "# par_excl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo "${VIASH_PAR_GROUP_LIST}" | sed "s#'#'\"'\"'#g;s#.*#par_group_list='&'#" ; else echo "# par_group_list="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo "${VIASH_PAR_MIN_TOTAL}" | sed "s#'#'\"'\"'#g;s#.*#par_min_total='&'#" ; else echo "# par_min_total="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_UNIQ+x} ]; then echo "${VIASH_PAR_MIN_UNIQ}" | sed "s#'#'\"'\"'#g;s#.*#par_min_uniq='&'#" ; else echo "# par_min_uniq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo "${VIASH_PAR_MIN_SNP}" | sed "s#'#'\"'\"'#g;s#.*#par_min_snp='&'#" ; else echo "# par_min_snp="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo "${VIASH_PAR_OUT}" | sed "s#'#'\"'\"'#g;s#.*#par_out='&'#" ; else echo "# par_out="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\$par_skip_umi" == "false" ]] && unset par_skip_umi + +# Create output directory if it doesn't exist +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi + +popscle dsc-pileup \\ + --sam \$par_sam \\ + --tag-group \$par_tag_group \\ + --tag-UMI \$par_tag_umi \\ + --exclude-flag \$par_exclude_flag \\ + --sam-verbose \$par_sam_verbose \\ + --vcf \$par_vcf \\ + --vcf-verbose \$par_vcf_verbose \\ + --out "\$par_output/\$par_out" \\ + --cap-BQ \$par_cap_bq \\ + --min-BQ \$par_min_bq \\ + --min-MQ \$par_min_mq \\ + --min-TD \$par_min_td \\ + --excl-flag \$par_excl_flag \\ + --min-total \$par_min_total \\ + --min-uniq \$par_min_uniq \\ + --min-snp \$par_min_snp \\ + \${par_sm:+--sm \$par_sm} \\ + \${par_sm_list:+--sm-list \$par_sm_list} \\ + \${par_skip_umi:+--skip-umi} \\ + \${par_group_list:+--group-list \$par_group_list} +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_SAM" ]; then + VIASH_PAR_SAM=$(ViashDockerStripAutomount "$VIASH_PAR_SAM") + fi + if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_PAR_VCF=$(ViashDockerStripAutomount "$VIASH_PAR_VCF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/dsc_pileup/nextflow_labels.config b/target/executable/genetic_demux/dsc_pileup/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/dsc_pileup/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/freebayes/.config.vsh.yaml b/target/executable/genetic_demux/freebayes/.config.vsh.yaml new file mode 100644 index 00000000..eeb09094 --- /dev/null +++ b/target/executable/genetic_demux/freebayes/.config.vsh.yaml @@ -0,0 +1,841 @@ +name: "freebayes" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--bam" + description: "Add FILE to the set of BAM files to be analyzed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_list" + description: "A file containing a list of BAM files to be analyzed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--stdin" + description: "Read BAM input on stdin." + info: null + direction: "input" + - type: "file" + name: "--fasta_reference" + description: "Use FILE as the reference sequence for analysis. An index file (FILE.fai)\ + \ will be created if none exists. If neither --targets nor --region are specified,\ + \ FreeBayes will analyze every position in this reference." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fasta_reference_index" + description: "Use FILE.fai as the index of reference sequence for analysis." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targets" + description: "Limit analysis to targets listed in the BED-format FILE." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--region" + description: "Limit analysis to the specified region, 0-base coordinates, end_position\ + \ not included (same as BED format)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--samples" + description: "Limit analysis to samples listed (one per line) in the FILE. By\ + \ default FreeBayes will analyze all samples in its input BAM files." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--populations" + description: "Each line of FILE should list a sample and a population which it\ + \ is part of. The population-based bayesian inference model will then be partitioned\ + \ on the basis of the populations." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cnv_map" + description: "Read a copy number map from the BED file FILE, which has either\ + \ a sample-level ploidy or a region-specific format." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--gvcf" + description: "Write gVCF output, which indicates coverage in uncalled regions." + info: null + direction: "input" + - type: "integer" + name: "--gvcf_chunk" + description: "When writing gVCF output emit a record for every NUM bases." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--variant_input" + description: "Use variants reported in VCF file as input to the algorithm. Variants\ + \ in this file will included in the output even if there is not enough support\ + \ in the data to pass input filters." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--only_use_input_alleles" + description: "Only provide variant calls and genotype likelihoods for sites and\ + \ alleles which are provided in the VCF input, and provide output in the VCF\ + \ for all input alleles, not just those which have support in the data." + info: null + direction: "input" + - type: "file" + name: "--haplotype_basis_alleles" + description: "When specified, only variant alleles provided in this input VCF\ + \ will be used for the construction of complex or haplotype alleles." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--report_all_haplotype_alleles" + description: "At sites where genotypes are made over haplotype alleles, provide\ + \ information about all alleles in output, not only those which are called." + info: null + direction: "input" + - type: "boolean_true" + name: "--report_monomorphic" + description: "Report even loci which appear to be monomorphic, and report all\ + \ considered alleles, even those which are not in called genotypes." + info: null + direction: "input" + - type: "double" + name: "--pvar" + description: "Report sites if the probability that there is a polymorphism at\ + \ the site is greater than N. Note that post-filtering is generally recommended\ + \ over the use of this parameter." + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--strict_vcf" + description: "Generate strict VCF format (FORMAT/GQ will be an int)." + info: null + direction: "input" + - type: "double" + name: "--theta" + description: "The expected mutation rate or pairwise nucleotide diversity among\ + \ the population under analysis. This serves as the single parameter to the\ + \ Ewens Sampling Formula prior model." + info: null + default: + - 0.001 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ploidy" + description: "Sets the default ploidy for the analysis to N." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--pooled_discrete" + description: "Assume that samples result from pooled sequencing. Model pooled\ + \ samples using discrete genotypes across pools." + info: null + direction: "input" + - type: "boolean_true" + name: "--pooled_continuous" + description: "Output all alleles which pass input filters, regardles of genotyping\ + \ outcome or model." + info: null + direction: "input" + - type: "boolean_true" + name: "--use_reference_allele" + description: "This flag includes the reference allele in the analysis as if it\ + \ is another sample from the same population." + info: null + direction: "input" + - type: "string" + name: "--reference_quality" + description: "Assign mapping quality of MQ to the reference allele at each site\ + \ and base quality of BQ." + info: null + default: + - "100,60" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--throw_away_snp_obs" + description: "Ignore SNP alleles." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_mnps_obs" + description: "Ignore multi-nuceotide polymorphisms, MNPs. MNPs are excluded as\ + \ default." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_indel_obs" + description: "Ignore insertion and deletion alleles. Indels are excluded as default." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_complex_obs" + description: "Ignore complex events (composites of other classes). Complex are\ + \ excluded as default" + info: null + direction: "input" + - type: "integer" + name: "--use_best_n_alleles" + description: "Evaluate only the best N SNP alleles, ranked by sum of supporting\ + \ quality scores." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_complex_gap" + description: "Allow haplotype calls with contiguous embedded matches of up to\ + \ this length." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_repeat_size" + description: "When assembling observations across repeats, require the total repeat\ + \ length at least this many bp." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_repeat_entropy" + description: "To detect interrupted repeats, build across sequence until it has\ + \ entropy > N bits per bp. Set to 0 to turn off." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_partial_observations" + description: "Exclude observations which do not fully span the dynamically-determined\ + \ detection window. (default, use all observations, dividing partial support\ + \ across matching haplotypes when generating haplotypes.)" + info: null + direction: "input" + - type: "boolean_true" + name: "--dont_left_align_indels" + description: "Turn off left-alignment of indels, which is enabled by default." + info: null + direction: "input" + - type: "boolean_true" + name: "--use_duplicate_reads" + description: "Include duplicate-marked alignments in the analysis. default: exclude\ + \ duplicates marked as such in alignments" + info: null + direction: "input" + - type: "integer" + name: "--min_mapping_quality" + description: "Exclude alignments from analysis if they have a mapping quality\ + \ less than Q." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_base_quality" + description: "Exclude alleles from analysis if their supporting base quality is\ + \ less than Q. Default value is changed according to the instruction of scSplit." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_supporting_allele_qsum" + description: "Consider any allele in which the sum of qualities of supporting\ + \ observations is at least Q." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_supporting_mapping_qsum" + description: "Consider any allele in which and the sum of mapping qualities of\ + \ supporting reads is at least." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--mismatch_base_quality_threshold" + description: "Count mismatches toward --read-mismatch-limit if the base quality\ + \ of the mismatch is >= Q." + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--read_max_mismatch_fraction" + description: "Exclude reads with more than N mismatches where each mismatch has\ + \ base quality >= mismatch-base-quality-threshold." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_mismatch_limit" + description: "Exclude reads with more than N [0,1] fraction of mismatches where\ + \ each mismatch has base quality >= mismatch-base-quality-threshold." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_snp_limit" + description: "Exclude reads with more than N base mismatches, ignoring gaps with\ + \ quality >= mismatch-base-quality-threshold." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_indel_limit" + description: "Exclude reads with more than N separate gaps." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--standard_filters" + description: "Use stringent input base and mapping quality filters, equivalent\ + \ to -m 30 -q 20 -R 0 -S 0" + info: null + direction: "input" + - type: "double" + name: "--min_alternate_fraction" + description: "Require at least this fraction of observations supporting an alternate\ + \ allele within a single individual in order to evaluate the position." + info: null + default: + - 0.05 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_count" + description: "Require at least this count of observations supporting an alternate\ + \ allele within a single individual in order to evaluate the position." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_qsum" + description: "Require at least this sum of quality of observations supporting\ + \ an alternate allele within a single individual in order to evaluate the position." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_total" + description: "Require at least this count of observations supporting an alternate\ + \ allele within the total population in order to use the allele in analysis." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_coverage" + description: "Require at least this coverage to process a site." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_coverage" + description: "Do not process sites with greater than this coverage." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_population_priors" + description: "Equivalent to --pooled-discrete --hwe-priors-off and removal of\ + \ Ewens Sampling Formula component of priors." + info: null + direction: "input" + - type: "boolean_true" + name: "--hwe_priors_off" + description: "Disable estimation of the probability of the combination arising\ + \ under HWE given the allele frequency as estimated by observation frequency." + info: null + direction: "input" + - type: "boolean_true" + name: "--binomial_obs_priors_off" + description: "Disable incorporation of prior expectations about observations.\ + \ Uses read placement probability, strand balance probability, and read position\ + \ probability." + info: null + direction: "input" + - type: "boolean_true" + name: "--allele_balance_priors_off" + description: "Disable use of aggregate probability of observation balance between\ + \ alleles as a component of the priors." + info: null + direction: "input" + - type: "file" + name: "--observation_bias" + description: "Read length-dependent allele observation biases from FILE. The format\ + \ is [length] [alignment efficiency relative to reference] where the efficiency\ + \ is 1 if there is no relative observation bias." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--base_quality_cap" + description: "Limit estimated observation quality by capping base quality at Q." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--prob_contamination" + description: "An estimate of contamination to use for all samples." + info: null + default: + - 1.0E-8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--legacy_gls" + description: "Use legacy (polybayes equivalent) genotype likelihood calculations" + info: null + direction: "input" + - type: "file" + name: "--contamination_estimates" + description: "A file containing per-sample estimates of contamination, such as\ + \ those generated by VerifyBamID." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--report_genotype_likelihood_max" + description: "Report genotypes using the maximum-likelihood estimate provided\ + \ from genotype likelihoods." + info: null + direction: "input" + - type: "integer" + name: "--genotyping_max_iterations" + description: "Iterate no more than N times during genotyping step." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--genotyping_max_banddepth" + description: "Integrate no deeper than the Nth best genotype by likelihood when\ + \ genotyping." + info: null + default: + - 6 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--posterior_integration_limits" + description: "Integrate all genotype combinations in our posterior space which\ + \ include no more than N samples with their Mth best data likelihood." + info: null + default: + - "1,3" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_unobserved_genotypes" + description: "Skip sample genotypings for which the sample has no supporting reads." + info: null + direction: "input" + - type: "integer" + name: "--genotype_variant_threshold" + description: "Limit posterior integration to samples where the second-best genotype\ + \ likelihood is no more than log(N) from the highest genotype likelihood for\ + \ the sample." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--use_mapping_quality" + description: "Use mapping quality of alleles when calculating data likelihoods." + info: null + direction: "input" + - type: "boolean_true" + name: "--harmonic_indel_quality" + description: "Use a weighted sum of base qualities around an indel, scaled by\ + \ the distance from the indel. By default use a minimum BQ in flanking sequence." + info: null + direction: "input" + - type: "double" + name: "--read_dependence_factor" + description: "Incorporate non-independence of reads by scaling successive observations\ + \ by this factor during data likelihood calculations." + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--genotype_qualities" + description: "Calculate the marginal probability of genotypes and report as GQ\ + \ in each sample field in the VCF output." + info: null + direction: "input" + - type: "boolean_true" + name: "--debug" + description: "Print debugging output." + info: null + direction: "input" + - type: "boolean_true" + name: "--dd" + description: "Print more verbose debugging output" + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "freebayes_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--vcf" + description: "Output VCF-format results to FILE." + info: null + example: + - "snp.vcf" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Freebayes is a Bayesian genetic variant detector designed to\nfind small\ + \ polymorphisms, specifically SNPs.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "freebayes" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/freebayes/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/freebayes" + executable: "target/executable/genetic_demux/freebayes/freebayes" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/freebayes/freebayes b/target/executable/genetic_demux/freebayes/freebayes new file mode 100755 index 00000000..c24f79e0 --- /dev/null +++ b/target/executable/genetic_demux/freebayes/freebayes @@ -0,0 +1,2910 @@ +#!/usr/bin/env bash + +# freebayes main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="freebayes" +VIASH_META_FUNCTIONALITY_NAME="freebayes" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:22.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y freebayes && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux freebayes" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "freebayes main" + echo "" + echo "Freebayes is a Bayesian genetic variant detector designed to" + echo "find small polymorphisms, specifically SNPs." + echo "" + echo "Input:" + echo " --bam" + echo " type: file, file must exist" + echo " Add FILE to the set of BAM files to be analyzed." + echo "" + echo " --bam_list" + echo " type: file, file must exist" + echo " A file containing a list of BAM files to be analyzed." + echo "" + echo " --stdin" + echo " type: boolean_true" + echo " Read BAM input on stdin." + echo "" + echo " --fasta_reference" + echo " type: file, file must exist" + echo " Use FILE as the reference sequence for analysis. An index file" + echo " (FILE.fai) will be created if none exists. If neither --targets nor" + echo " --region are specified, FreeBayes will analyze every position in this" + echo " reference." + echo "" + echo " --fasta_reference_index" + echo " type: file, file must exist" + echo " Use FILE.fai as the index of reference sequence for analysis." + echo "" + echo " --targets" + echo " type: file, file must exist" + echo " Limit analysis to targets listed in the BED-format FILE." + echo "" + echo " --region" + echo " type: string" + echo " Limit analysis to the specified region, 0-base coordinates, end_position" + echo " not included (same as BED format)." + echo "" + echo " --samples" + echo " type: file, file must exist" + echo " Limit analysis to samples listed (one per line) in the FILE. By default" + echo " FreeBayes will analyze all samples in its input BAM files." + echo "" + echo " --populations" + echo " type: file, file must exist" + echo " Each line of FILE should list a sample and a population which it is part" + echo " of. The population-based bayesian inference model will then be" + echo " partitioned on the basis of the populations." + echo "" + echo " --cnv_map" + echo " type: file, file must exist" + echo " Read a copy number map from the BED file FILE, which has either a" + echo " sample-level ploidy or a region-specific format." + echo "" + echo " --gvcf" + echo " type: boolean_true" + echo " Write gVCF output, which indicates coverage in uncalled regions." + echo "" + echo " --gvcf_chunk" + echo " type: integer" + echo " When writing gVCF output emit a record for every NUM bases." + echo "" + echo " --variant_input" + echo " type: file, file must exist" + echo " Use variants reported in VCF file as input to the algorithm. Variants in" + echo " this file will included in the output even if there is not enough" + echo " support in the data to pass input filters." + echo "" + echo " --only_use_input_alleles" + echo " type: boolean_true" + echo " Only provide variant calls and genotype likelihoods for sites and" + echo " alleles which are provided in the VCF input, and provide output in the" + echo " VCF for all input alleles, not just those which have support in the" + echo " data." + echo "" + echo " --haplotype_basis_alleles" + echo " type: file, file must exist" + echo " When specified, only variant alleles provided in this input VCF will be" + echo " used for the construction of complex or haplotype alleles." + echo "" + echo " --report_all_haplotype_alleles" + echo " type: boolean_true" + echo " At sites where genotypes are made over haplotype alleles, provide" + echo " information about all alleles in output, not only those which are" + echo " called." + echo "" + echo " --report_monomorphic" + echo " type: boolean_true" + echo " Report even loci which appear to be monomorphic, and report all" + echo " considered alleles, even those which are not in called genotypes." + echo "" + echo " --pvar" + echo " type: double" + echo " default: 0.0" + echo " Report sites if the probability that there is a polymorphism at the site" + echo " is greater than N. Note that post-filtering is generally recommended" + echo " over the use of this parameter." + echo "" + echo " --strict_vcf" + echo " type: boolean_true" + echo " Generate strict VCF format (FORMAT/GQ will be an int)." + echo "" + echo " --theta" + echo " type: double" + echo " default: 0.001" + echo " The expected mutation rate or pairwise nucleotide diversity among the" + echo " population under analysis. This serves as the single parameter to the" + echo " Ewens Sampling Formula prior model." + echo "" + echo " --ploidy" + echo " type: integer" + echo " default: 2" + echo " Sets the default ploidy for the analysis to N." + echo "" + echo " --pooled_discrete" + echo " type: boolean_true" + echo " Assume that samples result from pooled sequencing. Model pooled samples" + echo " using discrete genotypes across pools." + echo "" + echo " --pooled_continuous" + echo " type: boolean_true" + echo " Output all alleles which pass input filters, regardles of genotyping" + echo " outcome or model." + echo "" + echo " --use_reference_allele" + echo " type: boolean_true" + echo " This flag includes the reference allele in the analysis as if it is" + echo " another sample from the same population." + echo "" + echo " --reference_quality" + echo " type: string" + echo " default: 100,60" + echo " Assign mapping quality of MQ to the reference allele at each site and" + echo " base quality of BQ." + echo "" + echo " --throw_away_snp_obs" + echo " type: boolean_true" + echo " Ignore SNP alleles." + echo "" + echo " --throw_away_mnps_obs" + echo " type: boolean_false" + echo " Ignore multi-nuceotide polymorphisms, MNPs. MNPs are excluded as" + echo " default." + echo "" + echo " --throw_away_indel_obs" + echo " type: boolean_false" + echo " Ignore insertion and deletion alleles. Indels are excluded as default." + echo "" + echo " --throw_away_complex_obs" + echo " type: boolean_false" + echo " Ignore complex events (composites of other classes). Complex are" + echo " excluded as default" + echo "" + echo " --use_best_n_alleles" + echo " type: integer" + echo " default: 0" + echo " Evaluate only the best N SNP alleles, ranked by sum of supporting" + echo " quality scores." + echo "" + echo " --max_complex_gap" + echo " type: integer" + echo " default: 3" + echo " Allow haplotype calls with contiguous embedded matches of up to this" + echo " length." + echo "" + echo " --min_repeat_size" + echo " type: integer" + echo " default: 5" + echo " When assembling observations across repeats, require the total repeat" + echo " length at least this many bp." + echo "" + echo " --min_repeat_entropy" + echo " type: integer" + echo " default: 1" + echo " To detect interrupted repeats, build across sequence until it has" + echo " entropy > N bits per bp. Set to 0 to turn off." + echo "" + echo " --no_partial_observations" + echo " type: boolean_true" + echo " Exclude observations which do not fully span the dynamically-determined" + echo " detection window. (default, use all observations, dividing partial" + echo " support across matching haplotypes when generating haplotypes.)" + echo "" + echo " --dont_left_align_indels" + echo " type: boolean_true" + echo " Turn off left-alignment of indels, which is enabled by default." + echo "" + echo " --use_duplicate_reads" + echo " type: boolean_true" + echo " Include duplicate-marked alignments in the analysis. default: exclude" + echo " duplicates marked as such in alignments" + echo "" + echo " --min_mapping_quality" + echo " type: integer" + echo " default: 1" + echo " Exclude alignments from analysis if they have a mapping quality less" + echo " than Q." + echo "" + echo " --min_base_quality" + echo " type: integer" + echo " default: 1" + echo " Exclude alleles from analysis if their supporting base quality is less" + echo " than Q. Default value is changed according to the instruction of" + echo " scSplit." + echo "" + echo " --min_supporting_allele_qsum" + echo " type: integer" + echo " default: 0" + echo " Consider any allele in which the sum of qualities of supporting" + echo " observations is at least Q." + echo "" + echo " --min_supporting_mapping_qsum" + echo " type: integer" + echo " default: 0" + echo " Consider any allele in which and the sum of mapping qualities of" + echo " supporting reads is at least." + echo "" + echo " --mismatch_base_quality_threshold" + echo " type: integer" + echo " default: 10" + echo " Count mismatches toward --read-mismatch-limit if the base quality of the" + echo " mismatch is >= Q." + echo "" + echo " --read_max_mismatch_fraction" + echo " type: double" + echo " default: 1.0" + echo " Exclude reads with more than N mismatches where each mismatch has base" + echo " quality >= mismatch-base-quality-threshold." + echo "" + echo " --read_mismatch_limit" + echo " type: integer" + echo " Exclude reads with more than N [0,1] fraction of mismatches where each" + echo " mismatch has base quality >= mismatch-base-quality-threshold." + echo "" + echo " --read_snp_limit" + echo " type: integer" + echo " Exclude reads with more than N base mismatches, ignoring gaps with" + echo " quality >= mismatch-base-quality-threshold." + echo "" + echo " --read_indel_limit" + echo " type: integer" + echo " Exclude reads with more than N separate gaps." + echo "" + echo " --standard_filters" + echo " type: boolean_true" + echo " Use stringent input base and mapping quality filters, equivalent to -m" + echo " 30 -q 20 -R 0 -S 0" + echo "" + echo " --min_alternate_fraction" + echo " type: double" + echo " default: 0.05" + echo " Require at least this fraction of observations supporting an alternate" + echo " allele within a single individual in order to evaluate the position." + echo "" + echo " --min_alternate_count" + echo " type: integer" + echo " default: 2" + echo " Require at least this count of observations supporting an alternate" + echo " allele within a single individual in order to evaluate the position." + echo "" + echo " --min_alternate_qsum" + echo " type: integer" + echo " default: 0" + echo " Require at least this sum of quality of observations supporting an" + echo " alternate allele within a single individual in order to evaluate the" + echo " position." + echo "" + echo " --min_alternate_total" + echo " type: integer" + echo " default: 1" + echo " Require at least this count of observations supporting an alternate" + echo " allele within the total population in order to use the allele in" + echo " analysis." + echo "" + echo " --min_coverage" + echo " type: integer" + echo " default: 0" + echo " Require at least this coverage to process a site." + echo "" + echo " --max_coverage" + echo " type: integer" + echo " Do not process sites with greater than this coverage." + echo "" + echo " --no_population_priors" + echo " type: boolean_true" + echo " Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens" + echo " Sampling Formula component of priors." + echo "" + echo " --hwe_priors_off" + echo " type: boolean_true" + echo " Disable estimation of the probability of the combination arising under" + echo " HWE given the allele frequency as estimated by observation frequency." + echo "" + echo " --binomial_obs_priors_off" + echo " type: boolean_true" + echo " Disable incorporation of prior expectations about observations. Uses" + echo " read placement probability, strand balance probability, and read" + echo " position probability." + echo "" + echo " --allele_balance_priors_off" + echo " type: boolean_true" + echo " Disable use of aggregate probability of observation balance between" + echo " alleles as a component of the priors." + echo "" + echo " --observation_bias" + echo " type: file, file must exist" + echo " Read length-dependent allele observation biases from FILE. The format is" + echo " [length] [alignment efficiency relative to reference] where the" + echo " efficiency is 1 if there is no relative observation bias." + echo "" + echo " --base_quality_cap" + echo " type: integer" + echo " Limit estimated observation quality by capping base quality at Q." + echo "" + echo " --prob_contamination" + echo " type: double" + echo " default: 1.0E-8" + echo " An estimate of contamination to use for all samples." + echo "" + echo " --legacy_gls" + echo " type: boolean_true" + echo " Use legacy (polybayes equivalent) genotype likelihood calculations" + echo "" + echo " --contamination_estimates" + echo " type: file, file must exist" + echo " A file containing per-sample estimates of contamination, such as those" + echo " generated by VerifyBamID." + echo "" + echo " --report_genotype_likelihood_max" + echo " type: boolean_true" + echo " Report genotypes using the maximum-likelihood estimate provided from" + echo " genotype likelihoods." + echo "" + echo " --genotyping_max_iterations" + echo " type: integer" + echo " default: 1000" + echo " Iterate no more than N times during genotyping step." + echo "" + echo " --genotyping_max_banddepth" + echo " type: integer" + echo " default: 6" + echo " Integrate no deeper than the Nth best genotype by likelihood when" + echo " genotyping." + echo "" + echo " --posterior_integration_limits" + echo " type: string" + echo " default: 1,3" + echo " Integrate all genotype combinations in our posterior space which include" + echo " no more than N samples with their Mth best data likelihood." + echo "" + echo " --exclude_unobserved_genotypes" + echo " type: boolean_true" + echo " Skip sample genotypings for which the sample has no supporting reads." + echo "" + echo " --genotype_variant_threshold" + echo " type: integer" + echo " Limit posterior integration to samples where the second-best genotype" + echo " likelihood is no more than log(N) from the highest genotype likelihood" + echo " for the sample." + echo "" + echo " --use_mapping_quality" + echo " type: boolean_true" + echo " Use mapping quality of alleles when calculating data likelihoods." + echo "" + echo " --harmonic_indel_quality" + echo " type: boolean_true" + echo " Use a weighted sum of base qualities around an indel, scaled by the" + echo " distance from the indel. By default use a minimum BQ in flanking" + echo " sequence." + echo "" + echo " --read_dependence_factor" + echo " type: double" + echo " default: 0.9" + echo " Incorporate non-independence of reads by scaling successive observations" + echo " by this factor during data likelihood calculations." + echo "" + echo " --genotype_qualities" + echo " type: boolean_true" + echo " Calculate the marginal probability of genotypes and report as GQ in each" + echo " sample field in the VCF output." + echo "" + echo " --debug" + echo " type: boolean_true" + echo " Print debugging output." + echo "" + echo " --dd" + echo " type: boolean_true" + echo " Print more verbose debugging output" + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: freebayes_out" + echo " Output directory" + echo "" + echo " --vcf" + echo " type: string" + echo " example: snp.vcf" + echo " Output VCF-format results to FILE." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "freebayes main" + exit + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam_list) + [ -n "$VIASH_PAR_BAM_LIST" ] && ViashError Bad arguments for option \'--bam_list\': \'$VIASH_PAR_BAM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam_list=*) + [ -n "$VIASH_PAR_BAM_LIST" ] && ViashError Bad arguments for option \'--bam_list=*\': \'$VIASH_PAR_BAM_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --stdin) + [ -n "$VIASH_PAR_STDIN" ] && ViashError Bad arguments for option \'--stdin\': \'$VIASH_PAR_STDIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STDIN=true + shift 1 + ;; + --fasta_reference) + [ -n "$VIASH_PAR_FASTA_REFERENCE" ] && ViashError Bad arguments for option \'--fasta_reference\': \'$VIASH_PAR_FASTA_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fasta_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fasta_reference=*) + [ -n "$VIASH_PAR_FASTA_REFERENCE" ] && ViashError Bad arguments for option \'--fasta_reference=*\': \'$VIASH_PAR_FASTA_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fasta_reference_index) + [ -n "$VIASH_PAR_FASTA_REFERENCE_INDEX" ] && ViashError Bad arguments for option \'--fasta_reference_index\': \'$VIASH_PAR_FASTA_REFERENCE_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA_REFERENCE_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fasta_reference_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fasta_reference_index=*) + [ -n "$VIASH_PAR_FASTA_REFERENCE_INDEX" ] && ViashError Bad arguments for option \'--fasta_reference_index=*\': \'$VIASH_PAR_FASTA_REFERENCE_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA_REFERENCE_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --targets) + [ -n "$VIASH_PAR_TARGETS" ] && ViashError Bad arguments for option \'--targets\': \'$VIASH_PAR_TARGETS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGETS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --targets. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --targets=*) + [ -n "$VIASH_PAR_TARGETS" ] && ViashError Bad arguments for option \'--targets=*\': \'$VIASH_PAR_TARGETS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGETS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --region) + [ -n "$VIASH_PAR_REGION" ] && ViashError Bad arguments for option \'--region\': \'$VIASH_PAR_REGION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --region. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --region=*) + [ -n "$VIASH_PAR_REGION" ] && ViashError Bad arguments for option \'--region=*\': \'$VIASH_PAR_REGION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --samples) + [ -n "$VIASH_PAR_SAMPLES" ] && ViashError Bad arguments for option \'--samples\': \'$VIASH_PAR_SAMPLES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --samples. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --samples=*) + [ -n "$VIASH_PAR_SAMPLES" ] && ViashError Bad arguments for option \'--samples=*\': \'$VIASH_PAR_SAMPLES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --populations) + [ -n "$VIASH_PAR_POPULATIONS" ] && ViashError Bad arguments for option \'--populations\': \'$VIASH_PAR_POPULATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POPULATIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --populations. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --populations=*) + [ -n "$VIASH_PAR_POPULATIONS" ] && ViashError Bad arguments for option \'--populations=*\': \'$VIASH_PAR_POPULATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POPULATIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cnv_map) + [ -n "$VIASH_PAR_CNV_MAP" ] && ViashError Bad arguments for option \'--cnv_map\': \'$VIASH_PAR_CNV_MAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CNV_MAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cnv_map. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cnv_map=*) + [ -n "$VIASH_PAR_CNV_MAP" ] && ViashError Bad arguments for option \'--cnv_map=*\': \'$VIASH_PAR_CNV_MAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CNV_MAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gvcf) + [ -n "$VIASH_PAR_GVCF" ] && ViashError Bad arguments for option \'--gvcf\': \'$VIASH_PAR_GVCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GVCF=true + shift 1 + ;; + --gvcf_chunk) + [ -n "$VIASH_PAR_GVCF_CHUNK" ] && ViashError Bad arguments for option \'--gvcf_chunk\': \'$VIASH_PAR_GVCF_CHUNK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GVCF_CHUNK="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gvcf_chunk. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gvcf_chunk=*) + [ -n "$VIASH_PAR_GVCF_CHUNK" ] && ViashError Bad arguments for option \'--gvcf_chunk=*\': \'$VIASH_PAR_GVCF_CHUNK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GVCF_CHUNK=$(ViashRemoveFlags "$1") + shift 1 + ;; + --variant_input) + [ -n "$VIASH_PAR_VARIANT_INPUT" ] && ViashError Bad arguments for option \'--variant_input\': \'$VIASH_PAR_VARIANT_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARIANT_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --variant_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --variant_input=*) + [ -n "$VIASH_PAR_VARIANT_INPUT" ] && ViashError Bad arguments for option \'--variant_input=*\': \'$VIASH_PAR_VARIANT_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARIANT_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --only_use_input_alleles) + [ -n "$VIASH_PAR_ONLY_USE_INPUT_ALLELES" ] && ViashError Bad arguments for option \'--only_use_input_alleles\': \'$VIASH_PAR_ONLY_USE_INPUT_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ONLY_USE_INPUT_ALLELES=true + shift 1 + ;; + --haplotype_basis_alleles) + [ -n "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ] && ViashError Bad arguments for option \'--haplotype_basis_alleles\': \'$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HAPLOTYPE_BASIS_ALLELES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --haplotype_basis_alleles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --haplotype_basis_alleles=*) + [ -n "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ] && ViashError Bad arguments for option \'--haplotype_basis_alleles=*\': \'$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HAPLOTYPE_BASIS_ALLELES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --report_all_haplotype_alleles) + [ -n "$VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES" ] && ViashError Bad arguments for option \'--report_all_haplotype_alleles\': \'$VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES=true + shift 1 + ;; + --report_monomorphic) + [ -n "$VIASH_PAR_REPORT_MONOMORPHIC" ] && ViashError Bad arguments for option \'--report_monomorphic\': \'$VIASH_PAR_REPORT_MONOMORPHIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORT_MONOMORPHIC=true + shift 1 + ;; + --pvar) + [ -n "$VIASH_PAR_PVAR" ] && ViashError Bad arguments for option \'--pvar\': \'$VIASH_PAR_PVAR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PVAR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pvar. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pvar=*) + [ -n "$VIASH_PAR_PVAR" ] && ViashError Bad arguments for option \'--pvar=*\': \'$VIASH_PAR_PVAR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PVAR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --strict_vcf) + [ -n "$VIASH_PAR_STRICT_VCF" ] && ViashError Bad arguments for option \'--strict_vcf\': \'$VIASH_PAR_STRICT_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRICT_VCF=true + shift 1 + ;; + --theta) + [ -n "$VIASH_PAR_THETA" ] && ViashError Bad arguments for option \'--theta\': \'$VIASH_PAR_THETA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THETA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --theta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --theta=*) + [ -n "$VIASH_PAR_THETA" ] && ViashError Bad arguments for option \'--theta=*\': \'$VIASH_PAR_THETA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THETA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ploidy) + [ -n "$VIASH_PAR_PLOIDY" ] && ViashError Bad arguments for option \'--ploidy\': \'$VIASH_PAR_PLOIDY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLOIDY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ploidy. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ploidy=*) + [ -n "$VIASH_PAR_PLOIDY" ] && ViashError Bad arguments for option \'--ploidy=*\': \'$VIASH_PAR_PLOIDY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLOIDY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pooled_discrete) + [ -n "$VIASH_PAR_POOLED_DISCRETE" ] && ViashError Bad arguments for option \'--pooled_discrete\': \'$VIASH_PAR_POOLED_DISCRETE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POOLED_DISCRETE=true + shift 1 + ;; + --pooled_continuous) + [ -n "$VIASH_PAR_POOLED_CONTINUOUS" ] && ViashError Bad arguments for option \'--pooled_continuous\': \'$VIASH_PAR_POOLED_CONTINUOUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POOLED_CONTINUOUS=true + shift 1 + ;; + --use_reference_allele) + [ -n "$VIASH_PAR_USE_REFERENCE_ALLELE" ] && ViashError Bad arguments for option \'--use_reference_allele\': \'$VIASH_PAR_USE_REFERENCE_ALLELE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_REFERENCE_ALLELE=true + shift 1 + ;; + --reference_quality) + [ -n "$VIASH_PAR_REFERENCE_QUALITY" ] && ViashError Bad arguments for option \'--reference_quality\': \'$VIASH_PAR_REFERENCE_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_quality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_quality=*) + [ -n "$VIASH_PAR_REFERENCE_QUALITY" ] && ViashError Bad arguments for option \'--reference_quality=*\': \'$VIASH_PAR_REFERENCE_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_QUALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --throw_away_snp_obs) + [ -n "$VIASH_PAR_THROW_AWAY_SNP_OBS" ] && ViashError Bad arguments for option \'--throw_away_snp_obs\': \'$VIASH_PAR_THROW_AWAY_SNP_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THROW_AWAY_SNP_OBS=true + shift 1 + ;; + --throw_away_mnps_obs) + [ -n "$VIASH_PAR_THROW_AWAY_MNPS_OBS" ] && ViashError Bad arguments for option \'--throw_away_mnps_obs\': \'$VIASH_PAR_THROW_AWAY_MNPS_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THROW_AWAY_MNPS_OBS=false + shift 1 + ;; + --throw_away_indel_obs) + [ -n "$VIASH_PAR_THROW_AWAY_INDEL_OBS" ] && ViashError Bad arguments for option \'--throw_away_indel_obs\': \'$VIASH_PAR_THROW_AWAY_INDEL_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THROW_AWAY_INDEL_OBS=false + shift 1 + ;; + --throw_away_complex_obs) + [ -n "$VIASH_PAR_THROW_AWAY_COMPLEX_OBS" ] && ViashError Bad arguments for option \'--throw_away_complex_obs\': \'$VIASH_PAR_THROW_AWAY_COMPLEX_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THROW_AWAY_COMPLEX_OBS=false + shift 1 + ;; + --use_best_n_alleles) + [ -n "$VIASH_PAR_USE_BEST_N_ALLELES" ] && ViashError Bad arguments for option \'--use_best_n_alleles\': \'$VIASH_PAR_USE_BEST_N_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_BEST_N_ALLELES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_best_n_alleles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_best_n_alleles=*) + [ -n "$VIASH_PAR_USE_BEST_N_ALLELES" ] && ViashError Bad arguments for option \'--use_best_n_alleles=*\': \'$VIASH_PAR_USE_BEST_N_ALLELES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_BEST_N_ALLELES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_complex_gap) + [ -n "$VIASH_PAR_MAX_COMPLEX_GAP" ] && ViashError Bad arguments for option \'--max_complex_gap\': \'$VIASH_PAR_MAX_COMPLEX_GAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COMPLEX_GAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_complex_gap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_complex_gap=*) + [ -n "$VIASH_PAR_MAX_COMPLEX_GAP" ] && ViashError Bad arguments for option \'--max_complex_gap=*\': \'$VIASH_PAR_MAX_COMPLEX_GAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COMPLEX_GAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_repeat_size) + [ -n "$VIASH_PAR_MIN_REPEAT_SIZE" ] && ViashError Bad arguments for option \'--min_repeat_size\': \'$VIASH_PAR_MIN_REPEAT_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REPEAT_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_repeat_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_repeat_size=*) + [ -n "$VIASH_PAR_MIN_REPEAT_SIZE" ] && ViashError Bad arguments for option \'--min_repeat_size=*\': \'$VIASH_PAR_MIN_REPEAT_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REPEAT_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_repeat_entropy) + [ -n "$VIASH_PAR_MIN_REPEAT_ENTROPY" ] && ViashError Bad arguments for option \'--min_repeat_entropy\': \'$VIASH_PAR_MIN_REPEAT_ENTROPY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REPEAT_ENTROPY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_repeat_entropy. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_repeat_entropy=*) + [ -n "$VIASH_PAR_MIN_REPEAT_ENTROPY" ] && ViashError Bad arguments for option \'--min_repeat_entropy=*\': \'$VIASH_PAR_MIN_REPEAT_ENTROPY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REPEAT_ENTROPY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_partial_observations) + [ -n "$VIASH_PAR_NO_PARTIAL_OBSERVATIONS" ] && ViashError Bad arguments for option \'--no_partial_observations\': \'$VIASH_PAR_NO_PARTIAL_OBSERVATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_PARTIAL_OBSERVATIONS=true + shift 1 + ;; + --dont_left_align_indels) + [ -n "$VIASH_PAR_DONT_LEFT_ALIGN_INDELS" ] && ViashError Bad arguments for option \'--dont_left_align_indels\': \'$VIASH_PAR_DONT_LEFT_ALIGN_INDELS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DONT_LEFT_ALIGN_INDELS=true + shift 1 + ;; + --use_duplicate_reads) + [ -n "$VIASH_PAR_USE_DUPLICATE_READS" ] && ViashError Bad arguments for option \'--use_duplicate_reads\': \'$VIASH_PAR_USE_DUPLICATE_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_DUPLICATE_READS=true + shift 1 + ;; + --min_mapping_quality) + [ -n "$VIASH_PAR_MIN_MAPPING_QUALITY" ] && ViashError Bad arguments for option \'--min_mapping_quality\': \'$VIASH_PAR_MIN_MAPPING_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAPPING_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_mapping_quality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_mapping_quality=*) + [ -n "$VIASH_PAR_MIN_MAPPING_QUALITY" ] && ViashError Bad arguments for option \'--min_mapping_quality=*\': \'$VIASH_PAR_MIN_MAPPING_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_MAPPING_QUALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_base_quality) + [ -n "$VIASH_PAR_MIN_BASE_QUALITY" ] && ViashError Bad arguments for option \'--min_base_quality\': \'$VIASH_PAR_MIN_BASE_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BASE_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_base_quality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_base_quality=*) + [ -n "$VIASH_PAR_MIN_BASE_QUALITY" ] && ViashError Bad arguments for option \'--min_base_quality=*\': \'$VIASH_PAR_MIN_BASE_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BASE_QUALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_supporting_allele_qsum) + [ -n "$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM" ] && ViashError Bad arguments for option \'--min_supporting_allele_qsum\': \'$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_supporting_allele_qsum. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_supporting_allele_qsum=*) + [ -n "$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM" ] && ViashError Bad arguments for option \'--min_supporting_allele_qsum=*\': \'$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_supporting_mapping_qsum) + [ -n "$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM" ] && ViashError Bad arguments for option \'--min_supporting_mapping_qsum\': \'$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_supporting_mapping_qsum. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_supporting_mapping_qsum=*) + [ -n "$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM" ] && ViashError Bad arguments for option \'--min_supporting_mapping_qsum=*\': \'$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --mismatch_base_quality_threshold) + [ -n "$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD" ] && ViashError Bad arguments for option \'--mismatch_base_quality_threshold\': \'$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mismatch_base_quality_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mismatch_base_quality_threshold=*) + [ -n "$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD" ] && ViashError Bad arguments for option \'--mismatch_base_quality_threshold=*\': \'$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --read_max_mismatch_fraction) + [ -n "$VIASH_PAR_READ_MAX_MISMATCH_FRACTION" ] && ViashError Bad arguments for option \'--read_max_mismatch_fraction\': \'$VIASH_PAR_READ_MAX_MISMATCH_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_MAX_MISMATCH_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --read_max_mismatch_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --read_max_mismatch_fraction=*) + [ -n "$VIASH_PAR_READ_MAX_MISMATCH_FRACTION" ] && ViashError Bad arguments for option \'--read_max_mismatch_fraction=*\': \'$VIASH_PAR_READ_MAX_MISMATCH_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_MAX_MISMATCH_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --read_mismatch_limit) + [ -n "$VIASH_PAR_READ_MISMATCH_LIMIT" ] && ViashError Bad arguments for option \'--read_mismatch_limit\': \'$VIASH_PAR_READ_MISMATCH_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_MISMATCH_LIMIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --read_mismatch_limit. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --read_mismatch_limit=*) + [ -n "$VIASH_PAR_READ_MISMATCH_LIMIT" ] && ViashError Bad arguments for option \'--read_mismatch_limit=*\': \'$VIASH_PAR_READ_MISMATCH_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_MISMATCH_LIMIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --read_snp_limit) + [ -n "$VIASH_PAR_READ_SNP_LIMIT" ] && ViashError Bad arguments for option \'--read_snp_limit\': \'$VIASH_PAR_READ_SNP_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_SNP_LIMIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --read_snp_limit. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --read_snp_limit=*) + [ -n "$VIASH_PAR_READ_SNP_LIMIT" ] && ViashError Bad arguments for option \'--read_snp_limit=*\': \'$VIASH_PAR_READ_SNP_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_SNP_LIMIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --read_indel_limit) + [ -n "$VIASH_PAR_READ_INDEL_LIMIT" ] && ViashError Bad arguments for option \'--read_indel_limit\': \'$VIASH_PAR_READ_INDEL_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_INDEL_LIMIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --read_indel_limit. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --read_indel_limit=*) + [ -n "$VIASH_PAR_READ_INDEL_LIMIT" ] && ViashError Bad arguments for option \'--read_indel_limit=*\': \'$VIASH_PAR_READ_INDEL_LIMIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_INDEL_LIMIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --standard_filters) + [ -n "$VIASH_PAR_STANDARD_FILTERS" ] && ViashError Bad arguments for option \'--standard_filters\': \'$VIASH_PAR_STANDARD_FILTERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STANDARD_FILTERS=true + shift 1 + ;; + --min_alternate_fraction) + [ -n "$VIASH_PAR_MIN_ALTERNATE_FRACTION" ] && ViashError Bad arguments for option \'--min_alternate_fraction\': \'$VIASH_PAR_MIN_ALTERNATE_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_FRACTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_alternate_fraction. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_alternate_fraction=*) + [ -n "$VIASH_PAR_MIN_ALTERNATE_FRACTION" ] && ViashError Bad arguments for option \'--min_alternate_fraction=*\': \'$VIASH_PAR_MIN_ALTERNATE_FRACTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_FRACTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_alternate_count) + [ -n "$VIASH_PAR_MIN_ALTERNATE_COUNT" ] && ViashError Bad arguments for option \'--min_alternate_count\': \'$VIASH_PAR_MIN_ALTERNATE_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_alternate_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_alternate_count=*) + [ -n "$VIASH_PAR_MIN_ALTERNATE_COUNT" ] && ViashError Bad arguments for option \'--min_alternate_count=*\': \'$VIASH_PAR_MIN_ALTERNATE_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_alternate_qsum) + [ -n "$VIASH_PAR_MIN_ALTERNATE_QSUM" ] && ViashError Bad arguments for option \'--min_alternate_qsum\': \'$VIASH_PAR_MIN_ALTERNATE_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_QSUM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_alternate_qsum. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_alternate_qsum=*) + [ -n "$VIASH_PAR_MIN_ALTERNATE_QSUM" ] && ViashError Bad arguments for option \'--min_alternate_qsum=*\': \'$VIASH_PAR_MIN_ALTERNATE_QSUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_QSUM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_alternate_total) + [ -n "$VIASH_PAR_MIN_ALTERNATE_TOTAL" ] && ViashError Bad arguments for option \'--min_alternate_total\': \'$VIASH_PAR_MIN_ALTERNATE_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_TOTAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_alternate_total. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_alternate_total=*) + [ -n "$VIASH_PAR_MIN_ALTERNATE_TOTAL" ] && ViashError Bad arguments for option \'--min_alternate_total=*\': \'$VIASH_PAR_MIN_ALTERNATE_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALTERNATE_TOTAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_coverage) + [ -n "$VIASH_PAR_MIN_COVERAGE" ] && ViashError Bad arguments for option \'--min_coverage\': \'$VIASH_PAR_MIN_COVERAGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COVERAGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_coverage. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_coverage=*) + [ -n "$VIASH_PAR_MIN_COVERAGE" ] && ViashError Bad arguments for option \'--min_coverage=*\': \'$VIASH_PAR_MIN_COVERAGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COVERAGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_coverage) + [ -n "$VIASH_PAR_MAX_COVERAGE" ] && ViashError Bad arguments for option \'--max_coverage\': \'$VIASH_PAR_MAX_COVERAGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COVERAGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_coverage. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_coverage=*) + [ -n "$VIASH_PAR_MAX_COVERAGE" ] && ViashError Bad arguments for option \'--max_coverage=*\': \'$VIASH_PAR_MAX_COVERAGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_COVERAGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_population_priors) + [ -n "$VIASH_PAR_NO_POPULATION_PRIORS" ] && ViashError Bad arguments for option \'--no_population_priors\': \'$VIASH_PAR_NO_POPULATION_PRIORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_POPULATION_PRIORS=true + shift 1 + ;; + --hwe_priors_off) + [ -n "$VIASH_PAR_HWE_PRIORS_OFF" ] && ViashError Bad arguments for option \'--hwe_priors_off\': \'$VIASH_PAR_HWE_PRIORS_OFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HWE_PRIORS_OFF=true + shift 1 + ;; + --binomial_obs_priors_off) + [ -n "$VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF" ] && ViashError Bad arguments for option \'--binomial_obs_priors_off\': \'$VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF=true + shift 1 + ;; + --allele_balance_priors_off) + [ -n "$VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF" ] && ViashError Bad arguments for option \'--allele_balance_priors_off\': \'$VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF=true + shift 1 + ;; + --observation_bias) + [ -n "$VIASH_PAR_OBSERVATION_BIAS" ] && ViashError Bad arguments for option \'--observation_bias\': \'$VIASH_PAR_OBSERVATION_BIAS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSERVATION_BIAS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --observation_bias. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --observation_bias=*) + [ -n "$VIASH_PAR_OBSERVATION_BIAS" ] && ViashError Bad arguments for option \'--observation_bias=*\': \'$VIASH_PAR_OBSERVATION_BIAS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSERVATION_BIAS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --base_quality_cap) + [ -n "$VIASH_PAR_BASE_QUALITY_CAP" ] && ViashError Bad arguments for option \'--base_quality_cap\': \'$VIASH_PAR_BASE_QUALITY_CAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE_QUALITY_CAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --base_quality_cap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --base_quality_cap=*) + [ -n "$VIASH_PAR_BASE_QUALITY_CAP" ] && ViashError Bad arguments for option \'--base_quality_cap=*\': \'$VIASH_PAR_BASE_QUALITY_CAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE_QUALITY_CAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --prob_contamination) + [ -n "$VIASH_PAR_PROB_CONTAMINATION" ] && ViashError Bad arguments for option \'--prob_contamination\': \'$VIASH_PAR_PROB_CONTAMINATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROB_CONTAMINATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --prob_contamination. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --prob_contamination=*) + [ -n "$VIASH_PAR_PROB_CONTAMINATION" ] && ViashError Bad arguments for option \'--prob_contamination=*\': \'$VIASH_PAR_PROB_CONTAMINATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROB_CONTAMINATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --legacy_gls) + [ -n "$VIASH_PAR_LEGACY_GLS" ] && ViashError Bad arguments for option \'--legacy_gls\': \'$VIASH_PAR_LEGACY_GLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEGACY_GLS=true + shift 1 + ;; + --contamination_estimates) + [ -n "$VIASH_PAR_CONTAMINATION_ESTIMATES" ] && ViashError Bad arguments for option \'--contamination_estimates\': \'$VIASH_PAR_CONTAMINATION_ESTIMATES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CONTAMINATION_ESTIMATES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --contamination_estimates. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --contamination_estimates=*) + [ -n "$VIASH_PAR_CONTAMINATION_ESTIMATES" ] && ViashError Bad arguments for option \'--contamination_estimates=*\': \'$VIASH_PAR_CONTAMINATION_ESTIMATES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CONTAMINATION_ESTIMATES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --report_genotype_likelihood_max) + [ -n "$VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX" ] && ViashError Bad arguments for option \'--report_genotype_likelihood_max\': \'$VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX=true + shift 1 + ;; + --genotyping_max_iterations) + [ -n "$VIASH_PAR_GENOTYPING_MAX_ITERATIONS" ] && ViashError Bad arguments for option \'--genotyping_max_iterations\': \'$VIASH_PAR_GENOTYPING_MAX_ITERATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPING_MAX_ITERATIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genotyping_max_iterations. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genotyping_max_iterations=*) + [ -n "$VIASH_PAR_GENOTYPING_MAX_ITERATIONS" ] && ViashError Bad arguments for option \'--genotyping_max_iterations=*\': \'$VIASH_PAR_GENOTYPING_MAX_ITERATIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPING_MAX_ITERATIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genotyping_max_banddepth) + [ -n "$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH" ] && ViashError Bad arguments for option \'--genotyping_max_banddepth\': \'$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPING_MAX_BANDDEPTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genotyping_max_banddepth. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genotyping_max_banddepth=*) + [ -n "$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH" ] && ViashError Bad arguments for option \'--genotyping_max_banddepth=*\': \'$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPING_MAX_BANDDEPTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --posterior_integration_limits) + [ -n "$VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS" ] && ViashError Bad arguments for option \'--posterior_integration_limits\': \'$VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --posterior_integration_limits. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --posterior_integration_limits=*) + [ -n "$VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS" ] && ViashError Bad arguments for option \'--posterior_integration_limits=*\': \'$VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exclude_unobserved_genotypes) + [ -n "$VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES" ] && ViashError Bad arguments for option \'--exclude_unobserved_genotypes\': \'$VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES=true + shift 1 + ;; + --genotype_variant_threshold) + [ -n "$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD" ] && ViashError Bad arguments for option \'--genotype_variant_threshold\': \'$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genotype_variant_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genotype_variant_threshold=*) + [ -n "$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD" ] && ViashError Bad arguments for option \'--genotype_variant_threshold=*\': \'$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --use_mapping_quality) + [ -n "$VIASH_PAR_USE_MAPPING_QUALITY" ] && ViashError Bad arguments for option \'--use_mapping_quality\': \'$VIASH_PAR_USE_MAPPING_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_MAPPING_QUALITY=true + shift 1 + ;; + --harmonic_indel_quality) + [ -n "$VIASH_PAR_HARMONIC_INDEL_QUALITY" ] && ViashError Bad arguments for option \'--harmonic_indel_quality\': \'$VIASH_PAR_HARMONIC_INDEL_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HARMONIC_INDEL_QUALITY=true + shift 1 + ;; + --read_dependence_factor) + [ -n "$VIASH_PAR_READ_DEPENDENCE_FACTOR" ] && ViashError Bad arguments for option \'--read_dependence_factor\': \'$VIASH_PAR_READ_DEPENDENCE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_DEPENDENCE_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --read_dependence_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --read_dependence_factor=*) + [ -n "$VIASH_PAR_READ_DEPENDENCE_FACTOR" ] && ViashError Bad arguments for option \'--read_dependence_factor=*\': \'$VIASH_PAR_READ_DEPENDENCE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READ_DEPENDENCE_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genotype_qualities) + [ -n "$VIASH_PAR_GENOTYPE_QUALITIES" ] && ViashError Bad arguments for option \'--genotype_qualities\': \'$VIASH_PAR_GENOTYPE_QUALITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOTYPE_QUALITIES=true + shift 1 + ;; + --debug) + [ -n "$VIASH_PAR_DEBUG" ] && ViashError Bad arguments for option \'--debug\': \'$VIASH_PAR_DEBUG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DEBUG=true + shift 1 + ;; + --dd) + [ -n "$VIASH_PAR_DD" ] && ViashError Bad arguments for option \'--dd\': \'$VIASH_PAR_DD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DD=true + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf=*) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf=*\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/freebayes:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_STDIN+x} ]; then + VIASH_PAR_STDIN="false" +fi +if [ -z ${VIASH_PAR_GVCF+x} ]; then + VIASH_PAR_GVCF="false" +fi +if [ -z ${VIASH_PAR_ONLY_USE_INPUT_ALLELES+x} ]; then + VIASH_PAR_ONLY_USE_INPUT_ALLELES="false" +fi +if [ -z ${VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES+x} ]; then + VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES="false" +fi +if [ -z ${VIASH_PAR_REPORT_MONOMORPHIC+x} ]; then + VIASH_PAR_REPORT_MONOMORPHIC="false" +fi +if [ -z ${VIASH_PAR_PVAR+x} ]; then + VIASH_PAR_PVAR="0.0" +fi +if [ -z ${VIASH_PAR_STRICT_VCF+x} ]; then + VIASH_PAR_STRICT_VCF="false" +fi +if [ -z ${VIASH_PAR_THETA+x} ]; then + VIASH_PAR_THETA="0.001" +fi +if [ -z ${VIASH_PAR_PLOIDY+x} ]; then + VIASH_PAR_PLOIDY="2" +fi +if [ -z ${VIASH_PAR_POOLED_DISCRETE+x} ]; then + VIASH_PAR_POOLED_DISCRETE="false" +fi +if [ -z ${VIASH_PAR_POOLED_CONTINUOUS+x} ]; then + VIASH_PAR_POOLED_CONTINUOUS="false" +fi +if [ -z ${VIASH_PAR_USE_REFERENCE_ALLELE+x} ]; then + VIASH_PAR_USE_REFERENCE_ALLELE="false" +fi +if [ -z ${VIASH_PAR_REFERENCE_QUALITY+x} ]; then + VIASH_PAR_REFERENCE_QUALITY="100,60" +fi +if [ -z ${VIASH_PAR_THROW_AWAY_SNP_OBS+x} ]; then + VIASH_PAR_THROW_AWAY_SNP_OBS="false" +fi +if [ -z ${VIASH_PAR_THROW_AWAY_MNPS_OBS+x} ]; then + VIASH_PAR_THROW_AWAY_MNPS_OBS="true" +fi +if [ -z ${VIASH_PAR_THROW_AWAY_INDEL_OBS+x} ]; then + VIASH_PAR_THROW_AWAY_INDEL_OBS="true" +fi +if [ -z ${VIASH_PAR_THROW_AWAY_COMPLEX_OBS+x} ]; then + VIASH_PAR_THROW_AWAY_COMPLEX_OBS="true" +fi +if [ -z ${VIASH_PAR_USE_BEST_N_ALLELES+x} ]; then + VIASH_PAR_USE_BEST_N_ALLELES="0" +fi +if [ -z ${VIASH_PAR_MAX_COMPLEX_GAP+x} ]; then + VIASH_PAR_MAX_COMPLEX_GAP="3" +fi +if [ -z ${VIASH_PAR_MIN_REPEAT_SIZE+x} ]; then + VIASH_PAR_MIN_REPEAT_SIZE="5" +fi +if [ -z ${VIASH_PAR_MIN_REPEAT_ENTROPY+x} ]; then + VIASH_PAR_MIN_REPEAT_ENTROPY="1" +fi +if [ -z ${VIASH_PAR_NO_PARTIAL_OBSERVATIONS+x} ]; then + VIASH_PAR_NO_PARTIAL_OBSERVATIONS="false" +fi +if [ -z ${VIASH_PAR_DONT_LEFT_ALIGN_INDELS+x} ]; then + VIASH_PAR_DONT_LEFT_ALIGN_INDELS="false" +fi +if [ -z ${VIASH_PAR_USE_DUPLICATE_READS+x} ]; then + VIASH_PAR_USE_DUPLICATE_READS="false" +fi +if [ -z ${VIASH_PAR_MIN_MAPPING_QUALITY+x} ]; then + VIASH_PAR_MIN_MAPPING_QUALITY="1" +fi +if [ -z ${VIASH_PAR_MIN_BASE_QUALITY+x} ]; then + VIASH_PAR_MIN_BASE_QUALITY="1" +fi +if [ -z ${VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM+x} ]; then + VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM="0" +fi +if [ -z ${VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM+x} ]; then + VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM="0" +fi +if [ -z ${VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD+x} ]; then + VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD="10" +fi +if [ -z ${VIASH_PAR_READ_MAX_MISMATCH_FRACTION+x} ]; then + VIASH_PAR_READ_MAX_MISMATCH_FRACTION="1.0" +fi +if [ -z ${VIASH_PAR_STANDARD_FILTERS+x} ]; then + VIASH_PAR_STANDARD_FILTERS="false" +fi +if [ -z ${VIASH_PAR_MIN_ALTERNATE_FRACTION+x} ]; then + VIASH_PAR_MIN_ALTERNATE_FRACTION="0.05" +fi +if [ -z ${VIASH_PAR_MIN_ALTERNATE_COUNT+x} ]; then + VIASH_PAR_MIN_ALTERNATE_COUNT="2" +fi +if [ -z ${VIASH_PAR_MIN_ALTERNATE_QSUM+x} ]; then + VIASH_PAR_MIN_ALTERNATE_QSUM="0" +fi +if [ -z ${VIASH_PAR_MIN_ALTERNATE_TOTAL+x} ]; then + VIASH_PAR_MIN_ALTERNATE_TOTAL="1" +fi +if [ -z ${VIASH_PAR_MIN_COVERAGE+x} ]; then + VIASH_PAR_MIN_COVERAGE="0" +fi +if [ -z ${VIASH_PAR_NO_POPULATION_PRIORS+x} ]; then + VIASH_PAR_NO_POPULATION_PRIORS="false" +fi +if [ -z ${VIASH_PAR_HWE_PRIORS_OFF+x} ]; then + VIASH_PAR_HWE_PRIORS_OFF="false" +fi +if [ -z ${VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF+x} ]; then + VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF="false" +fi +if [ -z ${VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF+x} ]; then + VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF="false" +fi +if [ -z ${VIASH_PAR_PROB_CONTAMINATION+x} ]; then + VIASH_PAR_PROB_CONTAMINATION="1.0E-8" +fi +if [ -z ${VIASH_PAR_LEGACY_GLS+x} ]; then + VIASH_PAR_LEGACY_GLS="false" +fi +if [ -z ${VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX+x} ]; then + VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX="false" +fi +if [ -z ${VIASH_PAR_GENOTYPING_MAX_ITERATIONS+x} ]; then + VIASH_PAR_GENOTYPING_MAX_ITERATIONS="1000" +fi +if [ -z ${VIASH_PAR_GENOTYPING_MAX_BANDDEPTH+x} ]; then + VIASH_PAR_GENOTYPING_MAX_BANDDEPTH="6" +fi +if [ -z ${VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS+x} ]; then + VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS="1,3" +fi +if [ -z ${VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES+x} ]; then + VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES="false" +fi +if [ -z ${VIASH_PAR_USE_MAPPING_QUALITY+x} ]; then + VIASH_PAR_USE_MAPPING_QUALITY="false" +fi +if [ -z ${VIASH_PAR_HARMONIC_INDEL_QUALITY+x} ]; then + VIASH_PAR_HARMONIC_INDEL_QUALITY="false" +fi +if [ -z ${VIASH_PAR_READ_DEPENDENCE_FACTOR+x} ]; then + VIASH_PAR_READ_DEPENDENCE_FACTOR="0.9" +fi +if [ -z ${VIASH_PAR_GENOTYPE_QUALITIES+x} ]; then + VIASH_PAR_GENOTYPE_QUALITIES="false" +fi +if [ -z ${VIASH_PAR_DEBUG+x} ]; then + VIASH_PAR_DEBUG="false" +fi +if [ -z ${VIASH_PAR_DD+x} ]; then + VIASH_PAR_DD="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Input file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM_LIST" ] && [ ! -e "$VIASH_PAR_BAM_LIST" ]; then + ViashError "Input file '$VIASH_PAR_BAM_LIST' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_FASTA_REFERENCE" ] && [ ! -e "$VIASH_PAR_FASTA_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_FASTA_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_FASTA_REFERENCE_INDEX" ] && [ ! -e "$VIASH_PAR_FASTA_REFERENCE_INDEX" ]; then + ViashError "Input file '$VIASH_PAR_FASTA_REFERENCE_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TARGETS" ] && [ ! -e "$VIASH_PAR_TARGETS" ]; then + ViashError "Input file '$VIASH_PAR_TARGETS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLES" ] && [ ! -e "$VIASH_PAR_SAMPLES" ]; then + ViashError "Input file '$VIASH_PAR_SAMPLES' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_POPULATIONS" ] && [ ! -e "$VIASH_PAR_POPULATIONS" ]; then + ViashError "Input file '$VIASH_PAR_POPULATIONS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CNV_MAP" ] && [ ! -e "$VIASH_PAR_CNV_MAP" ]; then + ViashError "Input file '$VIASH_PAR_CNV_MAP' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VARIANT_INPUT" ] && [ ! -e "$VIASH_PAR_VARIANT_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_VARIANT_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ] && [ ! -e "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ]; then + ViashError "Input file '$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OBSERVATION_BIAS" ] && [ ! -e "$VIASH_PAR_OBSERVATION_BIAS" ]; then + ViashError "Input file '$VIASH_PAR_OBSERVATION_BIAS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CONTAMINATION_ESTIMATES" ] && [ ! -e "$VIASH_PAR_CONTAMINATION_ESTIMATES" ]; then + ViashError "Input file '$VIASH_PAR_CONTAMINATION_ESTIMATES' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_STDIN" ]]; then + if ! [[ "$VIASH_PAR_STDIN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--stdin' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GVCF" ]]; then + if ! [[ "$VIASH_PAR_GVCF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--gvcf' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GVCF_CHUNK" ]]; then + if ! [[ "$VIASH_PAR_GVCF_CHUNK" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gvcf_chunk' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ONLY_USE_INPUT_ALLELES" ]]; then + if ! [[ "$VIASH_PAR_ONLY_USE_INPUT_ALLELES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--only_use_input_alleles' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES" ]]; then + if ! [[ "$VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--report_all_haplotype_alleles' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REPORT_MONOMORPHIC" ]]; then + if ! [[ "$VIASH_PAR_REPORT_MONOMORPHIC" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--report_monomorphic' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PVAR" ]]; then + if ! [[ "$VIASH_PAR_PVAR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--pvar' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_STRICT_VCF" ]]; then + if ! [[ "$VIASH_PAR_STRICT_VCF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--strict_vcf' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THETA" ]]; then + if ! [[ "$VIASH_PAR_THETA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--theta' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PLOIDY" ]]; then + if ! [[ "$VIASH_PAR_PLOIDY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--ploidy' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_POOLED_DISCRETE" ]]; then + if ! [[ "$VIASH_PAR_POOLED_DISCRETE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--pooled_discrete' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_POOLED_CONTINUOUS" ]]; then + if ! [[ "$VIASH_PAR_POOLED_CONTINUOUS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--pooled_continuous' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_REFERENCE_ALLELE" ]]; then + if ! [[ "$VIASH_PAR_USE_REFERENCE_ALLELE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_reference_allele' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THROW_AWAY_SNP_OBS" ]]; then + if ! [[ "$VIASH_PAR_THROW_AWAY_SNP_OBS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--throw_away_snp_obs' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THROW_AWAY_MNPS_OBS" ]]; then + if ! [[ "$VIASH_PAR_THROW_AWAY_MNPS_OBS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--throw_away_mnps_obs' has to be a boolean_false. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THROW_AWAY_INDEL_OBS" ]]; then + if ! [[ "$VIASH_PAR_THROW_AWAY_INDEL_OBS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--throw_away_indel_obs' has to be a boolean_false. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_THROW_AWAY_COMPLEX_OBS" ]]; then + if ! [[ "$VIASH_PAR_THROW_AWAY_COMPLEX_OBS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--throw_away_complex_obs' has to be a boolean_false. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_BEST_N_ALLELES" ]]; then + if ! [[ "$VIASH_PAR_USE_BEST_N_ALLELES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--use_best_n_alleles' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_COMPLEX_GAP" ]]; then + if ! [[ "$VIASH_PAR_MAX_COMPLEX_GAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_complex_gap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_REPEAT_SIZE" ]]; then + if ! [[ "$VIASH_PAR_MIN_REPEAT_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_repeat_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_REPEAT_ENTROPY" ]]; then + if ! [[ "$VIASH_PAR_MIN_REPEAT_ENTROPY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_repeat_entropy' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_PARTIAL_OBSERVATIONS" ]]; then + if ! [[ "$VIASH_PAR_NO_PARTIAL_OBSERVATIONS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_partial_observations' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DONT_LEFT_ALIGN_INDELS" ]]; then + if ! [[ "$VIASH_PAR_DONT_LEFT_ALIGN_INDELS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dont_left_align_indels' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_DUPLICATE_READS" ]]; then + if ! [[ "$VIASH_PAR_USE_DUPLICATE_READS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_duplicate_reads' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_MAPPING_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_MIN_MAPPING_QUALITY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_mapping_quality' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_BASE_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_MIN_BASE_QUALITY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_base_quality' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM" ]]; then + if ! [[ "$VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_supporting_allele_qsum' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM" ]]; then + if ! [[ "$VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_supporting_mapping_qsum' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--mismatch_base_quality_threshold' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READ_MAX_MISMATCH_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_READ_MAX_MISMATCH_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--read_max_mismatch_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READ_MISMATCH_LIMIT" ]]; then + if ! [[ "$VIASH_PAR_READ_MISMATCH_LIMIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--read_mismatch_limit' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READ_SNP_LIMIT" ]]; then + if ! [[ "$VIASH_PAR_READ_SNP_LIMIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--read_snp_limit' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READ_INDEL_LIMIT" ]]; then + if ! [[ "$VIASH_PAR_READ_INDEL_LIMIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--read_indel_limit' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_STANDARD_FILTERS" ]]; then + if ! [[ "$VIASH_PAR_STANDARD_FILTERS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--standard_filters' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ALTERNATE_FRACTION" ]]; then + if ! [[ "$VIASH_PAR_MIN_ALTERNATE_FRACTION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_alternate_fraction' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ALTERNATE_COUNT" ]]; then + if ! [[ "$VIASH_PAR_MIN_ALTERNATE_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_alternate_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ALTERNATE_QSUM" ]]; then + if ! [[ "$VIASH_PAR_MIN_ALTERNATE_QSUM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_alternate_qsum' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ALTERNATE_TOTAL" ]]; then + if ! [[ "$VIASH_PAR_MIN_ALTERNATE_TOTAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_alternate_total' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_COVERAGE" ]]; then + if ! [[ "$VIASH_PAR_MIN_COVERAGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_coverage' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_COVERAGE" ]]; then + if ! [[ "$VIASH_PAR_MAX_COVERAGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_coverage' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_POPULATION_PRIORS" ]]; then + if ! [[ "$VIASH_PAR_NO_POPULATION_PRIORS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_population_priors' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_HWE_PRIORS_OFF" ]]; then + if ! [[ "$VIASH_PAR_HWE_PRIORS_OFF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--hwe_priors_off' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF" ]]; then + if ! [[ "$VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--binomial_obs_priors_off' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF" ]]; then + if ! [[ "$VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--allele_balance_priors_off' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BASE_QUALITY_CAP" ]]; then + if ! [[ "$VIASH_PAR_BASE_QUALITY_CAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--base_quality_cap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PROB_CONTAMINATION" ]]; then + if ! [[ "$VIASH_PAR_PROB_CONTAMINATION" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--prob_contamination' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEGACY_GLS" ]]; then + if ! [[ "$VIASH_PAR_LEGACY_GLS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--legacy_gls' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX" ]]; then + if ! [[ "$VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--report_genotype_likelihood_max' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENOTYPING_MAX_ITERATIONS" ]]; then + if ! [[ "$VIASH_PAR_GENOTYPING_MAX_ITERATIONS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genotyping_max_iterations' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH" ]]; then + if ! [[ "$VIASH_PAR_GENOTYPING_MAX_BANDDEPTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genotyping_max_banddepth' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES" ]]; then + if ! [[ "$VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--exclude_unobserved_genotypes' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genotype_variant_threshold' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_MAPPING_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_USE_MAPPING_QUALITY" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_mapping_quality' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_HARMONIC_INDEL_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_HARMONIC_INDEL_QUALITY" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--harmonic_indel_quality' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READ_DEPENDENCE_FACTOR" ]]; then + if ! [[ "$VIASH_PAR_READ_DEPENDENCE_FACTOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--read_dependence_factor' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENOTYPE_QUALITIES" ]]; then + if ! [[ "$VIASH_PAR_GENOTYPE_QUALITIES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--genotype_qualities' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DEBUG" ]]; then + if ! [[ "$VIASH_PAR_DEBUG" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--debug' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DD" ]]; then + if ! [[ "$VIASH_PAR_DD" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dd' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") +fi +if [ ! -z "$VIASH_PAR_BAM_LIST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM_LIST")" ) + VIASH_PAR_BAM_LIST=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM_LIST") +fi +if [ ! -z "$VIASH_PAR_FASTA_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FASTA_REFERENCE")" ) + VIASH_PAR_FASTA_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_FASTA_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_FASTA_REFERENCE_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FASTA_REFERENCE_INDEX")" ) + VIASH_PAR_FASTA_REFERENCE_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_FASTA_REFERENCE_INDEX") +fi +if [ ! -z "$VIASH_PAR_TARGETS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TARGETS")" ) + VIASH_PAR_TARGETS=$(ViashDockerAutodetectMount "$VIASH_PAR_TARGETS") +fi +if [ ! -z "$VIASH_PAR_SAMPLES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLES")" ) + VIASH_PAR_SAMPLES=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLES") +fi +if [ ! -z "$VIASH_PAR_POPULATIONS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_POPULATIONS")" ) + VIASH_PAR_POPULATIONS=$(ViashDockerAutodetectMount "$VIASH_PAR_POPULATIONS") +fi +if [ ! -z "$VIASH_PAR_CNV_MAP" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CNV_MAP")" ) + VIASH_PAR_CNV_MAP=$(ViashDockerAutodetectMount "$VIASH_PAR_CNV_MAP") +fi +if [ ! -z "$VIASH_PAR_VARIANT_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VARIANT_INPUT")" ) + VIASH_PAR_VARIANT_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_VARIANT_INPUT") +fi +if [ ! -z "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES")" ) + VIASH_PAR_HAPLOTYPE_BASIS_ALLELES=$(ViashDockerAutodetectMount "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES") +fi +if [ ! -z "$VIASH_PAR_OBSERVATION_BIAS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OBSERVATION_BIAS")" ) + VIASH_PAR_OBSERVATION_BIAS=$(ViashDockerAutodetectMount "$VIASH_PAR_OBSERVATION_BIAS") +fi +if [ ! -z "$VIASH_PAR_CONTAMINATION_ESTIMATES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CONTAMINATION_ESTIMATES")" ) + VIASH_PAR_CONTAMINATION_ESTIMATES=$(ViashDockerAutodetectMount "$VIASH_PAR_CONTAMINATION_ESTIMATES") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-freebayes-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM_LIST+x} ]; then echo "${VIASH_PAR_BAM_LIST}" | sed "s#'#'\"'\"'#g;s#.*#par_bam_list='&'#" ; else echo "# par_bam_list="; fi ) +$( if [ ! -z ${VIASH_PAR_STDIN+x} ]; then echo "${VIASH_PAR_STDIN}" | sed "s#'#'\"'\"'#g;s#.*#par_stdin='&'#" ; else echo "# par_stdin="; fi ) +$( if [ ! -z ${VIASH_PAR_FASTA_REFERENCE+x} ]; then echo "${VIASH_PAR_FASTA_REFERENCE}" | sed "s#'#'\"'\"'#g;s#.*#par_fasta_reference='&'#" ; else echo "# par_fasta_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_FASTA_REFERENCE_INDEX+x} ]; then echo "${VIASH_PAR_FASTA_REFERENCE_INDEX}" | sed "s#'#'\"'\"'#g;s#.*#par_fasta_reference_index='&'#" ; else echo "# par_fasta_reference_index="; fi ) +$( if [ ! -z ${VIASH_PAR_TARGETS+x} ]; then echo "${VIASH_PAR_TARGETS}" | sed "s#'#'\"'\"'#g;s#.*#par_targets='&'#" ; else echo "# par_targets="; fi ) +$( if [ ! -z ${VIASH_PAR_REGION+x} ]; then echo "${VIASH_PAR_REGION}" | sed "s#'#'\"'\"'#g;s#.*#par_region='&'#" ; else echo "# par_region="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLES+x} ]; then echo "${VIASH_PAR_SAMPLES}" | sed "s#'#'\"'\"'#g;s#.*#par_samples='&'#" ; else echo "# par_samples="; fi ) +$( if [ ! -z ${VIASH_PAR_POPULATIONS+x} ]; then echo "${VIASH_PAR_POPULATIONS}" | sed "s#'#'\"'\"'#g;s#.*#par_populations='&'#" ; else echo "# par_populations="; fi ) +$( if [ ! -z ${VIASH_PAR_CNV_MAP+x} ]; then echo "${VIASH_PAR_CNV_MAP}" | sed "s#'#'\"'\"'#g;s#.*#par_cnv_map='&'#" ; else echo "# par_cnv_map="; fi ) +$( if [ ! -z ${VIASH_PAR_GVCF+x} ]; then echo "${VIASH_PAR_GVCF}" | sed "s#'#'\"'\"'#g;s#.*#par_gvcf='&'#" ; else echo "# par_gvcf="; fi ) +$( if [ ! -z ${VIASH_PAR_GVCF_CHUNK+x} ]; then echo "${VIASH_PAR_GVCF_CHUNK}" | sed "s#'#'\"'\"'#g;s#.*#par_gvcf_chunk='&'#" ; else echo "# par_gvcf_chunk="; fi ) +$( if [ ! -z ${VIASH_PAR_VARIANT_INPUT+x} ]; then echo "${VIASH_PAR_VARIANT_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_variant_input='&'#" ; else echo "# par_variant_input="; fi ) +$( if [ ! -z ${VIASH_PAR_ONLY_USE_INPUT_ALLELES+x} ]; then echo "${VIASH_PAR_ONLY_USE_INPUT_ALLELES}" | sed "s#'#'\"'\"'#g;s#.*#par_only_use_input_alleles='&'#" ; else echo "# par_only_use_input_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_HAPLOTYPE_BASIS_ALLELES+x} ]; then echo "${VIASH_PAR_HAPLOTYPE_BASIS_ALLELES}" | sed "s#'#'\"'\"'#g;s#.*#par_haplotype_basis_alleles='&'#" ; else echo "# par_haplotype_basis_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES+x} ]; then echo "${VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES}" | sed "s#'#'\"'\"'#g;s#.*#par_report_all_haplotype_alleles='&'#" ; else echo "# par_report_all_haplotype_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_MONOMORPHIC+x} ]; then echo "${VIASH_PAR_REPORT_MONOMORPHIC}" | sed "s#'#'\"'\"'#g;s#.*#par_report_monomorphic='&'#" ; else echo "# par_report_monomorphic="; fi ) +$( if [ ! -z ${VIASH_PAR_PVAR+x} ]; then echo "${VIASH_PAR_PVAR}" | sed "s#'#'\"'\"'#g;s#.*#par_pvar='&'#" ; else echo "# par_pvar="; fi ) +$( if [ ! -z ${VIASH_PAR_STRICT_VCF+x} ]; then echo "${VIASH_PAR_STRICT_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_strict_vcf='&'#" ; else echo "# par_strict_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_THETA+x} ]; then echo "${VIASH_PAR_THETA}" | sed "s#'#'\"'\"'#g;s#.*#par_theta='&'#" ; else echo "# par_theta="; fi ) +$( if [ ! -z ${VIASH_PAR_PLOIDY+x} ]; then echo "${VIASH_PAR_PLOIDY}" | sed "s#'#'\"'\"'#g;s#.*#par_ploidy='&'#" ; else echo "# par_ploidy="; fi ) +$( if [ ! -z ${VIASH_PAR_POOLED_DISCRETE+x} ]; then echo "${VIASH_PAR_POOLED_DISCRETE}" | sed "s#'#'\"'\"'#g;s#.*#par_pooled_discrete='&'#" ; else echo "# par_pooled_discrete="; fi ) +$( if [ ! -z ${VIASH_PAR_POOLED_CONTINUOUS+x} ]; then echo "${VIASH_PAR_POOLED_CONTINUOUS}" | sed "s#'#'\"'\"'#g;s#.*#par_pooled_continuous='&'#" ; else echo "# par_pooled_continuous="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_REFERENCE_ALLELE+x} ]; then echo "${VIASH_PAR_USE_REFERENCE_ALLELE}" | sed "s#'#'\"'\"'#g;s#.*#par_use_reference_allele='&'#" ; else echo "# par_use_reference_allele="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE_QUALITY+x} ]; then echo "${VIASH_PAR_REFERENCE_QUALITY}" | sed "s#'#'\"'\"'#g;s#.*#par_reference_quality='&'#" ; else echo "# par_reference_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_SNP_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_SNP_OBS}" | sed "s#'#'\"'\"'#g;s#.*#par_throw_away_snp_obs='&'#" ; else echo "# par_throw_away_snp_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_MNPS_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_MNPS_OBS}" | sed "s#'#'\"'\"'#g;s#.*#par_throw_away_mnps_obs='&'#" ; else echo "# par_throw_away_mnps_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_INDEL_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_INDEL_OBS}" | sed "s#'#'\"'\"'#g;s#.*#par_throw_away_indel_obs='&'#" ; else echo "# par_throw_away_indel_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_COMPLEX_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_COMPLEX_OBS}" | sed "s#'#'\"'\"'#g;s#.*#par_throw_away_complex_obs='&'#" ; else echo "# par_throw_away_complex_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_BEST_N_ALLELES+x} ]; then echo "${VIASH_PAR_USE_BEST_N_ALLELES}" | sed "s#'#'\"'\"'#g;s#.*#par_use_best_n_alleles='&'#" ; else echo "# par_use_best_n_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_COMPLEX_GAP+x} ]; then echo "${VIASH_PAR_MAX_COMPLEX_GAP}" | sed "s#'#'\"'\"'#g;s#.*#par_max_complex_gap='&'#" ; else echo "# par_max_complex_gap="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REPEAT_SIZE+x} ]; then echo "${VIASH_PAR_MIN_REPEAT_SIZE}" | sed "s#'#'\"'\"'#g;s#.*#par_min_repeat_size='&'#" ; else echo "# par_min_repeat_size="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REPEAT_ENTROPY+x} ]; then echo "${VIASH_PAR_MIN_REPEAT_ENTROPY}" | sed "s#'#'\"'\"'#g;s#.*#par_min_repeat_entropy='&'#" ; else echo "# par_min_repeat_entropy="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_PARTIAL_OBSERVATIONS+x} ]; then echo "${VIASH_PAR_NO_PARTIAL_OBSERVATIONS}" | sed "s#'#'\"'\"'#g;s#.*#par_no_partial_observations='&'#" ; else echo "# par_no_partial_observations="; fi ) +$( if [ ! -z ${VIASH_PAR_DONT_LEFT_ALIGN_INDELS+x} ]; then echo "${VIASH_PAR_DONT_LEFT_ALIGN_INDELS}" | sed "s#'#'\"'\"'#g;s#.*#par_dont_left_align_indels='&'#" ; else echo "# par_dont_left_align_indels="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_DUPLICATE_READS+x} ]; then echo "${VIASH_PAR_USE_DUPLICATE_READS}" | sed "s#'#'\"'\"'#g;s#.*#par_use_duplicate_reads='&'#" ; else echo "# par_use_duplicate_reads="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAPPING_QUALITY+x} ]; then echo "${VIASH_PAR_MIN_MAPPING_QUALITY}" | sed "s#'#'\"'\"'#g;s#.*#par_min_mapping_quality='&'#" ; else echo "# par_min_mapping_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_BASE_QUALITY+x} ]; then echo "${VIASH_PAR_MIN_BASE_QUALITY}" | sed "s#'#'\"'\"'#g;s#.*#par_min_base_quality='&'#" ; else echo "# par_min_base_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM+x} ]; then echo "${VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM}" | sed "s#'#'\"'\"'#g;s#.*#par_min_supporting_allele_qsum='&'#" ; else echo "# par_min_supporting_allele_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM+x} ]; then echo "${VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM}" | sed "s#'#'\"'\"'#g;s#.*#par_min_supporting_mapping_qsum='&'#" ; else echo "# par_min_supporting_mapping_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD+x} ]; then echo "${VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD}" | sed "s#'#'\"'\"'#g;s#.*#par_mismatch_base_quality_threshold='&'#" ; else echo "# par_mismatch_base_quality_threshold="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_MAX_MISMATCH_FRACTION+x} ]; then echo "${VIASH_PAR_READ_MAX_MISMATCH_FRACTION}" | sed "s#'#'\"'\"'#g;s#.*#par_read_max_mismatch_fraction='&'#" ; else echo "# par_read_max_mismatch_fraction="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_MISMATCH_LIMIT+x} ]; then echo "${VIASH_PAR_READ_MISMATCH_LIMIT}" | sed "s#'#'\"'\"'#g;s#.*#par_read_mismatch_limit='&'#" ; else echo "# par_read_mismatch_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_SNP_LIMIT+x} ]; then echo "${VIASH_PAR_READ_SNP_LIMIT}" | sed "s#'#'\"'\"'#g;s#.*#par_read_snp_limit='&'#" ; else echo "# par_read_snp_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_INDEL_LIMIT+x} ]; then echo "${VIASH_PAR_READ_INDEL_LIMIT}" | sed "s#'#'\"'\"'#g;s#.*#par_read_indel_limit='&'#" ; else echo "# par_read_indel_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_STANDARD_FILTERS+x} ]; then echo "${VIASH_PAR_STANDARD_FILTERS}" | sed "s#'#'\"'\"'#g;s#.*#par_standard_filters='&'#" ; else echo "# par_standard_filters="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_FRACTION+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_FRACTION}" | sed "s#'#'\"'\"'#g;s#.*#par_min_alternate_fraction='&'#" ; else echo "# par_min_alternate_fraction="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_COUNT+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_COUNT}" | sed "s#'#'\"'\"'#g;s#.*#par_min_alternate_count='&'#" ; else echo "# par_min_alternate_count="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_QSUM+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_QSUM}" | sed "s#'#'\"'\"'#g;s#.*#par_min_alternate_qsum='&'#" ; else echo "# par_min_alternate_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_TOTAL+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_TOTAL}" | sed "s#'#'\"'\"'#g;s#.*#par_min_alternate_total='&'#" ; else echo "# par_min_alternate_total="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_COVERAGE+x} ]; then echo "${VIASH_PAR_MIN_COVERAGE}" | sed "s#'#'\"'\"'#g;s#.*#par_min_coverage='&'#" ; else echo "# par_min_coverage="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_COVERAGE+x} ]; then echo "${VIASH_PAR_MAX_COVERAGE}" | sed "s#'#'\"'\"'#g;s#.*#par_max_coverage='&'#" ; else echo "# par_max_coverage="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_POPULATION_PRIORS+x} ]; then echo "${VIASH_PAR_NO_POPULATION_PRIORS}" | sed "s#'#'\"'\"'#g;s#.*#par_no_population_priors='&'#" ; else echo "# par_no_population_priors="; fi ) +$( if [ ! -z ${VIASH_PAR_HWE_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_HWE_PRIORS_OFF}" | sed "s#'#'\"'\"'#g;s#.*#par_hwe_priors_off='&'#" ; else echo "# par_hwe_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF}" | sed "s#'#'\"'\"'#g;s#.*#par_binomial_obs_priors_off='&'#" ; else echo "# par_binomial_obs_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF}" | sed "s#'#'\"'\"'#g;s#.*#par_allele_balance_priors_off='&'#" ; else echo "# par_allele_balance_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_OBSERVATION_BIAS+x} ]; then echo "${VIASH_PAR_OBSERVATION_BIAS}" | sed "s#'#'\"'\"'#g;s#.*#par_observation_bias='&'#" ; else echo "# par_observation_bias="; fi ) +$( if [ ! -z ${VIASH_PAR_BASE_QUALITY_CAP+x} ]; then echo "${VIASH_PAR_BASE_QUALITY_CAP}" | sed "s#'#'\"'\"'#g;s#.*#par_base_quality_cap='&'#" ; else echo "# par_base_quality_cap="; fi ) +$( if [ ! -z ${VIASH_PAR_PROB_CONTAMINATION+x} ]; then echo "${VIASH_PAR_PROB_CONTAMINATION}" | sed "s#'#'\"'\"'#g;s#.*#par_prob_contamination='&'#" ; else echo "# par_prob_contamination="; fi ) +$( if [ ! -z ${VIASH_PAR_LEGACY_GLS+x} ]; then echo "${VIASH_PAR_LEGACY_GLS}" | sed "s#'#'\"'\"'#g;s#.*#par_legacy_gls='&'#" ; else echo "# par_legacy_gls="; fi ) +$( if [ ! -z ${VIASH_PAR_CONTAMINATION_ESTIMATES+x} ]; then echo "${VIASH_PAR_CONTAMINATION_ESTIMATES}" | sed "s#'#'\"'\"'#g;s#.*#par_contamination_estimates='&'#" ; else echo "# par_contamination_estimates="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX+x} ]; then echo "${VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX}" | sed "s#'#'\"'\"'#g;s#.*#par_report_genotype_likelihood_max='&'#" ; else echo "# par_report_genotype_likelihood_max="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPING_MAX_ITERATIONS+x} ]; then echo "${VIASH_PAR_GENOTYPING_MAX_ITERATIONS}" | sed "s#'#'\"'\"'#g;s#.*#par_genotyping_max_iterations='&'#" ; else echo "# par_genotyping_max_iterations="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPING_MAX_BANDDEPTH+x} ]; then echo "${VIASH_PAR_GENOTYPING_MAX_BANDDEPTH}" | sed "s#'#'\"'\"'#g;s#.*#par_genotyping_max_banddepth='&'#" ; else echo "# par_genotyping_max_banddepth="; fi ) +$( if [ ! -z ${VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS+x} ]; then echo "${VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS}" | sed "s#'#'\"'\"'#g;s#.*#par_posterior_integration_limits='&'#" ; else echo "# par_posterior_integration_limits="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES+x} ]; then echo "${VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES}" | sed "s#'#'\"'\"'#g;s#.*#par_exclude_unobserved_genotypes='&'#" ; else echo "# par_exclude_unobserved_genotypes="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD+x} ]; then echo "${VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD}" | sed "s#'#'\"'\"'#g;s#.*#par_genotype_variant_threshold='&'#" ; else echo "# par_genotype_variant_threshold="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_MAPPING_QUALITY+x} ]; then echo "${VIASH_PAR_USE_MAPPING_QUALITY}" | sed "s#'#'\"'\"'#g;s#.*#par_use_mapping_quality='&'#" ; else echo "# par_use_mapping_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_HARMONIC_INDEL_QUALITY+x} ]; then echo "${VIASH_PAR_HARMONIC_INDEL_QUALITY}" | sed "s#'#'\"'\"'#g;s#.*#par_harmonic_indel_quality='&'#" ; else echo "# par_harmonic_indel_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_DEPENDENCE_FACTOR+x} ]; then echo "${VIASH_PAR_READ_DEPENDENCE_FACTOR}" | sed "s#'#'\"'\"'#g;s#.*#par_read_dependence_factor='&'#" ; else echo "# par_read_dependence_factor="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE_QUALITIES+x} ]; then echo "${VIASH_PAR_GENOTYPE_QUALITIES}" | sed "s#'#'\"'\"'#g;s#.*#par_genotype_qualities='&'#" ; else echo "# par_genotype_qualities="; fi ) +$( if [ ! -z ${VIASH_PAR_DEBUG+x} ]; then echo "${VIASH_PAR_DEBUG}" | sed "s#'#'\"'\"'#g;s#.*#par_debug='&'#" ; else echo "# par_debug="; fi ) +$( if [ ! -z ${VIASH_PAR_DD+x} ]; then echo "${VIASH_PAR_DD}" | sed "s#'#'\"'\"'#g;s#.*#par_dd='&'#" ; else echo "# par_dd="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +set -eo pipefail + +# Unset boolean flags if their values are not 'true' +for flag in par_stdin par_gvcf par_only_use_input_alleles par_report_all_haplotype_alleles par_report_monomorphic par_strict_vcf \\ + par_pooled_discrete par_pooled_continuous par_use_reference_allele par_throw_away_snp_obs par_throw_away_indel_obs par_throw_away_mnps_obs par_throw_away_complex_obs \\ + par_no_partial_observations par_dont_left_align_indels par_use_duplicate_reads par_standard_filters par_no_population_priors \\ + par_hwe_priors_off par_binomial_obs_priors_off par_allele_balance_priors_off par_legacy_gls par_report_genotype_likelihood_max \\ + par_exclude_unobserved_genotypes par_use_mapping_quality par_harmonic_indel_quality par_genotype_qualities par_debug par_dd; do + [[ "\${!flag}" != "true" ]] && unset "\$flag" +done + +# Create output directory if it doesn't exist +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi + +freebayes \\ + --fasta-reference \$par_fasta_reference \\ + --pvar \$par_pvar \\ + --theta \$par_theta \\ + --ploidy \$par_ploidy \\ + --min-repeat-entropy \$par_min_repeat_entropy \\ + --reference-quality \$par_reference_quality \\ + --use-best-n-alleles \$par_use_best_n_alleles \\ + --max-complex-gap \$par_max_complex_gap \\ + --min-repeat-size \$par_min_repeat_size \\ + --min-mapping-quality \$par_min_mapping_quality \\ + --min-base-quality \$par_min_base_quality \\ + --min-supporting-allele-qsum \$par_min_supporting_allele_qsum \\ + --min-supporting-mapping-qsum \$par_min_supporting_mapping_qsum \\ + --mismatch-base-quality-threshold \$par_mismatch_base_quality_threshold \\ + --read-max-mismatch-fraction \$par_read_max_mismatch_fraction \\ + --min-alternate-fraction \$par_min_alternate_fraction \\ + --min-alternate-count \$par_min_alternate_count \\ + --min-alternate-qsum \$par_min_alternate_qsum \\ + --min-alternate-total \$par_min_alternate_total \\ + --min-coverage \$par_min_coverage \\ + --genotyping-max-iterations \$par_genotyping_max_iterations \\ + --genotyping-max-banddepth \$par_genotyping_max_banddepth \\ + --posterior-integration-limits \$par_posterior_integration_limits \\ + --read-dependence-factor \$par_read_dependence_factor \\ + --vcf \${par_output}/\${par_vcf} \\ + \${par_bam:+--bam \$par_bam} \\ + \${par_bam_list:+--bam-list \$par_bam_list} \\ + \${par_stdin:+--stdin} \\ + \${par_targets:+--targets \$par_targets} \\ + \${par_region:+--region \$par_region} \\ + \${par_max_coverage:+--max-coverage \$par_max_coverage} \\ + \${par_samples:+--samples \$par_samples} \\ + \${par_populations:+--populations \$par_populations} \\ + \${par_cnv_map:+--cnv-map \$par_cnv_map} \\ + \${par_gvcf:+--gvcf} \\ + \${par_gvcf_chunk:+--gvcf-chunk \$par_gvcf_chunk} \\ + \${par_variant_input:+--variant-input \$par_variant_input} \\ + \${par_only_use_input_alleles:+--only-use-input-alleles} \\ + \${par_haplotype_basis_alleles:+--haplotype-basis-alleles \$par_haplotype_basis_alleles} \\ + \${par_report_all_haplotype_alleles:+--report-all-haplotype-alleles} \\ + \${par_report_monomorphic:+--report-monomorphic} \\ + \${par_strict_vcf:+--strict-vcf} \\ + \${par_pooled_discrete:+--pooled-discrete} \\ + \${par_pooled_continuous:+--pooled-continuous} \\ + \${par_use_reference_allele:+--use-reference-allele} \\ + \${par_throw_away_snp_obs:+--throw-away-snp-obs} \\ + \${par_throw_away_indel_obs:+--throw-away-indel-obs} \\ + \${par_throw_away_mnps_obs:+--throw-away-mnps-obs} \\ + \${par_throw_away_complex_obs:+--throw-away-complex-obs} \\ + \${par_no_partial_observations:+--no-partial-observations} \\ + \${par_dont_left_align_indels:+--dont-left-align-indels} \\ + \${par_use_duplicate_reads:+--use-duplicate-reads} \\ + \${par_standard_filters:+--standard-filters} \\ + \${par_no_population_priors:+--no-population-priors} \\ + \${par_hwe_priors_off:+--hwe-priors-off} \\ + \${par_binomial_obs_priors_off:+--binomial-obs-priors-off} \\ + \${par_allele_balance_priors_off:+--allele-balance-priors-off} \\ + \${par_legacy_gls:+--legacy-gls} \\ + \${par_report_genotype_likelihood_max:+--report-genotype-likelihood-max} \\ + \${par_exclude_unobserved_genotypes:+--exclude-unobserved-genotypes} \\ + \${par_use_mapping_quality:+--use-mapping-quality} \\ + \${par_harmonic_indel_quality:+--harmonic-indel-quality} \\ + \${par_genotype_qualities:+--genotype-qualities} \\ + \${par_debug:+--debug} \\ + \${par_dd:+-dd} \\ + \${par_observation_bias:+--observation-bias \$par_observation_bias} \\ + \${par_read_mismatch_limit:+--read-mismatch-limit \$par_read_mismatch_limit} \\ + \${par_read_snp_limit:+--read-snp-limit \$par_read_snp_limit} \\ + \${par_read_indel_limit:+--read-indel-limit \$par_read_indel_limit} \\ + \${par_base_quality_cap:+--base-quality-cap \$par_base_quality_cap} \\ + \${par_prob_contamination:+--prob-contamination \$par_prob_contamination} \\ + \${par_contamination_estimates:+--contamination-estimates \$par_contamination_estimates} \\ + \${par_genotype_variant_threshold:+--genotype-variant-threshold \$par_genotype_variant_threshold} +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_BAM_LIST" ]; then + VIASH_PAR_BAM_LIST=$(ViashDockerStripAutomount "$VIASH_PAR_BAM_LIST") + fi + if [ ! -z "$VIASH_PAR_FASTA_REFERENCE" ]; then + VIASH_PAR_FASTA_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_FASTA_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_FASTA_REFERENCE_INDEX" ]; then + VIASH_PAR_FASTA_REFERENCE_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_FASTA_REFERENCE_INDEX") + fi + if [ ! -z "$VIASH_PAR_TARGETS" ]; then + VIASH_PAR_TARGETS=$(ViashDockerStripAutomount "$VIASH_PAR_TARGETS") + fi + if [ ! -z "$VIASH_PAR_SAMPLES" ]; then + VIASH_PAR_SAMPLES=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLES") + fi + if [ ! -z "$VIASH_PAR_POPULATIONS" ]; then + VIASH_PAR_POPULATIONS=$(ViashDockerStripAutomount "$VIASH_PAR_POPULATIONS") + fi + if [ ! -z "$VIASH_PAR_CNV_MAP" ]; then + VIASH_PAR_CNV_MAP=$(ViashDockerStripAutomount "$VIASH_PAR_CNV_MAP") + fi + if [ ! -z "$VIASH_PAR_VARIANT_INPUT" ]; then + VIASH_PAR_VARIANT_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_VARIANT_INPUT") + fi + if [ ! -z "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES" ]; then + VIASH_PAR_HAPLOTYPE_BASIS_ALLELES=$(ViashDockerStripAutomount "$VIASH_PAR_HAPLOTYPE_BASIS_ALLELES") + fi + if [ ! -z "$VIASH_PAR_OBSERVATION_BIAS" ]; then + VIASH_PAR_OBSERVATION_BIAS=$(ViashDockerStripAutomount "$VIASH_PAR_OBSERVATION_BIAS") + fi + if [ ! -z "$VIASH_PAR_CONTAMINATION_ESTIMATES" ]; then + VIASH_PAR_CONTAMINATION_ESTIMATES=$(ViashDockerStripAutomount "$VIASH_PAR_CONTAMINATION_ESTIMATES") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/freebayes/nextflow_labels.config b/target/executable/genetic_demux/freebayes/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/freebayes/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/freemuxlet/.config.vsh.yaml b/target/executable/genetic_demux/freemuxlet/.config.vsh.yaml new file mode 100644 index 00000000..d60c401c --- /dev/null +++ b/target/executable/genetic_demux/freemuxlet/.config.vsh.yaml @@ -0,0 +1,420 @@ +name: "freemuxlet" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--plp" + description: "Prefix of input files generated by dsc-pileup" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--init_cluster" + description: "Input file containing the initial cluster information." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--nsample" + description: "Number of samples multiplexed together" + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--aux_files" + description: "Turn on writing auxilary output files" + info: null + direction: "input" + - type: "integer" + name: "--verbose" + description: "Turn on verbose mode with specific verbosity threshold. 0: fully\ + \ verbose, 100 : no verbose messages." + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--doublet_prior" + description: "Prior of doublet." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error" + description: "Genotype error parameter per cluster." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--bf_thres" + description: "Bayes Factor Threshold used in the initial clustering." + info: null + default: + - 5.41 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--frac_init_clust" + description: "Fraction of droplets to be clustered in the very first round of\ + \ initial clustering procedure." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--iter_init" + description: "Iteration for initial cluster assignment (set to zero to skip the\ + \ iterations)." + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--keep_init_missing" + description: "Keep missing cluster assignment as missing in the initial iteration." + info: null + direction: "input" + - type: "boolean_true" + name: "--randomize_singlet_score" + description: "Randomize the singlet scores to test its effect." + info: null + direction: "input" + - type: "integer" + name: "--seed" + description: "Seed for random number (use clocks if not set)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_umi" + description: "Minimum number of UMIs for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "freemux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "freemuxlet Output file prefix" + info: null + example: + - "freemuxlet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "freemuxlet.patch" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Freemuxlet is a software tool to deconvolute sample identity and identify\ + \ multiplets when\nmultiple samples are pooled by barcoded single cell sequencing.\ + \ If external genotyping\ndata is not available, the genotyping-free version demuxlet,\ + \ freemuxlet, would be recommended.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + copy: + - "freemuxlet.patch /opt/freemuxlet.patch" + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build\ + \ && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle\ + \ /usr/local/bin" + - type: "r" + cran: + - "readr" + - "processx" + - "dplyr" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/freemuxlet/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/freemuxlet" + executable: "target/executable/genetic_demux/freemuxlet/freemuxlet" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/freemuxlet/freemuxlet b/target/executable/genetic_demux/freemuxlet/freemuxlet new file mode 100755 index 00000000..c1b4ff8a --- /dev/null +++ b/target/executable/genetic_demux/freemuxlet/freemuxlet @@ -0,0 +1,1664 @@ +#!/usr/bin/env bash + +# freemuxlet main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="freemuxlet" +VIASH_META_FUNCTIONALITY_NAME="freemuxlet" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:22.04 +ENTRYPOINT [] +COPY freemuxlet.patch /opt/freemuxlet.patch +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y autoconf wget git build-essential libcurl4-openssl-dev cmake libbz2-dev libssl-dev liblzma-dev zlib1g-dev r-base && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install +RUN git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("readr", "processx", "dplyr"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux freemuxlet" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "freemuxlet main" + echo "" + echo "Freemuxlet is a software tool to deconvolute sample identity and identify" + echo "multiplets when" + echo "multiple samples are pooled by barcoded single cell sequencing. If external" + echo "genotyping" + echo "data is not available, the genotyping-free version demuxlet, freemuxlet, would" + echo "be recommended." + echo "" + echo "Input:" + echo " --plp" + echo " type: string" + echo " Prefix of input files generated by dsc-pileup" + echo "" + echo " --init_cluster" + echo " type: file, file must exist" + echo " Input file containing the initial cluster information." + echo "" + echo " --nsample" + echo " type: integer" + echo " default: 2" + echo " Number of samples multiplexed together" + echo "" + echo " --aux_files" + echo " type: boolean_true" + echo " Turn on writing auxilary output files" + echo "" + echo " --verbose" + echo " type: integer" + echo " default: 100" + echo " Turn on verbose mode with specific verbosity threshold. 0: fully" + echo " verbose, 100 : no verbose messages." + echo "" + echo " --doublet_prior" + echo " type: double" + echo " default: 0.5" + echo " Prior of doublet." + echo "" + echo " --geno_error" + echo " type: double" + echo " default: 0.1" + echo " Genotype error parameter per cluster." + echo "" + echo " --bf_thres" + echo " type: double" + echo " default: 5.41" + echo " Bayes Factor Threshold used in the initial clustering." + echo "" + echo " --frac_init_clust" + echo " type: double" + echo " default: 1.0" + echo " Fraction of droplets to be clustered in the very first round of initial" + echo " clustering procedure." + echo "" + echo " --iter_init" + echo " type: integer" + echo " default: 10" + echo " Iteration for initial cluster assignment (set to zero to skip the" + echo " iterations)." + echo "" + echo " --keep_init_missing" + echo " type: boolean_true" + echo " Keep missing cluster assignment as missing in the initial iteration." + echo "" + echo " --randomize_singlet_score" + echo " type: boolean_true" + echo " Randomize the singlet scores to test its effect." + echo "" + echo " --seed" + echo " type: integer" + echo " default: 0" + echo " Seed for random number (use clocks if not set)." + echo "" + echo " --cap_bq" + echo " type: integer" + echo " default: 20" + echo " Maximum base quality (higher BQ will be capped)." + echo "" + echo " --min_bq" + echo " type: integer" + echo " default: 13" + echo " Minimum base quality to consider (lower BQ will be skipped)." + echo "" + echo " --group_list" + echo " type: string" + echo " List of tag readgroup/cell barcode to consider in this run. All other" + echo " barcodes will be ignored. This is useful for parallelized run." + echo "" + echo " --min_total" + echo " type: integer" + echo " default: 0" + echo " Minimum number of total reads for a droplet/cell to be considered." + echo "" + echo " --min_umi" + echo " type: integer" + echo " default: 0" + echo " Minimum number of UMIs for a droplet/cell to be considered." + echo "" + echo " --min_snp" + echo " type: integer" + echo " default: 0" + echo " Minimum number of SNPs with coverage for a droplet/cell to be" + echo " considered." + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: freemux" + echo " Output directory" + echo "" + echo " --out" + echo " type: string" + echo " example: freemuxlet" + echo " freemuxlet Output file prefix" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "freemuxlet main" + exit + ;; + --plp) + [ -n "$VIASH_PAR_PLP" ] && ViashError Bad arguments for option \'--plp\': \'$VIASH_PAR_PLP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --plp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --plp=*) + [ -n "$VIASH_PAR_PLP" ] && ViashError Bad arguments for option \'--plp=*\': \'$VIASH_PAR_PLP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --init_cluster) + [ -n "$VIASH_PAR_INIT_CLUSTER" ] && ViashError Bad arguments for option \'--init_cluster\': \'$VIASH_PAR_INIT_CLUSTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_CLUSTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --init_cluster. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --init_cluster=*) + [ -n "$VIASH_PAR_INIT_CLUSTER" ] && ViashError Bad arguments for option \'--init_cluster=*\': \'$VIASH_PAR_INIT_CLUSTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INIT_CLUSTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nsample) + [ -n "$VIASH_PAR_NSAMPLE" ] && ViashError Bad arguments for option \'--nsample\': \'$VIASH_PAR_NSAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NSAMPLE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nsample. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nsample=*) + [ -n "$VIASH_PAR_NSAMPLE" ] && ViashError Bad arguments for option \'--nsample=*\': \'$VIASH_PAR_NSAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NSAMPLE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --aux_files) + [ -n "$VIASH_PAR_AUX_FILES" ] && ViashError Bad arguments for option \'--aux_files\': \'$VIASH_PAR_AUX_FILES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AUX_FILES=true + shift 1 + ;; + --verbose) + [ -n "$VIASH_PAR_VERBOSE" ] && ViashError Bad arguments for option \'--verbose\': \'$VIASH_PAR_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --verbose. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --verbose=*) + [ -n "$VIASH_PAR_VERBOSE" ] && ViashError Bad arguments for option \'--verbose=*\': \'$VIASH_PAR_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --doublet_prior) + [ -n "$VIASH_PAR_DOUBLET_PRIOR" ] && ViashError Bad arguments for option \'--doublet_prior\': \'$VIASH_PAR_DOUBLET_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DOUBLET_PRIOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --doublet_prior. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --doublet_prior=*) + [ -n "$VIASH_PAR_DOUBLET_PRIOR" ] && ViashError Bad arguments for option \'--doublet_prior=*\': \'$VIASH_PAR_DOUBLET_PRIOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DOUBLET_PRIOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --geno_error) + [ -n "$VIASH_PAR_GENO_ERROR" ] && ViashError Bad arguments for option \'--geno_error\': \'$VIASH_PAR_GENO_ERROR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --geno_error. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --geno_error=*) + [ -n "$VIASH_PAR_GENO_ERROR" ] && ViashError Bad arguments for option \'--geno_error=*\': \'$VIASH_PAR_GENO_ERROR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_ERROR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bf_thres) + [ -n "$VIASH_PAR_BF_THRES" ] && ViashError Bad arguments for option \'--bf_thres\': \'$VIASH_PAR_BF_THRES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BF_THRES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bf_thres. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bf_thres=*) + [ -n "$VIASH_PAR_BF_THRES" ] && ViashError Bad arguments for option \'--bf_thres=*\': \'$VIASH_PAR_BF_THRES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BF_THRES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --frac_init_clust) + [ -n "$VIASH_PAR_FRAC_INIT_CLUST" ] && ViashError Bad arguments for option \'--frac_init_clust\': \'$VIASH_PAR_FRAC_INIT_CLUST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRAC_INIT_CLUST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --frac_init_clust. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --frac_init_clust=*) + [ -n "$VIASH_PAR_FRAC_INIT_CLUST" ] && ViashError Bad arguments for option \'--frac_init_clust=*\': \'$VIASH_PAR_FRAC_INIT_CLUST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRAC_INIT_CLUST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --iter_init) + [ -n "$VIASH_PAR_ITER_INIT" ] && ViashError Bad arguments for option \'--iter_init\': \'$VIASH_PAR_ITER_INIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ITER_INIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --iter_init. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --iter_init=*) + [ -n "$VIASH_PAR_ITER_INIT" ] && ViashError Bad arguments for option \'--iter_init=*\': \'$VIASH_PAR_ITER_INIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ITER_INIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --keep_init_missing) + [ -n "$VIASH_PAR_KEEP_INIT_MISSING" ] && ViashError Bad arguments for option \'--keep_init_missing\': \'$VIASH_PAR_KEEP_INIT_MISSING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KEEP_INIT_MISSING=true + shift 1 + ;; + --randomize_singlet_score) + [ -n "$VIASH_PAR_RANDOMIZE_SINGLET_SCORE" ] && ViashError Bad arguments for option \'--randomize_singlet_score\': \'$VIASH_PAR_RANDOMIZE_SINGLET_SCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RANDOMIZE_SINGLET_SCORE=true + shift 1 + ;; + --seed) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seed=*) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed=*\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cap_bq) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cap_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cap_bq=*) + [ -n "$VIASH_PAR_CAP_BQ" ] && ViashError Bad arguments for option \'--cap_bq=*\': \'$VIASH_PAR_CAP_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CAP_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_bq) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_bq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_bq=*) + [ -n "$VIASH_PAR_MIN_BQ" ] && ViashError Bad arguments for option \'--min_bq=*\': \'$VIASH_PAR_MIN_BQ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_BQ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --group_list) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --group_list. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --group_list=*) + [ -n "$VIASH_PAR_GROUP_LIST" ] && ViashError Bad arguments for option \'--group_list=*\': \'$VIASH_PAR_GROUP_LIST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUP_LIST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_total) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_total. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_total=*) + [ -n "$VIASH_PAR_MIN_TOTAL" ] && ViashError Bad arguments for option \'--min_total=*\': \'$VIASH_PAR_MIN_TOTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_TOTAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_umi) + [ -n "$VIASH_PAR_MIN_UMI" ] && ViashError Bad arguments for option \'--min_umi\': \'$VIASH_PAR_MIN_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_umi=*) + [ -n "$VIASH_PAR_MIN_UMI" ] && ViashError Bad arguments for option \'--min_umi=*\': \'$VIASH_PAR_MIN_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_snp) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_snp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_snp=*) + [ -n "$VIASH_PAR_MIN_SNP" ] && ViashError Bad arguments for option \'--min_snp=*\': \'$VIASH_PAR_MIN_SNP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SNP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --out. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --out=*) + [ -n "$VIASH_PAR_OUT" ] && ViashError Bad arguments for option \'--out=*\': \'$VIASH_PAR_OUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/freemuxlet:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_NSAMPLE+x} ]; then + VIASH_PAR_NSAMPLE="2" +fi +if [ -z ${VIASH_PAR_AUX_FILES+x} ]; then + VIASH_PAR_AUX_FILES="false" +fi +if [ -z ${VIASH_PAR_VERBOSE+x} ]; then + VIASH_PAR_VERBOSE="100" +fi +if [ -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then + VIASH_PAR_DOUBLET_PRIOR="0.5" +fi +if [ -z ${VIASH_PAR_GENO_ERROR+x} ]; then + VIASH_PAR_GENO_ERROR="0.1" +fi +if [ -z ${VIASH_PAR_BF_THRES+x} ]; then + VIASH_PAR_BF_THRES="5.41" +fi +if [ -z ${VIASH_PAR_FRAC_INIT_CLUST+x} ]; then + VIASH_PAR_FRAC_INIT_CLUST="1.0" +fi +if [ -z ${VIASH_PAR_ITER_INIT+x} ]; then + VIASH_PAR_ITER_INIT="10" +fi +if [ -z ${VIASH_PAR_KEEP_INIT_MISSING+x} ]; then + VIASH_PAR_KEEP_INIT_MISSING="false" +fi +if [ -z ${VIASH_PAR_RANDOMIZE_SINGLET_SCORE+x} ]; then + VIASH_PAR_RANDOMIZE_SINGLET_SCORE="false" +fi +if [ -z ${VIASH_PAR_SEED+x} ]; then + VIASH_PAR_SEED="0" +fi +if [ -z ${VIASH_PAR_CAP_BQ+x} ]; then + VIASH_PAR_CAP_BQ="20" +fi +if [ -z ${VIASH_PAR_MIN_BQ+x} ]; then + VIASH_PAR_MIN_BQ="13" +fi +if [ -z ${VIASH_PAR_MIN_TOTAL+x} ]; then + VIASH_PAR_MIN_TOTAL="0" +fi +if [ -z ${VIASH_PAR_MIN_UMI+x} ]; then + VIASH_PAR_MIN_UMI="0" +fi +if [ -z ${VIASH_PAR_MIN_SNP+x} ]; then + VIASH_PAR_MIN_SNP="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INIT_CLUSTER" ] && [ ! -e "$VIASH_PAR_INIT_CLUSTER" ]; then + ViashError "Input file '$VIASH_PAR_INIT_CLUSTER' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NSAMPLE" ]]; then + if ! [[ "$VIASH_PAR_NSAMPLE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--nsample' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_AUX_FILES" ]]; then + if ! [[ "$VIASH_PAR_AUX_FILES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--aux_files' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_VERBOSE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--verbose' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DOUBLET_PRIOR" ]]; then + if ! [[ "$VIASH_PAR_DOUBLET_PRIOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--doublet_prior' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENO_ERROR" ]]; then + if ! [[ "$VIASH_PAR_GENO_ERROR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--geno_error' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BF_THRES" ]]; then + if ! [[ "$VIASH_PAR_BF_THRES" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--bf_thres' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FRAC_INIT_CLUST" ]]; then + if ! [[ "$VIASH_PAR_FRAC_INIT_CLUST" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--frac_init_clust' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ITER_INIT" ]]; then + if ! [[ "$VIASH_PAR_ITER_INIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--iter_init' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_KEEP_INIT_MISSING" ]]; then + if ! [[ "$VIASH_PAR_KEEP_INIT_MISSING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--keep_init_missing' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RANDOMIZE_SINGLET_SCORE" ]]; then + if ! [[ "$VIASH_PAR_RANDOMIZE_SINGLET_SCORE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--randomize_singlet_score' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEED" ]]; then + if ! [[ "$VIASH_PAR_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CAP_BQ" ]]; then + if ! [[ "$VIASH_PAR_CAP_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cap_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_BQ" ]]; then + if ! [[ "$VIASH_PAR_MIN_BQ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_bq' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_TOTAL" ]]; then + if ! [[ "$VIASH_PAR_MIN_TOTAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_total' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_UMI" ]]; then + if ! [[ "$VIASH_PAR_MIN_UMI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_umi' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SNP" ]]; then + if ! [[ "$VIASH_PAR_MIN_SNP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_snp' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INIT_CLUSTER" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INIT_CLUSTER")" ) + VIASH_PAR_INIT_CLUSTER=$(ViashDockerAutodetectMount "$VIASH_PAR_INIT_CLUSTER") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-freemuxlet-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "plp" = $( if [ ! -z ${VIASH_PAR_PLP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PLP" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "init_cluster" = $( if [ ! -z ${VIASH_PAR_INIT_CLUSTER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INIT_CLUSTER" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "nsample" = $( if [ ! -z ${VIASH_PAR_NSAMPLE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_NSAMPLE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "aux_files" = $( if [ ! -z ${VIASH_PAR_AUX_FILES+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_AUX_FILES" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ), + "verbose" = $( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_VERBOSE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "doublet_prior" = $( if [ ! -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_DOUBLET_PRIOR" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "geno_error" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "bf_thres" = $( if [ ! -z ${VIASH_PAR_BF_THRES+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_BF_THRES" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "frac_init_clust" = $( if [ ! -z ${VIASH_PAR_FRAC_INIT_CLUST+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_FRAC_INIT_CLUST" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "iter_init" = $( if [ ! -z ${VIASH_PAR_ITER_INIT+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_ITER_INIT" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "keep_init_missing" = $( if [ ! -z ${VIASH_PAR_KEEP_INIT_MISSING+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_KEEP_INIT_MISSING" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ), + "randomize_singlet_score" = $( if [ ! -z ${VIASH_PAR_RANDOMIZE_SINGLET_SCORE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_RANDOMIZE_SINGLET_SCORE" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ), + "seed" = $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_SEED" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "cap_bq" = $( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_CAP_BQ" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_bq" = $( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_BQ" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "group_list" = $( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_LIST" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_total" = $( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TOTAL" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_umi" = $( if [ ! -z ${VIASH_PAR_MIN_UMI+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_UMI" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_snp" = $( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_SNP" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "out" = $( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (!dir.exists(par\$output)) { + dir.create(par\$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "freemuxlet", + "--out", paste0(par\$output, "/", par\$out) +) + +argmap <- c( + "plp" = "--plp", + "init_cluster" = "--init-cluster", + "nsample" = "--nsample", + "verbose" = "--verbose", + "doublet_prior" = "--doublet-prior", + "geno_error" = "--geno-error", + "bf_thres" = "--bf-thres", + "frac_init_clust" = "--frac-init-clust", + "iter_init" = "--iter-init", + "seed" = "--seed", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_total" = "--min-total", + "min_umi" = "--min-umi", + "min_snp" = "--min-snp", + "group_list" = "--group-list", + "aux_files" = "--aux-files", + "keep_init_missing" = "--keep-init-missing", + "randomize_singlet_score" = "randomize-singlet-score" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + if (arg %in% c( + "aux_files", "keep_init_missing", + "randomize_singlet_score" + )) { + if (toupper(par[[arg]]) == TRUE) { + cmd <- c(cmd, argmap[[arg]]) + } + } else { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz\$status != 0) { + stop("Command failed with status ", zzz\$status) +} + +out_file <- paste0(par\$output, "/", par\$out, ".clust1.samples.gz") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} + +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*)*", "\\\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*)*", "\\\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +freemuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + freemuxlet_assign, + paste0(par\$output, "/cell_annotation.csv") +) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INIT_CLUSTER" ]; then + VIASH_PAR_INIT_CLUSTER=$(ViashDockerStripAutomount "$VIASH_PAR_INIT_CLUSTER") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/freemuxlet/freemuxlet.patch b/target/executable/genetic_demux/freemuxlet/freemuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/target/executable/genetic_demux/freemuxlet/freemuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/target/executable/genetic_demux/freemuxlet/nextflow_labels.config b/target/executable/genetic_demux/freemuxlet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/freemuxlet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/samtools/.config.vsh.yaml b/target/executable/genetic_demux/samtools/.config.vsh.yaml new file mode 100644 index 00000000..558bdbf0 --- /dev/null +++ b/target/executable/genetic_demux/samtools/.config.vsh.yaml @@ -0,0 +1,222 @@ +name: "samtools" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--bam" + description: "Input bam file for filtering." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Samtools output directory." + info: null + example: + - "samtools_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter the BAM according to the instruction of scSplit via Samtools." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "gcc" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "libncursesw5-dev" + - "liblzma-dev" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2\ + \ && tar jxf samtools-1.16.1.tar.bz2 && rm samtools-1.16.1.tar.bz2 && cd samtools-1.16.1\ + \ && make prefix=/usr/local install" + - type: "python" + user: false + pip: + - "umi_tools" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/samtools/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/samtools" + executable: "target/executable/genetic_demux/samtools/samtools" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/samtools/nextflow_labels.config b/target/executable/genetic_demux/samtools/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/samtools/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/samtools/samtools b/target/executable/genetic_demux/samtools/samtools new file mode 100755 index 00000000..3bc02843 --- /dev/null +++ b/target/executable/genetic_demux/samtools/samtools @@ -0,0 +1,1113 @@ +#!/usr/bin/env bash + +# samtools main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="samtools" +VIASH_META_FUNCTIONALITY_NAME="samtools" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y wget gcc make libbz2-dev zlib1g-dev libncurses5-dev libncursesw5-dev liblzma-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2 && tar jxf samtools-1.16.1.tar.bz2 && rm samtools-1.16.1.tar.bz2 && cd samtools-1.16.1 && make prefix=/usr/local install +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "umi_tools" + +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux samtools" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "samtools main" + echo "" + echo "Filter the BAM according to the instruction of scSplit via Samtools." + echo "" + echo "Arguments:" + echo " --bam" + echo " type: file, required parameter, file must exist" + echo " Input bam file for filtering." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " example: samtools_out" + echo " Samtools output directory." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "samtools main" + exit + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/samtools:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_BAM+x} ]; then + ViashError '--bam' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Input file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-samtools-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi +samtools view -S -b -q 10 -F 3844 "\$par_bam" > "\${par_output}/filtered.bam" +cd \$par_output +samtools index filtered.bam filtered.bam.bai +umi_tools dedup --stdin=filtered.bam --extract-umi-method=tag --umi-tag=UR --cell-tag=CB --log=logfile > no_dup.bam +samtools sort no_dup.bam -o sorted.bam +samtools index sorted.bam sorted.bam.bai +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/scsplit/.config.vsh.yaml b/target/executable/genetic_demux/scsplit/.config.vsh.yaml new file mode 100644 index 00000000..7f845d31 --- /dev/null +++ b/target/executable/genetic_demux/scsplit/.config.vsh.yaml @@ -0,0 +1,346 @@ +name: "scsplit" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--vcf" + description: "VCF from mixed BAM" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "mixed sample BAM" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bar" + description: "barcodes whitelist" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag" + description: "tag for barcode" + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--com" + description: "common SNVs" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num" + description: "expected number of mixed samples" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sub" + description: "maximum number of subpopulations in autodetect mode" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ems" + description: "number of EM repeats to avoid local maximum" + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--dbl" + description: "correction for doublets. There will be no refinement on the results\ + \ if this optional parameter is not specified or specified percentage is less\ + \ than doublet rates detected during the run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf_known" + description: "known individual genotypes to limit distinguishing variants to available\ + \ variants, so that users do not need to redo genotyping on selected variants,\ + \ otherwise any variants could be selected as distinguishing variants." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--geno" + description: "generate sample genotypes based on the split result." + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "scSplit_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--ref" + description: "output Ref count matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alt" + description: "output Alt count matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--psc" + description: "generated P(S|C)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "scsplit is a genotype-free demultiplexing methode of pooled single-cell\ + \ RNA-seq, using a hidden state model for identifying genetically distinct samples\ + \ within a mixed population." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "numpy<2" + - "pandas<2.0" + - "pysam" + - "setuptools<58" + - "scikit-learn==1.1.3" + - "scipy" + - "statistics" + upgrade: true + - type: "python" + user: false + pip: + - "PyVCF" + upgrade: true + - type: "docker" + run: + - "git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin\ + \ && rm -rf scSplit" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/scsplit/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/scsplit" + executable: "target/executable/genetic_demux/scsplit/scsplit" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/scsplit/nextflow_labels.config b/target/executable/genetic_demux/scsplit/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/scsplit/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/scsplit/scsplit b/target/executable/genetic_demux/scsplit/scsplit new file mode 100755 index 00000000..896aaeda --- /dev/null +++ b/target/executable/genetic_demux/scsplit/scsplit @@ -0,0 +1,1445 @@ +#!/usr/bin/env bash + +# scsplit main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scsplit" +VIASH_META_FUNCTIONALITY_NAME="scsplit" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "numpy<2" "pandas<2.0" "pysam" "setuptools<58" "scikit-learn==1.1.3" "scipy" "statistics" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "PyVCF" + +RUN git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin && rm -rf scSplit +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux scsplit" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scsplit main" + echo "" + echo "scsplit is a genotype-free demultiplexing methode of pooled single-cell RNA-seq," + echo "using a hidden state model for identifying genetically distinct samples within a" + echo "mixed population." + echo "" + echo "Input:" + echo " --vcf" + echo " type: file, file must exist" + echo " VCF from mixed BAM" + echo "" + echo " --bam" + echo " type: file, file must exist" + echo " mixed sample BAM" + echo "" + echo " --bar" + echo " type: file, file must exist" + echo " barcodes whitelist" + echo "" + echo " --tag" + echo " type: string" + echo " default: CB" + echo " tag for barcode" + echo "" + echo " --com" + echo " type: file, file must exist" + echo " common SNVs" + echo "" + echo " --num" + echo " type: integer" + echo " expected number of mixed samples" + echo "" + echo " --sub" + echo " type: integer" + echo " default: 10" + echo " maximum number of subpopulations in autodetect mode" + echo "" + echo " --ems" + echo " type: integer" + echo " default: 30" + echo " number of EM repeats to avoid local maximum" + echo "" + echo " --dbl" + echo " type: double" + echo " correction for doublets. There will be no refinement on the results if" + echo " this optional parameter is not specified or specified percentage is less" + echo " than doublet rates detected during the run." + echo "" + echo " --vcf_known" + echo " type: file, file must exist" + echo " known individual genotypes to limit distinguishing variants to available" + echo " variants, so that users do not need to redo genotyping on selected" + echo " variants, otherwise any variants could be selected as distinguishing" + echo " variants." + echo "" + echo " --geno" + echo " type: boolean_true" + echo " generate sample genotypes based on the split result." + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: scSplit_out" + echo " Output directory" + echo "" + echo " --ref" + echo " type: string" + echo " output Ref count matrix" + echo "" + echo " --alt" + echo " type: string" + echo " output Alt count matrix" + echo "" + echo " --psc" + echo " type: string" + echo " generated P(S|C)" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scsplit main" + exit + ;; + --vcf) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf=*) + [ -n "$VIASH_PAR_VCF" ] && ViashError Bad arguments for option \'--vcf=*\': \'$VIASH_PAR_VCF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bar) + [ -n "$VIASH_PAR_BAR" ] && ViashError Bad arguments for option \'--bar\': \'$VIASH_PAR_BAR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bar. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bar=*) + [ -n "$VIASH_PAR_BAR" ] && ViashError Bad arguments for option \'--bar=*\': \'$VIASH_PAR_BAR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag) + [ -n "$VIASH_PAR_TAG" ] && ViashError Bad arguments for option \'--tag\': \'$VIASH_PAR_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag=*) + [ -n "$VIASH_PAR_TAG" ] && ViashError Bad arguments for option \'--tag=*\': \'$VIASH_PAR_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --com) + [ -n "$VIASH_PAR_COM" ] && ViashError Bad arguments for option \'--com\': \'$VIASH_PAR_COM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --com. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --com=*) + [ -n "$VIASH_PAR_COM" ] && ViashError Bad arguments for option \'--com=*\': \'$VIASH_PAR_COM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num) + [ -n "$VIASH_PAR_NUM" ] && ViashError Bad arguments for option \'--num\': \'$VIASH_PAR_NUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num=*) + [ -n "$VIASH_PAR_NUM" ] && ViashError Bad arguments for option \'--num=*\': \'$VIASH_PAR_NUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sub) + [ -n "$VIASH_PAR_SUB" ] && ViashError Bad arguments for option \'--sub\': \'$VIASH_PAR_SUB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUB="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sub. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sub=*) + [ -n "$VIASH_PAR_SUB" ] && ViashError Bad arguments for option \'--sub=*\': \'$VIASH_PAR_SUB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUB=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ems) + [ -n "$VIASH_PAR_EMS" ] && ViashError Bad arguments for option \'--ems\': \'$VIASH_PAR_EMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ems. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ems=*) + [ -n "$VIASH_PAR_EMS" ] && ViashError Bad arguments for option \'--ems=*\': \'$VIASH_PAR_EMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dbl) + [ -n "$VIASH_PAR_DBL" ] && ViashError Bad arguments for option \'--dbl\': \'$VIASH_PAR_DBL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DBL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dbl. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dbl=*) + [ -n "$VIASH_PAR_DBL" ] && ViashError Bad arguments for option \'--dbl=*\': \'$VIASH_PAR_DBL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DBL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vcf_known) + [ -n "$VIASH_PAR_VCF_KNOWN" ] && ViashError Bad arguments for option \'--vcf_known\': \'$VIASH_PAR_VCF_KNOWN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_KNOWN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vcf_known. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vcf_known=*) + [ -n "$VIASH_PAR_VCF_KNOWN" ] && ViashError Bad arguments for option \'--vcf_known=*\': \'$VIASH_PAR_VCF_KNOWN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VCF_KNOWN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --geno) + [ -n "$VIASH_PAR_GENO" ] && ViashError Bad arguments for option \'--geno\': \'$VIASH_PAR_GENO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO=true + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ref) + [ -n "$VIASH_PAR_REF" ] && ViashError Bad arguments for option \'--ref\': \'$VIASH_PAR_REF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ref. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ref=*) + [ -n "$VIASH_PAR_REF" ] && ViashError Bad arguments for option \'--ref=*\': \'$VIASH_PAR_REF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alt) + [ -n "$VIASH_PAR_ALT" ] && ViashError Bad arguments for option \'--alt\': \'$VIASH_PAR_ALT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alt. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alt=*) + [ -n "$VIASH_PAR_ALT" ] && ViashError Bad arguments for option \'--alt=*\': \'$VIASH_PAR_ALT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --psc) + [ -n "$VIASH_PAR_PSC" ] && ViashError Bad arguments for option \'--psc\': \'$VIASH_PAR_PSC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PSC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --psc. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --psc=*) + [ -n "$VIASH_PAR_PSC" ] && ViashError Bad arguments for option \'--psc=*\': \'$VIASH_PAR_PSC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PSC=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/scsplit:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_TAG+x} ]; then + VIASH_PAR_TAG="CB" +fi +if [ -z ${VIASH_PAR_SUB+x} ]; then + VIASH_PAR_SUB="10" +fi +if [ -z ${VIASH_PAR_EMS+x} ]; then + VIASH_PAR_EMS="30" +fi +if [ -z ${VIASH_PAR_GENO+x} ]; then + VIASH_PAR_GENO="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_VCF" ] && [ ! -e "$VIASH_PAR_VCF" ]; then + ViashError "Input file '$VIASH_PAR_VCF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Input file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAR" ] && [ ! -e "$VIASH_PAR_BAR" ]; then + ViashError "Input file '$VIASH_PAR_BAR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_COM" ] && [ ! -e "$VIASH_PAR_COM" ]; then + ViashError "Input file '$VIASH_PAR_COM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VCF_KNOWN" ] && [ ! -e "$VIASH_PAR_VCF_KNOWN" ]; then + ViashError "Input file '$VIASH_PAR_VCF_KNOWN' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NUM" ]]; then + if ! [[ "$VIASH_PAR_NUM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SUB" ]]; then + if ! [[ "$VIASH_PAR_SUB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sub' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EMS" ]]; then + if ! [[ "$VIASH_PAR_EMS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--ems' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DBL" ]]; then + if ! [[ "$VIASH_PAR_DBL" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--dbl' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENO" ]]; then + if ! [[ "$VIASH_PAR_GENO" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--geno' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VCF")" ) + VIASH_PAR_VCF=$(ViashDockerAutodetectMount "$VIASH_PAR_VCF") +fi +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") +fi +if [ ! -z "$VIASH_PAR_BAR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAR")" ) + VIASH_PAR_BAR=$(ViashDockerAutodetectMount "$VIASH_PAR_BAR") +fi +if [ ! -z "$VIASH_PAR_COM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_COM")" ) + VIASH_PAR_COM=$(ViashDockerAutodetectMount "$VIASH_PAR_COM") +fi +if [ ! -z "$VIASH_PAR_VCF_KNOWN" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VCF_KNOWN")" ) + VIASH_PAR_VCF_KNOWN=$(ViashDockerAutodetectMount "$VIASH_PAR_VCF_KNOWN") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scsplit-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAR+x} ]; then echo "${VIASH_PAR_BAR}" | sed "s#'#'\"'\"'#g;s#.*#par_bar='&'#" ; else echo "# par_bar="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG+x} ]; then echo "${VIASH_PAR_TAG}" | sed "s#'#'\"'\"'#g;s#.*#par_tag='&'#" ; else echo "# par_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_COM+x} ]; then echo "${VIASH_PAR_COM}" | sed "s#'#'\"'\"'#g;s#.*#par_com='&'#" ; else echo "# par_com="; fi ) +$( if [ ! -z ${VIASH_PAR_NUM+x} ]; then echo "${VIASH_PAR_NUM}" | sed "s#'#'\"'\"'#g;s#.*#par_num='&'#" ; else echo "# par_num="; fi ) +$( if [ ! -z ${VIASH_PAR_SUB+x} ]; then echo "${VIASH_PAR_SUB}" | sed "s#'#'\"'\"'#g;s#.*#par_sub='&'#" ; else echo "# par_sub="; fi ) +$( if [ ! -z ${VIASH_PAR_EMS+x} ]; then echo "${VIASH_PAR_EMS}" | sed "s#'#'\"'\"'#g;s#.*#par_ems='&'#" ; else echo "# par_ems="; fi ) +$( if [ ! -z ${VIASH_PAR_DBL+x} ]; then echo "${VIASH_PAR_DBL}" | sed "s#'#'\"'\"'#g;s#.*#par_dbl='&'#" ; else echo "# par_dbl="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF_KNOWN+x} ]; then echo "${VIASH_PAR_VCF_KNOWN}" | sed "s#'#'\"'\"'#g;s#.*#par_vcf_known='&'#" ; else echo "# par_vcf_known="; fi ) +$( if [ ! -z ${VIASH_PAR_GENO+x} ]; then echo "${VIASH_PAR_GENO}" | sed "s#'#'\"'\"'#g;s#.*#par_geno='&'#" ; else echo "# par_geno="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REF+x} ]; then echo "${VIASH_PAR_REF}" | sed "s#'#'\"'\"'#g;s#.*#par_ref='&'#" ; else echo "# par_ref="; fi ) +$( if [ ! -z ${VIASH_PAR_ALT+x} ]; then echo "${VIASH_PAR_ALT}" | sed "s#'#'\"'\"'#g;s#.*#par_alt='&'#" ; else echo "# par_alt="; fi ) +$( if [ ! -z ${VIASH_PAR_PSC+x} ]; then echo "${VIASH_PAR_PSC}" | sed "s#'#'\"'\"'#g;s#.*#par_psc='&'#" ; else echo "# par_psc="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi + +scSplit count \\ + --vcf \$par_vcf \\ + --bam \$par_bam \\ + --bar \$par_bar \\ + --tag \$par_tag \\ + --ref \$par_ref \\ + --alt \$par_alt \\ + --out \$par_output \\ + \${par_com:+--com \$par_com} + +scSplit run \\ + --ref "\$par_output/\$par_ref" \\ + --alt "\$par_output/\$par_alt" \\ + --out \$par_output \\ + --num \$par_num \\ + \${par_sub:+--sub \$par_sub} \\ + \${par_ems:+--ems \$par_ems} \\ + \${par_dbl:+--dbl \$par_dbl} \\ + \${par_vcf_known:+--vcf \$par_vcf_known} + +if [ "\$par_geno" = true ]; then + scSplit genotype \\ + --ref "\$par_output/\$par_ref" \\ + --alt "\$par_output/\$par_alt" \\ + --psc "\$par_output/\$par_psc" \\ + "\$par_output" +fi + +echo "cell,donor_id" > "\$par_output/cell_annotation.csv" +sed -e '1d' -e 's/SNG-//g' "\$par_output/scSplit_result.csv" | +sed 's/\\t/,/g' | awk 'BEGIN{FS=OFS=","} { if (\$2 ~ /^DBL-/) \$2 = "doublet"; print }' \\ +>> "\$par_output/cell_annotation.csv" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_VCF" ]; then + VIASH_PAR_VCF=$(ViashDockerStripAutomount "$VIASH_PAR_VCF") + fi + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_BAR" ]; then + VIASH_PAR_BAR=$(ViashDockerStripAutomount "$VIASH_PAR_BAR") + fi + if [ ! -z "$VIASH_PAR_COM" ]; then + VIASH_PAR_COM=$(ViashDockerStripAutomount "$VIASH_PAR_COM") + fi + if [ ! -z "$VIASH_PAR_VCF_KNOWN" ]; then + VIASH_PAR_VCF_KNOWN=$(ViashDockerStripAutomount "$VIASH_PAR_VCF_KNOWN") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/souporcell/.config.vsh.yaml b/target/executable/genetic_demux/souporcell/.config.vsh.yaml new file mode 100644 index 00000000..f987a1c4 --- /dev/null +++ b/target/executable/genetic_demux/souporcell/.config.vsh.yaml @@ -0,0 +1,334 @@ +name: "souporcell" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--fasta" + description: "reference fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "cellranger bam" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_index" + description: "cellranger bam index" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcodes" + description: "barcodes.tsv from cellranger" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clusters" + description: "number cluster, tbd add easy way to run on a range of k" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ploidy" + description: "ploidy, must be 1 or 2" + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alt" + description: "min alt to use locus" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_ref" + description: "min ref to use locus" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_loci" + description: "max loci per cell, affects speed" + info: null + default: + - 2048 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--restarts" + description: "number of restarts in clustering, when there are > 12 clusters we\ + \ recommend increasing this to avoid local minima" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--common_variants" + description: "common variant loci or known variant loci vcf, must be vs same reference\ + \ fasta" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--known_genotypes" + description: "known variants per clone in population vcf mode, must be .vcf right\ + \ now we dont accept gzip or bcf sorry" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--known_genotypes_sample_names" + description: "which samples in population vcf from known genotypes option represent\ + \ the donors in your sample" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_remap" + description: "dont remap with minimap2 (not recommended unless in conjunction\ + \ with --common_variants" + info: null + direction: "input" + - type: "boolean_true" + name: "--ignore" + description: "set to True to ignore data error assertions" + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "name of directory to place souporcell files" + info: null + example: + - "souporcell_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "souporcell is a method for clustering mixed-genotype scRNAseq experiments\ + \ by individual." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "cumulusprod/souporcell:2022.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/souporcell/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/souporcell" + executable: "target/executable/genetic_demux/souporcell/souporcell" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/souporcell/nextflow_labels.config b/target/executable/genetic_demux/souporcell/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/souporcell/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/souporcell/souporcell b/target/executable/genetic_demux/souporcell/souporcell new file mode 100755 index 00000000..4094c1c8 --- /dev/null +++ b/target/executable/genetic_demux/souporcell/souporcell @@ -0,0 +1,1477 @@ +#!/usr/bin/env bash + +# souporcell main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="souporcell" +VIASH_META_FUNCTIONALITY_NAME="souporcell" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM cumulusprod/souporcell:2022.12 +ENTRYPOINT [] +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux souporcell" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "souporcell main" + echo "" + echo "souporcell is a method for clustering mixed-genotype scRNAseq experiments by" + echo "individual." + echo "" + echo "Input:" + echo " --fasta" + echo " type: file, file must exist" + echo " reference fasta file" + echo "" + echo " --bam" + echo " type: file, file must exist" + echo " cellranger bam" + echo "" + echo " --bam_index" + echo " type: file, file must exist" + echo " cellranger bam index" + echo "" + echo " --barcodes" + echo " type: file, file must exist" + echo " barcodes.tsv from cellranger" + echo "" + echo " --clusters" + echo " type: integer" + echo " number cluster, tbd add easy way to run on a range of k" + echo "" + echo " --ploidy" + echo " type: integer" + echo " default: 2" + echo " ploidy, must be 1 or 2" + echo "" + echo " --min_alt" + echo " type: integer" + echo " default: 10" + echo " min alt to use locus" + echo "" + echo " --min_ref" + echo " type: integer" + echo " default: 10" + echo " min ref to use locus" + echo "" + echo " --max_loci" + echo " type: integer" + echo " default: 2048" + echo " max loci per cell, affects speed" + echo "" + echo " --restarts" + echo " type: integer" + echo " number of restarts in clustering, when there are > 12 clusters we" + echo " recommend increasing this to avoid local minima" + echo "" + echo " --common_variants" + echo " type: file, file must exist" + echo " common variant loci or known variant loci vcf, must be vs same reference" + echo " fasta" + echo "" + echo " --known_genotypes" + echo " type: file, file must exist" + echo " known variants per clone in population vcf mode, must be .vcf right now" + echo " we dont accept gzip or bcf sorry" + echo "" + echo " --known_genotypes_sample_names" + echo " type: string" + echo " which samples in population vcf from known genotypes option represent" + echo " the donors in your sample" + echo "" + echo " --skip_remap" + echo " type: boolean_true" + echo " dont remap with minimap2 (not recommended unless in conjunction with" + echo " --common_variants" + echo "" + echo " --ignore" + echo " type: boolean_true" + echo " set to True to ignore data error assertions" + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: souporcell_out" + echo " name of directory to place souporcell files" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "souporcell main" + exit + ;; + --fasta) + [ -n "$VIASH_PAR_FASTA" ] && ViashError Bad arguments for option \'--fasta\': \'$VIASH_PAR_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fasta=*) + [ -n "$VIASH_PAR_FASTA" ] && ViashError Bad arguments for option \'--fasta=*\': \'$VIASH_PAR_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam_index) + [ -n "$VIASH_PAR_BAM_INDEX" ] && ViashError Bad arguments for option \'--bam_index\': \'$VIASH_PAR_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam_index=*) + [ -n "$VIASH_PAR_BAM_INDEX" ] && ViashError Bad arguments for option \'--bam_index=*\': \'$VIASH_PAR_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --barcodes) + [ -n "$VIASH_PAR_BARCODES" ] && ViashError Bad arguments for option \'--barcodes\': \'$VIASH_PAR_BARCODES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcodes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcodes=*) + [ -n "$VIASH_PAR_BARCODES" ] && ViashError Bad arguments for option \'--barcodes=*\': \'$VIASH_PAR_BARCODES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clusters) + [ -n "$VIASH_PAR_CLUSTERS" ] && ViashError Bad arguments for option \'--clusters\': \'$VIASH_PAR_CLUSTERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLUSTERS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clusters. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clusters=*) + [ -n "$VIASH_PAR_CLUSTERS" ] && ViashError Bad arguments for option \'--clusters=*\': \'$VIASH_PAR_CLUSTERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLUSTERS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ploidy) + [ -n "$VIASH_PAR_PLOIDY" ] && ViashError Bad arguments for option \'--ploidy\': \'$VIASH_PAR_PLOIDY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLOIDY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ploidy. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ploidy=*) + [ -n "$VIASH_PAR_PLOIDY" ] && ViashError Bad arguments for option \'--ploidy=*\': \'$VIASH_PAR_PLOIDY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PLOIDY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_alt) + [ -n "$VIASH_PAR_MIN_ALT" ] && ViashError Bad arguments for option \'--min_alt\': \'$VIASH_PAR_MIN_ALT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_alt. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_alt=*) + [ -n "$VIASH_PAR_MIN_ALT" ] && ViashError Bad arguments for option \'--min_alt=*\': \'$VIASH_PAR_MIN_ALT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ALT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_ref) + [ -n "$VIASH_PAR_MIN_REF" ] && ViashError Bad arguments for option \'--min_ref\': \'$VIASH_PAR_MIN_REF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_ref. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_ref=*) + [ -n "$VIASH_PAR_MIN_REF" ] && ViashError Bad arguments for option \'--min_ref=*\': \'$VIASH_PAR_MIN_REF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_REF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_loci) + [ -n "$VIASH_PAR_MAX_LOCI" ] && ViashError Bad arguments for option \'--max_loci\': \'$VIASH_PAR_MAX_LOCI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_LOCI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_loci. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_loci=*) + [ -n "$VIASH_PAR_MAX_LOCI" ] && ViashError Bad arguments for option \'--max_loci=*\': \'$VIASH_PAR_MAX_LOCI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_LOCI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --restarts) + [ -n "$VIASH_PAR_RESTARTS" ] && ViashError Bad arguments for option \'--restarts\': \'$VIASH_PAR_RESTARTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RESTARTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --restarts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --restarts=*) + [ -n "$VIASH_PAR_RESTARTS" ] && ViashError Bad arguments for option \'--restarts=*\': \'$VIASH_PAR_RESTARTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RESTARTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --common_variants) + [ -n "$VIASH_PAR_COMMON_VARIANTS" ] && ViashError Bad arguments for option \'--common_variants\': \'$VIASH_PAR_COMMON_VARIANTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COMMON_VARIANTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --common_variants. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --common_variants=*) + [ -n "$VIASH_PAR_COMMON_VARIANTS" ] && ViashError Bad arguments for option \'--common_variants=*\': \'$VIASH_PAR_COMMON_VARIANTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COMMON_VARIANTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --known_genotypes) + [ -n "$VIASH_PAR_KNOWN_GENOTYPES" ] && ViashError Bad arguments for option \'--known_genotypes\': \'$VIASH_PAR_KNOWN_GENOTYPES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNOWN_GENOTYPES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --known_genotypes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --known_genotypes=*) + [ -n "$VIASH_PAR_KNOWN_GENOTYPES" ] && ViashError Bad arguments for option \'--known_genotypes=*\': \'$VIASH_PAR_KNOWN_GENOTYPES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNOWN_GENOTYPES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --known_genotypes_sample_names) + [ -n "$VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES" ] && ViashError Bad arguments for option \'--known_genotypes_sample_names\': \'$VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --known_genotypes_sample_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --known_genotypes_sample_names=*) + [ -n "$VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES" ] && ViashError Bad arguments for option \'--known_genotypes_sample_names=*\': \'$VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --skip_remap) + [ -n "$VIASH_PAR_SKIP_REMAP" ] && ViashError Bad arguments for option \'--skip_remap\': \'$VIASH_PAR_SKIP_REMAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SKIP_REMAP=true + shift 1 + ;; + --ignore) + [ -n "$VIASH_PAR_IGNORE" ] && ViashError Bad arguments for option \'--ignore\': \'$VIASH_PAR_IGNORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IGNORE=true + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/souporcell:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_PLOIDY+x} ]; then + VIASH_PAR_PLOIDY="2" +fi +if [ -z ${VIASH_PAR_MIN_ALT+x} ]; then + VIASH_PAR_MIN_ALT="10" +fi +if [ -z ${VIASH_PAR_MIN_REF+x} ]; then + VIASH_PAR_MIN_REF="10" +fi +if [ -z ${VIASH_PAR_MAX_LOCI+x} ]; then + VIASH_PAR_MAX_LOCI="2048" +fi +if [ -z ${VIASH_PAR_SKIP_REMAP+x} ]; then + VIASH_PAR_SKIP_REMAP="false" +fi +if [ -z ${VIASH_PAR_IGNORE+x} ]; then + VIASH_PAR_IGNORE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_FASTA" ] && [ ! -e "$VIASH_PAR_FASTA" ]; then + ViashError "Input file '$VIASH_PAR_FASTA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Input file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM_INDEX" ] && [ ! -e "$VIASH_PAR_BAM_INDEX" ]; then + ViashError "Input file '$VIASH_PAR_BAM_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BARCODES" ] && [ ! -e "$VIASH_PAR_BARCODES" ]; then + ViashError "Input file '$VIASH_PAR_BARCODES' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_COMMON_VARIANTS" ] && [ ! -e "$VIASH_PAR_COMMON_VARIANTS" ]; then + ViashError "Input file '$VIASH_PAR_COMMON_VARIANTS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_KNOWN_GENOTYPES" ] && [ ! -e "$VIASH_PAR_KNOWN_GENOTYPES" ]; then + ViashError "Input file '$VIASH_PAR_KNOWN_GENOTYPES' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_CLUSTERS" ]]; then + if ! [[ "$VIASH_PAR_CLUSTERS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clusters' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PLOIDY" ]]; then + if ! [[ "$VIASH_PAR_PLOIDY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--ploidy' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ALT" ]]; then + if ! [[ "$VIASH_PAR_MIN_ALT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_alt' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_REF" ]]; then + if ! [[ "$VIASH_PAR_MIN_REF" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_ref' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_LOCI" ]]; then + if ! [[ "$VIASH_PAR_MAX_LOCI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_loci' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RESTARTS" ]]; then + if ! [[ "$VIASH_PAR_RESTARTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--restarts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SKIP_REMAP" ]]; then + if ! [[ "$VIASH_PAR_SKIP_REMAP" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--skip_remap' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_IGNORE" ]]; then + if ! [[ "$VIASH_PAR_IGNORE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--ignore' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_FASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FASTA")" ) + VIASH_PAR_FASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_FASTA") +fi +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") +fi +if [ ! -z "$VIASH_PAR_BAM_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM_INDEX")" ) + VIASH_PAR_BAM_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM_INDEX") +fi +if [ ! -z "$VIASH_PAR_BARCODES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODES")" ) + VIASH_PAR_BARCODES=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODES") +fi +if [ ! -z "$VIASH_PAR_COMMON_VARIANTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_COMMON_VARIANTS")" ) + VIASH_PAR_COMMON_VARIANTS=$(ViashDockerAutodetectMount "$VIASH_PAR_COMMON_VARIANTS") +fi +if [ ! -z "$VIASH_PAR_KNOWN_GENOTYPES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_KNOWN_GENOTYPES")" ) + VIASH_PAR_KNOWN_GENOTYPES=$(ViashDockerAutodetectMount "$VIASH_PAR_KNOWN_GENOTYPES") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-souporcell-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_FASTA+x} ]; then echo "${VIASH_PAR_FASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_fasta='&'#" ; else echo "# par_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM_INDEX+x} ]; then echo "${VIASH_PAR_BAM_INDEX}" | sed "s#'#'\"'\"'#g;s#.*#par_bam_index='&'#" ; else echo "# par_bam_index="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODES+x} ]; then echo "${VIASH_PAR_BARCODES}" | sed "s#'#'\"'\"'#g;s#.*#par_barcodes='&'#" ; else echo "# par_barcodes="; fi ) +$( if [ ! -z ${VIASH_PAR_CLUSTERS+x} ]; then echo "${VIASH_PAR_CLUSTERS}" | sed "s#'#'\"'\"'#g;s#.*#par_clusters='&'#" ; else echo "# par_clusters="; fi ) +$( if [ ! -z ${VIASH_PAR_PLOIDY+x} ]; then echo "${VIASH_PAR_PLOIDY}" | sed "s#'#'\"'\"'#g;s#.*#par_ploidy='&'#" ; else echo "# par_ploidy="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALT+x} ]; then echo "${VIASH_PAR_MIN_ALT}" | sed "s#'#'\"'\"'#g;s#.*#par_min_alt='&'#" ; else echo "# par_min_alt="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REF+x} ]; then echo "${VIASH_PAR_MIN_REF}" | sed "s#'#'\"'\"'#g;s#.*#par_min_ref='&'#" ; else echo "# par_min_ref="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_LOCI+x} ]; then echo "${VIASH_PAR_MAX_LOCI}" | sed "s#'#'\"'\"'#g;s#.*#par_max_loci='&'#" ; else echo "# par_max_loci="; fi ) +$( if [ ! -z ${VIASH_PAR_RESTARTS+x} ]; then echo "${VIASH_PAR_RESTARTS}" | sed "s#'#'\"'\"'#g;s#.*#par_restarts='&'#" ; else echo "# par_restarts="; fi ) +$( if [ ! -z ${VIASH_PAR_COMMON_VARIANTS+x} ]; then echo "${VIASH_PAR_COMMON_VARIANTS}" | sed "s#'#'\"'\"'#g;s#.*#par_common_variants='&'#" ; else echo "# par_common_variants="; fi ) +$( if [ ! -z ${VIASH_PAR_KNOWN_GENOTYPES+x} ]; then echo "${VIASH_PAR_KNOWN_GENOTYPES}" | sed "s#'#'\"'\"'#g;s#.*#par_known_genotypes='&'#" ; else echo "# par_known_genotypes="; fi ) +$( if [ ! -z ${VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES+x} ]; then echo "${VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES}" | sed "s#'#'\"'\"'#g;s#.*#par_known_genotypes_sample_names='&'#" ; else echo "# par_known_genotypes_sample_names="; fi ) +$( if [ ! -z ${VIASH_PAR_SKIP_REMAP+x} ]; then echo "${VIASH_PAR_SKIP_REMAP}" | sed "s#'#'\"'\"'#g;s#.*#par_skip_remap='&'#" ; else echo "# par_skip_remap="; fi ) +$( if [ ! -z ${VIASH_PAR_IGNORE+x} ]; then echo "${VIASH_PAR_IGNORE}" | sed "s#'#'\"'\"'#g;s#.*#par_ignore='&'#" ; else echo "# par_ignore="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\$par_skip_remap" == "false" ]] && unset par_skip_remap +[[ "\$par_ignore" == "false" ]] && unset par_ignore + +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi + +/opt/souporcell/souporcell_pipeline.py \\ + --bam \$par_bam \\ + --fasta \$par_fasta \\ + --barcodes \$par_barcodes \\ + --clusters \$par_clusters \\ + --ploidy \$par_ploidy \\ + --min_alt \$par_min_alt \\ + --min_ref \$par_min_ref \\ + --max_loci \$par_max_loci \\ + --out_dir \$par_output \\ + --threads \${par_threads:=1} \\ + \${par_restarts:+--restarts \$par_restarts} \\ + \${par_common_variants:+--common_variants \$par_common_variants} \\ + \${par_known_genotypes:+--known_genotypes \$par_known_genotypes} \\ + \${par_known_genotypes_sample_names:+--known_genotypes_sample_names \$par_known_genotypes_sample_names} \\ + \${par_skip_remap:+--skip_remap True} \\ + \${par_ignore:+--ignore True} + +cut -d\$'\\t' -f 1-3 "\$par_output/clusters.tsv" | +sed 's/\\t/,/g' | +awk 'BEGIN{FS=OFS=","} {\$2=(\$2=="singlet")?\$3:\$2; NF=NF-1; print}' | +sed '1s/barcode,status/cell,donor_id/' > "\$par_output/cell_annotation.csv" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_FASTA" ]; then + VIASH_PAR_FASTA=$(ViashDockerStripAutomount "$VIASH_PAR_FASTA") + fi + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_BAM_INDEX" ]; then + VIASH_PAR_BAM_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_BAM_INDEX") + fi + if [ ! -z "$VIASH_PAR_BARCODES" ]; then + VIASH_PAR_BARCODES=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODES") + fi + if [ ! -z "$VIASH_PAR_COMMON_VARIANTS" ]; then + VIASH_PAR_COMMON_VARIANTS=$(ViashDockerStripAutomount "$VIASH_PAR_COMMON_VARIANTS") + fi + if [ ! -z "$VIASH_PAR_KNOWN_GENOTYPES" ]; then + VIASH_PAR_KNOWN_GENOTYPES=$(ViashDockerStripAutomount "$VIASH_PAR_KNOWN_GENOTYPES") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/genetic_demux/vireo/.config.vsh.yaml b/target/executable/genetic_demux/vireo/.config.vsh.yaml new file mode 100644 index 00000000..4863ddc2 --- /dev/null +++ b/target/executable/genetic_demux/vireo/.config.vsh.yaml @@ -0,0 +1,354 @@ +name: "vireo" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--cell_data" + description: "The cell genotype file in VCF format or cellSNP folder with sparse\ + \ matrices." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_donor" + description: "Number of donors to demultiplex; can be larger than provided in\ + \ donor_file." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vartrix_data" + description: "The cell genotype files in vartrix outputs." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--donor_file" + description: "The donor genotype file in VCF format. Please filter the sample\ + \ and region with bcftools first!" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--geno_tag" + description: "The tag for donor genotype." + info: null + default: + - "PL" + required: false + choices: + - "GT" + - "GP" + - "PL" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_doublet" + description: "If use, not checking doublets." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_init" + description: "Number of random initializations, when GT needs to learn." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--extra_donor" + description: "Number of extra donor in pre-cluster, when GT needs to learn." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--extra_donorMode" + description: "Method for searching from extra donors. size: n_cell per donor;\ + \ distance: GT distance between donors" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--force_learn_gt" + description: "If use, treat donor GT as prior only." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--ase_mode" + description: "If use, turn on SNP specific allelic ratio." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_plot" + description: "If use, turn off plotting GT distance." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rand_seed" + description: "Seed for random initialization" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_range" + description: "Range of cells to process." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--call_ambient_rnas" + description: "If use, detect ambient RNAs in each cell." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "vireo" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Vireo is primarily designed for demultiplexing cells into donors by\ + \ modelling of expressed alleles." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "vireo_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "threadpoolctl" + - "vireoSNP" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/vireo/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/genetic_demux/vireo" + executable: "target/executable/genetic_demux/vireo/vireo" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/genetic_demux/vireo/nextflow_labels.config b/target/executable/genetic_demux/vireo/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/genetic_demux/vireo/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/genetic_demux/vireo/vireo b/target/executable/genetic_demux/vireo/vireo new file mode 100755 index 00000000..7a106d43 --- /dev/null +++ b/target/executable/genetic_demux/vireo/vireo @@ -0,0 +1,1494 @@ +#!/usr/bin/env bash + +# vireo main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Xichen Wu (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="vireo" +VIASH_META_FUNCTIONALITY_NAME="vireo" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "threadpoolctl" "vireoSNP" + +LABEL org.opencontainers.image.authors="Xichen Wu" +LABEL org.opencontainers.image.description="Companion container for running component genetic_demux vireo" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "vireo main" + echo "" + echo "Vireo is primarily designed for demultiplexing cells into donors by modelling of" + echo "expressed alleles." + echo "" + echo "Input:" + echo " --cell_data" + echo " type: file, file must exist" + echo " The cell genotype file in VCF format or cellSNP folder with sparse" + echo " matrices." + echo "" + echo " --n_donor" + echo " type: integer" + echo " default: 2" + echo " Number of donors to demultiplex; can be larger than provided in" + echo " donor_file." + echo "" + echo " --vartrix_data" + echo " type: file, file must exist" + echo " The cell genotype files in vartrix outputs." + echo "" + echo " --donor_file" + echo " type: file, file must exist" + echo " The donor genotype file in VCF format. Please filter the sample and" + echo " region with bcftools first!" + echo "" + echo " --geno_tag" + echo " type: string" + echo " default: PL" + echo " choices: [ GT, GP, PL ]" + echo " The tag for donor genotype." + echo "" + echo " --no_doublet" + echo " type: boolean" + echo " default: false" + echo " If use, not checking doublets." + echo "" + echo " --n_init" + echo " type: integer" + echo " default: 50" + echo " Number of random initializations, when GT needs to learn." + echo "" + echo " --extra_donor" + echo " type: integer" + echo " default: 0" + echo " Number of extra donor in pre-cluster, when GT needs to learn." + echo "" + echo " --extra_donorMode" + echo " type: string" + echo " Method for searching from extra donors. size: n_cell per donor;" + echo " distance: GT distance between donors" + echo "" + echo " --force_learn_gt" + echo " type: boolean" + echo " default: false" + echo " If use, treat donor GT as prior only." + echo "" + echo " --ase_mode" + echo " type: boolean" + echo " default: false" + echo " If use, turn on SNP specific allelic ratio." + echo "" + echo " --no_plot" + echo " type: boolean" + echo " default: false" + echo " If use, turn off plotting GT distance." + echo "" + echo " --rand_seed" + echo " type: integer" + echo " Seed for random initialization" + echo "" + echo " --cell_range" + echo " type: string" + echo " Range of cells to process." + echo "" + echo " --call_ambient_rnas" + echo " type: boolean" + echo " default: false" + echo " If use, detect ambient RNAs in each cell." + echo "" + echo "Output:" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: vireo" + echo " Output directory" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "vireo main" + exit + ;; + --cell_data) + [ -n "$VIASH_PAR_CELL_DATA" ] && ViashError Bad arguments for option \'--cell_data\': \'$VIASH_PAR_CELL_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_DATA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_data. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_data=*) + [ -n "$VIASH_PAR_CELL_DATA" ] && ViashError Bad arguments for option \'--cell_data=*\': \'$VIASH_PAR_CELL_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_DATA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_donor) + [ -n "$VIASH_PAR_N_DONOR" ] && ViashError Bad arguments for option \'--n_donor\': \'$VIASH_PAR_N_DONOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DONOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_donor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_donor=*) + [ -n "$VIASH_PAR_N_DONOR" ] && ViashError Bad arguments for option \'--n_donor=*\': \'$VIASH_PAR_N_DONOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DONOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vartrix_data) + [ -n "$VIASH_PAR_VARTRIX_DATA" ] && ViashError Bad arguments for option \'--vartrix_data\': \'$VIASH_PAR_VARTRIX_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARTRIX_DATA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vartrix_data. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vartrix_data=*) + [ -n "$VIASH_PAR_VARTRIX_DATA" ] && ViashError Bad arguments for option \'--vartrix_data=*\': \'$VIASH_PAR_VARTRIX_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARTRIX_DATA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --donor_file) + [ -n "$VIASH_PAR_DONOR_FILE" ] && ViashError Bad arguments for option \'--donor_file\': \'$VIASH_PAR_DONOR_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DONOR_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --donor_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --donor_file=*) + [ -n "$VIASH_PAR_DONOR_FILE" ] && ViashError Bad arguments for option \'--donor_file=*\': \'$VIASH_PAR_DONOR_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DONOR_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --geno_tag) + [ -n "$VIASH_PAR_GENO_TAG" ] && ViashError Bad arguments for option \'--geno_tag\': \'$VIASH_PAR_GENO_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_TAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --geno_tag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --geno_tag=*) + [ -n "$VIASH_PAR_GENO_TAG" ] && ViashError Bad arguments for option \'--geno_tag=*\': \'$VIASH_PAR_GENO_TAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENO_TAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_doublet) + [ -n "$VIASH_PAR_NO_DOUBLET" ] && ViashError Bad arguments for option \'--no_doublet\': \'$VIASH_PAR_NO_DOUBLET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_DOUBLET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --no_doublet. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --no_doublet=*) + [ -n "$VIASH_PAR_NO_DOUBLET" ] && ViashError Bad arguments for option \'--no_doublet=*\': \'$VIASH_PAR_NO_DOUBLET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_DOUBLET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_init) + [ -n "$VIASH_PAR_N_INIT" ] && ViashError Bad arguments for option \'--n_init\': \'$VIASH_PAR_N_INIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INIT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_init. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_init=*) + [ -n "$VIASH_PAR_N_INIT" ] && ViashError Bad arguments for option \'--n_init=*\': \'$VIASH_PAR_N_INIT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INIT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --extra_donor) + [ -n "$VIASH_PAR_EXTRA_DONOR" ] && ViashError Bad arguments for option \'--extra_donor\': \'$VIASH_PAR_EXTRA_DONOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_DONOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --extra_donor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --extra_donor=*) + [ -n "$VIASH_PAR_EXTRA_DONOR" ] && ViashError Bad arguments for option \'--extra_donor=*\': \'$VIASH_PAR_EXTRA_DONOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_DONOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --extra_donorMode) + [ -n "$VIASH_PAR_EXTRA_DONORMODE" ] && ViashError Bad arguments for option \'--extra_donorMode\': \'$VIASH_PAR_EXTRA_DONORMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_DONORMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --extra_donorMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --extra_donorMode=*) + [ -n "$VIASH_PAR_EXTRA_DONORMODE" ] && ViashError Bad arguments for option \'--extra_donorMode=*\': \'$VIASH_PAR_EXTRA_DONORMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_DONORMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_learn_gt) + [ -n "$VIASH_PAR_FORCE_LEARN_GT" ] && ViashError Bad arguments for option \'--force_learn_gt\': \'$VIASH_PAR_FORCE_LEARN_GT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_LEARN_GT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --force_learn_gt. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_learn_gt=*) + [ -n "$VIASH_PAR_FORCE_LEARN_GT" ] && ViashError Bad arguments for option \'--force_learn_gt=*\': \'$VIASH_PAR_FORCE_LEARN_GT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_LEARN_GT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ase_mode) + [ -n "$VIASH_PAR_ASE_MODE" ] && ViashError Bad arguments for option \'--ase_mode\': \'$VIASH_PAR_ASE_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASE_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ase_mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ase_mode=*) + [ -n "$VIASH_PAR_ASE_MODE" ] && ViashError Bad arguments for option \'--ase_mode=*\': \'$VIASH_PAR_ASE_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ASE_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_plot) + [ -n "$VIASH_PAR_NO_PLOT" ] && ViashError Bad arguments for option \'--no_plot\': \'$VIASH_PAR_NO_PLOT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_PLOT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --no_plot. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --no_plot=*) + [ -n "$VIASH_PAR_NO_PLOT" ] && ViashError Bad arguments for option \'--no_plot=*\': \'$VIASH_PAR_NO_PLOT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_PLOT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rand_seed) + [ -n "$VIASH_PAR_RAND_SEED" ] && ViashError Bad arguments for option \'--rand_seed\': \'$VIASH_PAR_RAND_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RAND_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rand_seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rand_seed=*) + [ -n "$VIASH_PAR_RAND_SEED" ] && ViashError Bad arguments for option \'--rand_seed=*\': \'$VIASH_PAR_RAND_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RAND_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_range) + [ -n "$VIASH_PAR_CELL_RANGE" ] && ViashError Bad arguments for option \'--cell_range\': \'$VIASH_PAR_CELL_RANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_RANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_range. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_range=*) + [ -n "$VIASH_PAR_CELL_RANGE" ] && ViashError Bad arguments for option \'--cell_range=*\': \'$VIASH_PAR_CELL_RANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_RANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --call_ambient_rnas) + [ -n "$VIASH_PAR_CALL_AMBIENT_RNAS" ] && ViashError Bad arguments for option \'--call_ambient_rnas\': \'$VIASH_PAR_CALL_AMBIENT_RNAS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CALL_AMBIENT_RNAS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --call_ambient_rnas. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --call_ambient_rnas=*) + [ -n "$VIASH_PAR_CALL_AMBIENT_RNAS" ] && ViashError Bad arguments for option \'--call_ambient_rnas=*\': \'$VIASH_PAR_CALL_AMBIENT_RNAS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CALL_AMBIENT_RNAS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/genetic_demux/vireo:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_N_DONOR+x} ]; then + VIASH_PAR_N_DONOR="2" +fi +if [ -z ${VIASH_PAR_GENO_TAG+x} ]; then + VIASH_PAR_GENO_TAG="PL" +fi +if [ -z ${VIASH_PAR_NO_DOUBLET+x} ]; then + VIASH_PAR_NO_DOUBLET="false" +fi +if [ -z ${VIASH_PAR_N_INIT+x} ]; then + VIASH_PAR_N_INIT="50" +fi +if [ -z ${VIASH_PAR_EXTRA_DONOR+x} ]; then + VIASH_PAR_EXTRA_DONOR="0" +fi +if [ -z ${VIASH_PAR_FORCE_LEARN_GT+x} ]; then + VIASH_PAR_FORCE_LEARN_GT="false" +fi +if [ -z ${VIASH_PAR_ASE_MODE+x} ]; then + VIASH_PAR_ASE_MODE="false" +fi +if [ -z ${VIASH_PAR_NO_PLOT+x} ]; then + VIASH_PAR_NO_PLOT="false" +fi +if [ -z ${VIASH_PAR_CALL_AMBIENT_RNAS+x} ]; then + VIASH_PAR_CALL_AMBIENT_RNAS="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_CELL_DATA" ] && [ ! -e "$VIASH_PAR_CELL_DATA" ]; then + ViashError "Input file '$VIASH_PAR_CELL_DATA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VARTRIX_DATA" ] && [ ! -e "$VIASH_PAR_VARTRIX_DATA" ]; then + ViashError "Input file '$VIASH_PAR_VARTRIX_DATA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_DONOR_FILE" ] && [ ! -e "$VIASH_PAR_DONOR_FILE" ]; then + ViashError "Input file '$VIASH_PAR_DONOR_FILE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_DONOR" ]]; then + if ! [[ "$VIASH_PAR_N_DONOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_donor' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_DOUBLET" ]]; then + if ! [[ "$VIASH_PAR_NO_DOUBLET" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_doublet' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_INIT" ]]; then + if ! [[ "$VIASH_PAR_N_INIT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_init' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXTRA_DONOR" ]]; then + if ! [[ "$VIASH_PAR_EXTRA_DONOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--extra_donor' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FORCE_LEARN_GT" ]]; then + if ! [[ "$VIASH_PAR_FORCE_LEARN_GT" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--force_learn_gt' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ASE_MODE" ]]; then + if ! [[ "$VIASH_PAR_ASE_MODE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--ase_mode' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_PLOT" ]]; then + if ! [[ "$VIASH_PAR_NO_PLOT" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_plot' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RAND_SEED" ]]; then + if ! [[ "$VIASH_PAR_RAND_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--rand_seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CALL_AMBIENT_RNAS" ]]; then + if ! [[ "$VIASH_PAR_CALL_AMBIENT_RNAS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--call_ambient_rnas' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_GENO_TAG" ]; then + VIASH_PAR_GENO_TAG_CHOICES=("GT;GP;PL") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_GENO_TAG_CHOICES[*]};" =~ ";$VIASH_PAR_GENO_TAG;" ]]; then + ViashError '--geno_tag' specified value of \'$VIASH_PAR_GENO_TAG\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_CELL_DATA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CELL_DATA")" ) + VIASH_PAR_CELL_DATA=$(ViashDockerAutodetectMount "$VIASH_PAR_CELL_DATA") +fi +if [ ! -z "$VIASH_PAR_VARTRIX_DATA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VARTRIX_DATA")" ) + VIASH_PAR_VARTRIX_DATA=$(ViashDockerAutodetectMount "$VIASH_PAR_VARTRIX_DATA") +fi +if [ ! -z "$VIASH_PAR_DONOR_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_DONOR_FILE")" ) + VIASH_PAR_DONOR_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_DONOR_FILE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-vireo-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_CELL_DATA+x} ]; then echo "${VIASH_PAR_CELL_DATA}" | sed "s#'#'\"'\"'#g;s#.*#par_cell_data='&'#" ; else echo "# par_cell_data="; fi ) +$( if [ ! -z ${VIASH_PAR_N_DONOR+x} ]; then echo "${VIASH_PAR_N_DONOR}" | sed "s#'#'\"'\"'#g;s#.*#par_n_donor='&'#" ; else echo "# par_n_donor="; fi ) +$( if [ ! -z ${VIASH_PAR_VARTRIX_DATA+x} ]; then echo "${VIASH_PAR_VARTRIX_DATA}" | sed "s#'#'\"'\"'#g;s#.*#par_vartrix_data='&'#" ; else echo "# par_vartrix_data="; fi ) +$( if [ ! -z ${VIASH_PAR_DONOR_FILE+x} ]; then echo "${VIASH_PAR_DONOR_FILE}" | sed "s#'#'\"'\"'#g;s#.*#par_donor_file='&'#" ; else echo "# par_donor_file="; fi ) +$( if [ ! -z ${VIASH_PAR_GENO_TAG+x} ]; then echo "${VIASH_PAR_GENO_TAG}" | sed "s#'#'\"'\"'#g;s#.*#par_geno_tag='&'#" ; else echo "# par_geno_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_DOUBLET+x} ]; then echo "${VIASH_PAR_NO_DOUBLET}" | sed "s#'#'\"'\"'#g;s#.*#par_no_doublet='&'#" ; else echo "# par_no_doublet="; fi ) +$( if [ ! -z ${VIASH_PAR_N_INIT+x} ]; then echo "${VIASH_PAR_N_INIT}" | sed "s#'#'\"'\"'#g;s#.*#par_n_init='&'#" ; else echo "# par_n_init="; fi ) +$( if [ ! -z ${VIASH_PAR_EXTRA_DONOR+x} ]; then echo "${VIASH_PAR_EXTRA_DONOR}" | sed "s#'#'\"'\"'#g;s#.*#par_extra_donor='&'#" ; else echo "# par_extra_donor="; fi ) +$( if [ ! -z ${VIASH_PAR_EXTRA_DONORMODE+x} ]; then echo "${VIASH_PAR_EXTRA_DONORMODE}" | sed "s#'#'\"'\"'#g;s#.*#par_extra_donorMode='&'#" ; else echo "# par_extra_donorMode="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_LEARN_GT+x} ]; then echo "${VIASH_PAR_FORCE_LEARN_GT}" | sed "s#'#'\"'\"'#g;s#.*#par_force_learn_gt='&'#" ; else echo "# par_force_learn_gt="; fi ) +$( if [ ! -z ${VIASH_PAR_ASE_MODE+x} ]; then echo "${VIASH_PAR_ASE_MODE}" | sed "s#'#'\"'\"'#g;s#.*#par_ase_mode='&'#" ; else echo "# par_ase_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_PLOT+x} ]; then echo "${VIASH_PAR_NO_PLOT}" | sed "s#'#'\"'\"'#g;s#.*#par_no_plot='&'#" ; else echo "# par_no_plot="; fi ) +$( if [ ! -z ${VIASH_PAR_RAND_SEED+x} ]; then echo "${VIASH_PAR_RAND_SEED}" | sed "s#'#'\"'\"'#g;s#.*#par_rand_seed='&'#" ; else echo "# par_rand_seed="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_RANGE+x} ]; then echo "${VIASH_PAR_CELL_RANGE}" | sed "s#'#'\"'\"'#g;s#.*#par_cell_range='&'#" ; else echo "# par_cell_range="; fi ) +$( if [ ! -z ${VIASH_PAR_CALL_AMBIENT_RNAS+x} ]; then echo "${VIASH_PAR_CALL_AMBIENT_RNAS}" | sed "s#'#'\"'\"'#g;s#.*#par_call_ambient_rnas='&'#" ; else echo "# par_call_ambient_rnas="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\$par_no_doublet" == "false" ]] && unset par_no_doublet +[[ "\$par_force_learn_gt" == "false" ]] && unset par_force_learn_gt +[[ "\$par_ase_mode" == "false" ]] && unset par_ase_mode +[[ "\$par_no_plot" == "false" ]] && unset par_no_plot +[[ "\$par_call_ambient_rnas" == "false" ]] && unset par_call_ambient_rnas + +if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" +fi + +vireo \\ + --cellData \$par_cell_data \\ + --nDonor \$par_n_donor \\ + --genoTag \$par_geno_tag \\ + --nInit \$par_n_init \\ + --extraDonor \$par_extra_donor \\ + --out "\${par_output}" \\ + \${par_vartrix_data:+--vatrixData \$par_vartrix_data} \\ + \${par_donor_file:+--donorFile \$par_donor_file} \\ + \${par_no_doublet:+--noDoublet} \\ + \${par_extra_donorMode:+--extraDonorMode \$par_extra_donorMode} \\ + \${par_force_learn_gt:+--forceLearnGT} \\ + \${par_ase_mode:+--ASEmode} \\ + \${par_no_plot:+--noPlot} \\ + \${par_rand_seed:+--randSeed \$par_rand_seed} \\ + \${par_cell_range:+--cellRange \$par_cell_range} \\ + \${par_call_ambient_rnas:+--callAmbientRNAs} \\ + \${meta_cpus:+--nproc \$meta_cpus} + +cut -d\$'\\t' -f 1-2 "\$par_output/donor_ids.tsv" | tr '\\t' ',' > "\$par_output/cell_annotation.csv" +awk 'BEGIN{FS=OFS=","} NR>1{ gsub("donor", "", \$2) } 1' "\$par_output/cell_annotation.csv" > "\$par_output/cell_annotation_temp.csv" && mv "\$par_output/cell_annotation_temp.csv" "\$par_output/cell_annotation.csv" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_CELL_DATA" ]; then + VIASH_PAR_CELL_DATA=$(ViashDockerStripAutomount "$VIASH_PAR_CELL_DATA") + fi + if [ ! -z "$VIASH_PAR_VARTRIX_DATA" ]; then + VIASH_PAR_VARTRIX_DATA=$(ViashDockerStripAutomount "$VIASH_PAR_VARTRIX_DATA") + fi + if [ ! -z "$VIASH_PAR_DONOR_FILE" ]; then + VIASH_PAR_DONOR_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_DONOR_FILE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/integrate/harmonypy/.config.vsh.yaml b/target/executable/integrate/harmonypy/.config.vsh.yaml new file mode 100644 index 00000000..028d33f3 --- /dev/null +++ b/target/executable/integrate/harmonypy/.config.vsh.yaml @@ -0,0 +1,322 @@ +name: "harmonypy" +namespace: "integrate" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Which .obsm slot to use as a starting PCA embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_pca_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--theta" + description: "Diversity clustering penalty parameter. Can be set as a single value\ + \ for all batch observations or as multiple values, one for each observation\ + \ in the batches defined by --obs_covariates. theta=0 does not encourage any\ + \ diversity. Larger values of theta result in more diverse clusters." + info: null + default: + - 2.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--obs_covariates" + description: "The .obs field(s) that define the covariate(s) to regress out." + info: null + example: + - "batch" + - "sample" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony.\ + \ Based on an implementation in python from https://github.com/slowkow/harmonypy" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "harmonypy~=0.0.6" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/harmonypy/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integrate/harmonypy" + executable: "target/executable/integrate/harmonypy/harmonypy" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/integrate/harmonypy/compress_h5mu.py b/target/executable/integrate/harmonypy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/integrate/harmonypy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/integrate/harmonypy/harmonypy b/target/executable/integrate/harmonypy/harmonypy new file mode 100755 index 00000000..67cc8f59 --- /dev/null +++ b/target/executable/integrate/harmonypy/harmonypy @@ -0,0 +1,1330 @@ +#!/usr/bin/env bash + +# harmonypy main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Robrecht Cannoodt (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="harmonypy" +VIASH_META_FUNCTIONALITY_NAME="harmonypy" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "harmonypy~=0.0.6" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component integrate harmonypy" +LABEL org.opencontainers.image.created="2025-08-22T07:23:10Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "harmonypy main" + echo "" + echo "Performs Harmony integration based as described in" + echo "https://github.com/immunogenomics/harmony. Based on an implementation in python" + echo "from https://github.com/slowkow/harmonypy" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obsm_input" + echo " type: string" + echo " default: X_pca" + echo " Which .obsm slot to use as a starting PCA embedding." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_pca_integrated" + echo " In which .obsm slot to store the resulting integrated embedding." + echo "" + echo " --theta" + echo " type: double, multiple values allowed" + echo " default: 2.0" + echo " Diversity clustering penalty parameter. Can be set as a single value for" + echo " all batch observations or as multiple values, one for each observation" + echo " in the batches defined by --obs_covariates. theta=0 does not encourage" + echo " any diversity. Larger values of theta result in more diverse clusters." + echo "" + echo " --obs_covariates" + echo " type: string, required parameter, multiple values allowed" + echo " example: batch;sample" + echo " The .obs field(s) that define the covariate(s) to regress out." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "harmonypy main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_input) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_input=*) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input=*\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --theta) + if [ -z "$VIASH_PAR_THETA" ]; then + VIASH_PAR_THETA="$2" + else + VIASH_PAR_THETA="$VIASH_PAR_THETA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --theta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --theta=*) + if [ -z "$VIASH_PAR_THETA" ]; then + VIASH_PAR_THETA=$(ViashRemoveFlags "$1") + else + VIASH_PAR_THETA="$VIASH_PAR_THETA;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --obs_covariates) + if [ -z "$VIASH_PAR_OBS_COVARIATES" ]; then + VIASH_PAR_OBS_COVARIATES="$2" + else + VIASH_PAR_OBS_COVARIATES="$VIASH_PAR_OBS_COVARIATES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_covariates. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_covariates=*) + if [ -z "$VIASH_PAR_OBS_COVARIATES" ]; then + VIASH_PAR_OBS_COVARIATES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_COVARIATES="$VIASH_PAR_OBS_COVARIATES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/integrate/harmonypy:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_COVARIATES+x} ]; then + ViashError '--obs_covariates' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_INPUT+x} ]; then + VIASH_PAR_OBSM_INPUT="X_pca" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_pca_integrated" +fi +if [ -z ${VIASH_PAR_THETA+x} ]; then + VIASH_PAR_THETA="2.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [ -n "$VIASH_PAR_THETA" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_THETA; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--theta' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-harmonypy-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata +import sys +from harmonypy import run_harmony + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'theta': $( if [ ! -z ${VIASH_PAR_THETA+x} ]; then echo "list(map(float, r'${VIASH_PAR_THETA//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'obs_covariates': $( if [ ! -z ${VIASH_PAR_OBS_COVARIATES+x} ]; then echo "r'${VIASH_PAR_OBS_COVARIATES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s from %s", par["modality"], par["input"]) + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + logger.info("Selecting embedding %s", par["obsm_input"]) + pca_embedding = mod.obsm[par["obsm_input"]] + metadata = mod.obs + logger.info( + "Available covariates to regress out: ", ", ".join(metadata.columns.to_list()) + ) + logger.info( + "Running harmonypy with theta %s to regress out the following variables: ", + par["theta"], + ", ".join(par["obs_covariates"]), + ) + ho = run_harmony(pca_embedding, metadata, par["obs_covariates"], theta=par["theta"]) + logger.info("Adding output to slot %s", par["obsm_output"]) + mod.obsm[par["obsm_output"]] = ho.Z_corr.T + logger.info("Writing to %s") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/integrate/harmonypy/nextflow_labels.config b/target/executable/integrate/harmonypy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/integrate/harmonypy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/integrate/harmonypy/setup_logger.py b/target/executable/integrate/harmonypy/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/integrate/harmonypy/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/integrate/scanorama/.config.vsh.yaml b/target/executable/integrate/scanorama/.config.vsh.yaml new file mode 100644 index 00000000..e858aa88 --- /dev/null +++ b/target/executable/integrate/scanorama/.config.vsh.yaml @@ -0,0 +1,360 @@ +name: "scanorama" +namespace: "integrate" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output .h5mu file" + info: null + default: + - "output.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "batch" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Basis obsm slot to run scanorama on." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The name of the field in adata.obsm where the integrated embeddings\ + \ will be stored after running this function. Defaults to X_scanorama." + info: null + default: + - "X_scanorama" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn" + description: "Number of nearest neighbors to use for matching." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size used in the alignment vector computation. Useful\ + \ when integrating very large (>100k samples) datasets. Set to large value that\ + \ runs within available memory." + info: null + default: + - 5000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--sigma" + description: "Correction smoothing parameter on Gaussian kernel." + info: null + default: + - 15.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--approx" + description: "Use approximate nearest neighbors with Python annoy; greatly speeds\ + \ up matching runtime." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "Alignment score minimum cutoff" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Use Scanorama to integrate different experiments.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scanorama" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scanorama/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integrate/scanorama" + executable: "target/executable/integrate/scanorama/scanorama" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/integrate/scanorama/compress_h5mu.py b/target/executable/integrate/scanorama/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/integrate/scanorama/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/integrate/scanorama/nextflow_labels.config b/target/executable/integrate/scanorama/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/integrate/scanorama/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/integrate/scanorama/scanorama b/target/executable/integrate/scanorama/scanorama new file mode 100755 index 00000000..36196943 --- /dev/null +++ b/target/executable/integrate/scanorama/scanorama @@ -0,0 +1,1418 @@ +#!/usr/bin/env bash + +# scanorama main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (author) +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scanorama" +VIASH_META_FUNCTIONALITY_NAME="scanorama" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps build-essential && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "scanorama" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component integrate scanorama" +LABEL org.opencontainers.image.created="2025-08-22T07:23:10Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scanorama main" + echo "" + echo "Use Scanorama to integrate different experiments." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5ad" + echo " Output .h5mu file" + echo "" + echo " --obs_batch" + echo " type: string" + echo " default: batch" + echo " Column name discriminating between your batches." + echo "" + echo " --obsm_input" + echo " type: string" + echo " default: X_pca" + echo " Basis obsm slot to run scanorama on." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_scanorama" + echo " The name of the field in adata.obsm where the integrated embeddings will" + echo " be stored after running this function. Defaults to X_scanorama." + echo "" + echo " --knn" + echo " type: integer" + echo " default: 20" + echo " Number of nearest neighbors to use for matching." + echo "" + echo " --batch_size" + echo " type: integer" + echo " default: 5000" + echo " The batch size used in the alignment vector computation. Useful when" + echo " integrating very large (>100k samples) datasets. Set to large value that" + echo " runs within available memory." + echo "" + echo " --sigma" + echo " type: double" + echo " default: 15.0" + echo " Correction smoothing parameter on Gaussian kernel." + echo "" + echo " --approx" + echo " type: boolean" + echo " default: true" + echo " Use approximate nearest neighbors with Python annoy; greatly speeds up" + echo " matching runtime." + echo "" + echo " --alpha" + echo " type: double" + echo " default: 0.1" + echo " Alignment score minimum cutoff" + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scanorama main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch=*) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch=*\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_input) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_input=*) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input=*\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --knn) + [ -n "$VIASH_PAR_KNN" ] && ViashError Bad arguments for option \'--knn\': \'$VIASH_PAR_KNN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --knn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --knn=*) + [ -n "$VIASH_PAR_KNN" ] && ViashError Bad arguments for option \'--knn=*\': \'$VIASH_PAR_KNN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_KNN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --batch_size) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --batch_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --batch_size=*) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size=*\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sigma) + [ -n "$VIASH_PAR_SIGMA" ] && ViashError Bad arguments for option \'--sigma\': \'$VIASH_PAR_SIGMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SIGMA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sigma. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sigma=*) + [ -n "$VIASH_PAR_SIGMA" ] && ViashError Bad arguments for option \'--sigma=*\': \'$VIASH_PAR_SIGMA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SIGMA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --approx) + [ -n "$VIASH_PAR_APPROX" ] && ViashError Bad arguments for option \'--approx\': \'$VIASH_PAR_APPROX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_APPROX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --approx. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --approx=*) + [ -n "$VIASH_PAR_APPROX" ] && ViashError Bad arguments for option \'--approx=*\': \'$VIASH_PAR_APPROX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_APPROX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alpha=*) + [ -n "$VIASH_PAR_ALPHA" ] && ViashError Bad arguments for option \'--alpha=*\': \'$VIASH_PAR_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/integrate/scanorama:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBS_BATCH+x} ]; then + VIASH_PAR_OBS_BATCH="batch" +fi +if [ -z ${VIASH_PAR_OBSM_INPUT+x} ]; then + VIASH_PAR_OBSM_INPUT="X_pca" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_scanorama" +fi +if [ -z ${VIASH_PAR_KNN+x} ]; then + VIASH_PAR_KNN="20" +fi +if [ -z ${VIASH_PAR_BATCH_SIZE+x} ]; then + VIASH_PAR_BATCH_SIZE="5000" +fi +if [ -z ${VIASH_PAR_SIGMA+x} ]; then + VIASH_PAR_SIGMA="15.0" +fi +if [ -z ${VIASH_PAR_APPROX+x} ]; then + VIASH_PAR_APPROX="true" +fi +if [ -z ${VIASH_PAR_ALPHA+x} ]; then + VIASH_PAR_ALPHA="0.1" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_KNN" ]]; then + if ! [[ "$VIASH_PAR_KNN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--knn' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BATCH_SIZE" ]]; then + if ! [[ "$VIASH_PAR_BATCH_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--batch_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SIGMA" ]]; then + if ! [[ "$VIASH_PAR_SIGMA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--sigma' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_APPROX" ]]; then + if ! [[ "$VIASH_PAR_APPROX" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--approx' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALPHA" ]]; then + if ! [[ "$VIASH_PAR_ALPHA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alpha' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scanorama-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from scanpy.external.pp import scanorama_integrate +from mudata import read_h5ad + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'knn': $( if [ ! -z ${VIASH_PAR_KNN+x} ]; then echo "int(r'${VIASH_PAR_KNN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sigma': $( if [ ! -z ${VIASH_PAR_SIGMA+x} ]; then echo "float(r'${VIASH_PAR_SIGMA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'approx': $( if [ ! -z ${VIASH_PAR_APPROX+x} ]; then echo "r'${VIASH_PAR_APPROX//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input %s", par["modality"], par["input"]) +mod = read_h5ad(par["input"], mod=par["modality"]) + +arguments = { + "key": par["obs_batch"], + "basis": par["obsm_input"], + "adjusted_basis": par["obsm_output"], + "knn": par["knn"], + "alpha": par["alpha"], + "sigma": par["sigma"], + "approx": par["approx"], + "batch_size": par["batch_size"], +} + +logger.info( + "Running scanorama with parameters: \\n", + "\\n\\t".join( + [ + f"{argument_name}: {argument_value}" + for argument_name, argument_value in arguments.items() + ] + ), +) +# Integration. +scanorama_integrate(mod, **arguments) +logger.info("Writing output to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] +) + +logger.info("Finished!") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/integrate/scanorama/setup_logger.py b/target/executable/integrate/scanorama/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/integrate/scanorama/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/integrate/scarches/.config.vsh.yaml b/target/executable/integrate/scarches/.config.vsh.yaml new file mode 100644 index 00000000..313756d9 --- /dev/null +++ b/target/executable/integrate/scarches/.config.vsh.yaml @@ -0,0 +1,482 @@ +name: "scarches" +namespace: "integrate" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input (query) dataset" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file to use as a query" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer to be used for scArches, if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "Name of the .obs column with batch information." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "Name of the .obs column with celltype information." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "Name of the .var column with gene names, if the var .index is not\ + \ to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + arguments: + - type: "file" + name: "--reference" + alternatives: + - "-r" + description: "Path to the directory with reference model or a web link." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_class" + description: "For legacy models; the type of model (where the type of model was\ + \ not saved with it; e.g. when they were generated with scvi-tools versions\ + \ < 0.15)." + info: null + example: + - "SCANVI" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_output" + description: "Output directory for model" + info: null + default: + - "model" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_integrated_scanvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_predictions" + description: "In which .obs slot to store the resulting label predictions. Only\ + \ relevant if a scANVI model was provided." + info: null + default: + - "scanvi_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_probabilities" + description: "In which .obs slot to store the probabilities of the label predictions.\ + \ Only relevant if a scANVI model was provided." + info: null + default: + - "scanvi_proba" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Early stopping arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs reference mapping with scArches" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scanvi_model" +- type: "file" + path: "scvi_model" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "HLCA_reference_model.zip" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "jax[cuda]" + - "scvi-tools~=1.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scarches/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integrate/scarches" + executable: "target/executable/integrate/scarches/scarches" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/integrate/scarches/compress_h5mu.py b/target/executable/integrate/scarches/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/integrate/scarches/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/integrate/scarches/nextflow_labels.config b/target/executable/integrate/scarches/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/integrate/scarches/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/integrate/scarches/scarches b/target/executable/integrate/scarches/scarches new file mode 100755 index 00000000..116674d0 --- /dev/null +++ b/target/executable/integrate/scarches/scarches @@ -0,0 +1,2018 @@ +#!/usr/bin/env bash + +# scarches main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) +# * Dorien Roosen (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scarches" +VIASH_META_FUNCTIONALITY_NAME="scarches" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:25.05-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "jax[cuda]" "scvi-tools~=1.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Vladimir Shitov, Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component integrate scarches" +LABEL org.opencontainers.image.created="2025-08-22T07:23:10Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scarches main" + echo "" + echo "Performs reference mapping with scArches" + echo "" + echo "Inputs:" + echo " Arguments related to the input (query) dataset" + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file to use as a query" + echo "" + echo " --layer" + echo " type: string" + echo " Layer to be used for scArches, if .X is not to be used." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_obs_batch" + echo " type: string" + echo " Name of the .obs column with batch information." + echo "" + echo " --input_obs_label" + echo " type: string" + echo " Name of the .obs column with celltype information." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " Name of the .var column with gene names, if the var .index is not to be" + echo " used." + echo "" + echo " --input_obs_size_factor" + echo " type: string" + echo " Key in adata.obs for size factor information. Instead of using library" + echo " size as a size factor," + echo " the provided size factor column will be used as offset in the mean of" + echo " the likelihood." + echo " Assumed to be on linear scale." + echo "" + echo "Reference:" + echo " -r, --reference" + echo " type: file, required parameter, file must exist" + echo " Path to the directory with reference model or a web link." + echo "" + echo " --reference_class" + echo " type: string" + echo " example: SCANVI" + echo " For legacy models; the type of model (where the type of model was not" + echo " saved with it; e.g. when they were generated with scvi-tools versions <" + echo " 0.15)." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --model_output" + echo " type: file, output, file must exist" + echo " default: model" + echo " Output directory for model" + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_integrated_scanvi" + echo " In which .obsm slot to store the resulting integrated embedding." + echo "" + echo " --obs_output_predictions" + echo " type: string" + echo " default: scanvi_pred" + echo " In which .obs slot to store the resulting label predictions. Only" + echo " relevant if a scANVI model was provided." + echo "" + echo " --obs_output_probabilities" + echo " type: string" + echo " default: scanvi_proba" + echo " In which .obs slot to store the probabilities of the label predictions." + echo " Only relevant if a scANVI model was provided." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Early stopping arguments:" + echo " --early_stopping" + echo " type: boolean" + echo " Whether to perform early stopping with respect to the validation set." + echo "" + echo " --early_stopping_monitor" + echo " type: string" + echo " default: elbo_validation" + echo " choices: [ elbo_validation, reconstruction_loss_validation," + echo "kl_local_validation ]" + echo " Metric logged during validation set epoch." + echo "" + echo " --early_stopping_patience" + echo " type: integer" + echo " default: 45" + echo " min: 1" + echo " Number of validation epochs with no improvement after which training" + echo " will be stopped." + echo "" + echo " --early_stopping_min_delta" + echo " type: double" + echo " default: 0.0" + echo " min: 0.0" + echo " Minimum change in the monitored quantity to qualify as an improvement," + echo " i.e. an absolute change of less than min_delta, will count as no" + echo " improvement." + echo "" + echo "Learning parameters:" + echo " --max_epochs" + echo " type: integer" + echo " Number of passes through the dataset, defaults to (20000 / number of" + echo " cells) * 400 or 400; whichever is smallest." + echo "" + echo " --reduce_lr_on_plateau" + echo " type: boolean" + echo " default: true" + echo " Whether to monitor validation loss and reduce learning rate when" + echo " validation set \`lr_scheduler_metric\` plateaus." + echo "" + echo " --lr_factor" + echo " type: double" + echo " default: 0.6" + echo " min: 0.0" + echo " Factor to reduce learning rate." + echo "" + echo " --lr_patience" + echo " type: double" + echo " default: 30.0" + echo " min: 0.0" + echo " Number of epochs with no improvement after which learning rate will be" + echo " reduced." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scarches main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_batch) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_batch=*) + [ -n "$VIASH_PAR_INPUT_OBS_BATCH" ] && ViashError Bad arguments for option \'--input_obs_batch=*\': \'$VIASH_PAR_INPUT_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_label) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_label=*) + [ -n "$VIASH_PAR_INPUT_OBS_LABEL" ] && ViashError Bad arguments for option \'--input_obs_label=*\': \'$VIASH_PAR_INPUT_OBS_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obs_size_factor) + [ -n "$VIASH_PAR_INPUT_OBS_SIZE_FACTOR" ] && ViashError Bad arguments for option \'--input_obs_size_factor\': \'$VIASH_PAR_INPUT_OBS_SIZE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_SIZE_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obs_size_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obs_size_factor=*) + [ -n "$VIASH_PAR_INPUT_OBS_SIZE_FACTOR" ] && ViashError Bad arguments for option \'--input_obs_size_factor=*\': \'$VIASH_PAR_INPUT_OBS_SIZE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBS_SIZE_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -r) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'-r\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -r. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_class) + [ -n "$VIASH_PAR_REFERENCE_CLASS" ] && ViashError Bad arguments for option \'--reference_class\': \'$VIASH_PAR_REFERENCE_CLASS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_CLASS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_class. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_class=*) + [ -n "$VIASH_PAR_REFERENCE_CLASS" ] && ViashError Bad arguments for option \'--reference_class=*\': \'$VIASH_PAR_REFERENCE_CLASS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_CLASS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_output) + [ -n "$VIASH_PAR_MODEL_OUTPUT" ] && ViashError Bad arguments for option \'--model_output\': \'$VIASH_PAR_MODEL_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_output=*) + [ -n "$VIASH_PAR_MODEL_OUTPUT" ] && ViashError Bad arguments for option \'--model_output=*\': \'$VIASH_PAR_MODEL_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_output_predictions) + [ -n "$VIASH_PAR_OBS_OUTPUT_PREDICTIONS" ] && ViashError Bad arguments for option \'--obs_output_predictions\': \'$VIASH_PAR_OBS_OUTPUT_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_output_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_output_predictions=*) + [ -n "$VIASH_PAR_OBS_OUTPUT_PREDICTIONS" ] && ViashError Bad arguments for option \'--obs_output_predictions=*\': \'$VIASH_PAR_OBS_OUTPUT_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_output_probabilities) + [ -n "$VIASH_PAR_OBS_OUTPUT_PROBABILITIES" ] && ViashError Bad arguments for option \'--obs_output_probabilities\': \'$VIASH_PAR_OBS_OUTPUT_PROBABILITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PROBABILITIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_output_probabilities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_output_probabilities=*) + [ -n "$VIASH_PAR_OBS_OUTPUT_PROBABILITIES" ] && ViashError Bad arguments for option \'--obs_output_probabilities=*\': \'$VIASH_PAR_OBS_OUTPUT_PROBABILITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT_PROBABILITIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping=*) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping=*\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_monitor) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_monitor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_monitor=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor=*\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_patience) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_patience=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience=*\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_min_delta) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_min_delta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_min_delta=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta=*\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_epochs) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_epochs=*) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs=*\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reduce_lr_on_plateau) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reduce_lr_on_plateau. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reduce_lr_on_plateau=*) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau=*\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_factor) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_factor=*) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor=*\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_patience) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_patience=*) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience=*\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/integrate/scarches:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then + VIASH_PAR_MODEL_OUTPUT="model" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_integrated_scanvi" +fi +if [ -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then + VIASH_PAR_OBS_OUTPUT_PREDICTIONS="scanvi_pred" +fi +if [ -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then + VIASH_PAR_OBS_OUTPUT_PROBABILITIES="scanvi_proba" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR="elbo_validation" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then + VIASH_PAR_EARLY_STOPPING_PATIENCE="45" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="0.0" +fi +if [ -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then + VIASH_PAR_REDUCE_LR_ON_PLATEAU="true" +fi +if [ -z ${VIASH_PAR_LR_FACTOR+x} ]; then + VIASH_PAR_LR_FACTOR="0.6" +fi +if [ -z ${VIASH_PAR_LR_PATIENCE+x} ]; then + VIASH_PAR_LR_PATIENCE="30.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EARLY_STOPPING" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--early_stopping' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_PATIENCE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--early_stopping_patience' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EARLY_STOPPING_PATIENCE -lt 1 ]]; then + ViashError '--early_stopping_patience' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--early_stopping_min_delta' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_EARLY_STOPPING_MIN_DELTA '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_EARLY_STOPPING_MIN_DELTA -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--early_stopping_min_delta' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_MAX_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_MAX_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ]]; then + if ! [[ "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--reduce_lr_on_plateau' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LR_FACTOR" ]]; then + if ! [[ "$VIASH_PAR_LR_FACTOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_factor' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_FACTOR '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_FACTOR -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_factor' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_LR_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_LR_PATIENCE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_patience' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_PATIENCE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_PATIENCE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_patience' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_EARLY_STOPPING_MONITOR" ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES=("elbo_validation;reconstruction_loss_validation;kl_local_validation") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES[*]};" =~ ";$VIASH_PAR_EARLY_STOPPING_MONITOR;" ]]; then + ViashError '--early_stopping_monitor' specified value of \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_MODEL_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_MODEL_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_OUTPUT")" ) + VIASH_PAR_MODEL_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_MODEL_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scarches-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata +from anndata import AnnData +import scvi +import pandas as pd +import numpy as np +import torch +from tempfile import TemporaryDirectory +from pathlib import Path +import pickle + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obs_size_factor': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_SIZE_FACTOR+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_SIZE_FACTOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_class': $( if [ ! -z ${VIASH_PAR_REFERENCE_CLASS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_CLASS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_output': $( if [ ! -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then echo "r'${VIASH_PAR_MODEL_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_output_predictions': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_output_probabilities': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PROBABILITIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def _read_model_name_from_registry(model_path) -> str: + """Read registry with information about the model, return the model name""" + registry = scvi.model.base.BaseModelClass.load_registry(model_path) + return registry["model_name"] + + +def _detect_base_model(model_path): + """Read from the model's file which scvi_tools model it contains""" + return getattr(scvi.model, _read_model_name_from_registry(model_path)) + + +def _validate_obs_metadata_params(model_registry, model_name): + """ + Validates .obs metadata par variables that are required by scvi-tools models. + + This function checks that necessary .obs metadata field names (batch, labels, size factors) + specified in the model registry have the required corresponding parameters provided in the input. + + Parameters + ---------- + model_registry : dict + Dictionary containing model configuration and requirements. + model_name : str + Name of the model being validated. + """ + + # Maps the keys of the model registry to their corresponding par keys containing the .obs metadata + registry_par_mapper = { + "batch_key": "input_obs_batch", + "labels_key": "input_obs_label", + "size_factor_key": "input_obs_size_factor", + } + + for registry_key, key in registry_par_mapper.items(): + model_registry_key = model_registry.get(registry_key) + par_key = par[key] + + if model_registry_key and not par_key: + if key != "input_obs_label" or not model_registry.get("unlabeled_category"): + raise ValueError( + f"The provided {model_name} model requires \`--{key}\` to be provided." + ) + + elif par_key and not model_registry_key: + logger.warning( + f"\`--{key}\` was provided but is not used in the provided {model_name} model." + ) + + +def _align_query_with_registry(adata_query, model_path): + """ + Creates a qeury AnnData object with the expected structure and metadata fields that are aligned with the pre-trained reference model. + + Parameters + ---------- + adata_query : AnnData + The query AnnData object to be aligned with the model structure. + model_path : str + Path to the directory containing the pre-trained model. + + Returns + ------- + AnnData + A new AnnData object with structure and metadata aligned to match the + requirements of the pre-trained model. + """ + + model = _detect_base_model(model_path) + model_name = _read_model_name_from_registry(model_path) + model_registry = model.load_registry(model_path)["setup_args"] + _validate_obs_metadata_params(model_registry, model_name) + + # Sanitize gene names and set as index of the AnnData object + # all scArches VAE models expect gene names to be in the .var index + adata_query = set_var_index(adata_query, par["input_var_gene_names"]) + + # align layer + query_layer = ( + adata_query.X if not par["layer"] else adata_query.layers[par["layer"]] + ) + var_index = adata_query.var_names.tolist() + obs_index = adata_query.obs_names.tolist() + + # align observations + query_obs = {} + + ## batch_key, size_factor_key + simple_mappings = { + "batch_key": "input_obs_batch", # relevant for AUTOZI, LinearSCVI, PEAKVI, SCANVI, SCVI, TOTALVI, MULTIVI, JaxSCVI + "size_factor_key": "input_obs_size_factor", # relevant for SCANVI, SCVI, TOTALVI, MULTIVI + } + + for registry_key, par_key in simple_mappings.items(): + if model_registry.get(registry_key): + query_obs[model_registry[registry_key]] = adata_query.obs[ + par[par_key] + ].tolist() + + ## labels-key, relevant for AUTOZI, CondSCVI, LinearSCVI, PEAKVI, SCANVI, SCVI + if model_registry.get("labels_key"): + if par["input_obs_label"]: + query_obs[model_registry["labels_key"]] = adata_query.obs[ + par["input_obs_label"] + ].tolist() + else: + adata_query.obs[model_registry["labels_key"]] = model_registry[ + "unlabeled_category" + ] + query_obs[model_registry["labels_key"]] = adata_query.obs[ + model_registry["labels_key"] + ].tolist() + + obs = pd.DataFrame(query_obs, index=obs_index) + var = pd.DataFrame(index=var_index) + + aligned_query_anndata = AnnData(X=query_layer, obs=obs, var=var) + if model_registry["layer"]: + aligned_query_anndata.layers[model_registry["layer"]] = query_layer + + return aligned_query_anndata + + +def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1): + """ + A function to map the query data to the reference atlas. + + Input: + * adata_query: An AnnData object with the query + * model_path: The reference model directory + + Output: + * vae_query: the trained scvi_tools model + * adata_query: The AnnData object with the query preprocessed for the mapping to the reference + """ + model = _detect_base_model(model_path) + + # Keys of the AnnData query object need to match the exact keys in the reference model registry + + aligned_adata_query = _align_query_with_registry(adata_query, model_path) + + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + logger.warning( + "ValueError thrown when preparing adata for mapping. Clearing .varm field to prevent it" + ) + aligned_adata_query.varm.clear() + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + raise ValueError( + f"Could not perform model.prepare_model_anndata, likely because the model was trained with different var names then were found in the index. \\n\\nmodel var_names: {model.prepare_query_anndata(aligned_adata_query, model_path, return_reference_var_names=True).tolist()} \\n\\nquery data var_names: {aligned_adata_query.var_names.tolist()} " + ) + + try: + # Load query data into the model + vae_query = model.load_query_data( + aligned_adata_query, model_path, freeze_dropout=True + ) + except KeyError: + # Older models do not have the 'setup_method_name' key saved with the model. + # Assume these were generated from an anndata object. + model_torch = torch.load( + f"{model_path}/model.pt", weights_only=False, map_location="cpu" + ) + model_torch["attr_dict"]["registry_"]["setup_method_name"] = "setup_anndata" + + with TemporaryDirectory(dir=meta["temp_dir"]) as tempdir: + temp_file_name = Path(tempdir) / "model.pt" + torch.save(model_torch, temp_file_name) + del model_torch + vae_query = model.load_query_data( + aligned_adata_query, tempdir, freeze_dropout=True + ) + + # Train scArches model for query mapping + vae_query.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + check_val_every_n_epoch=check_val_every_n_epoch, + accelerator="auto", + ) + + return vae_query + + +def _convert_object_dtypes_to_strings(adata): + """Convert object dtypes in .var and .obs to string to prevent error when saving file""" + + def convert_cols(df): + object_cols = df.columns[df.dtypes == "object"] + for col in object_cols: + df[col] = df[col].astype(str) + return df + + adata.var = convert_cols(adata.var) + adata.obs = convert_cols(adata.obs) + + return adata + + +def _get_model_path(model_path: str): + """Obtain path to the directory with reference model. If the proposed \`model_path\` is a .zip archive, unzip it. If nesessary, convert model to the new format + + Parameters + ---------- + model_path : str + Path to a directory, where to search for the model or to a zip file containing the model + + Returns + ------- + Path to a directory with reference model in format of scvi-tools>=0.15 + """ + import os + import zipfile + import tempfile + from pathlib import Path + + if os.path.isdir(model_path) and "model.pt" in os.listdir(model_path): + # Probably, the \`model_path\` already contains model in the output format of scvi-tools>=0.15 + return model_path + + # The model either has old format or is a zip file downloaded from Zenodo + new_directory = Path(tempfile.TemporaryDirectory().name) + + if zipfile.is_zipfile(model_path): + with zipfile.ZipFile(model_path) as archive: + archive.extractall(new_directory) + model_dir = next(new_directory.glob("**/*.pt")).parent + + else: + model_dir = next(Path(model_path).glob("**/*.pt")).parent + + if "model_params.pt" in os.listdir(model_dir): + try: + # The current approach is to store the class with the model in a registry + model_class = _detect_base_model(model_dir) + except ValueError: + # Failed to load the registry, try + with (model_dir / "attr.pkl").open("rb") as open_file: + attrs = pickle.load(open_file) + try: + model_name = attrs["registry_"]["model_name"] + except KeyError: + # With older scvi-tools models, the model name was not stored with the model... + model_name = par["reference_class"] + if not model_name: + raise ValueError( + "Could not load reference model. This might be because it is a legacy model for which the model type was not saved with the model. Please provide it manually using the 'reference_class' argument." + ) + if not hasattr(scvi.model, model_name): + raise ValueError( + f"Requested to load legacy model using type {model_name}, but such a class does not exist in \`scvi.models\`." + ) + model_class = getattr(scvi.model, model_name) + + # The model is in the \`directory\`, but it was generated with scvi-tools<0.15 + # TODO: for new references (that could not be SCANVI based), we need to check the base class somehow. Reading registry does not work with models generated by scvi-tools<0.15 + # Here I assume that the reference model is for HLCA and thus is SCANVI based + converted_model_path = os.path.join(model_dir, "converted") + model_class.convert_legacy_save(model_dir, converted_model_path) + return converted_model_path + + elif "model.pt" in os.listdir(model_dir): + # Archive contained model in the new format, so just return the directory + return model_dir + + else: + raise ValueError( + "Cannot find model in the provided reference path. Please, provide a path or a link to the directory with reference model. For HLCA use https://zenodo.org/record/6337966/files/HLCA_reference_model.zip" + ) + + +def main(): + logger.info("Reading %s, modality %s", par["input"], par["modality"]) + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_query = adata.copy() + + model_path = _get_model_path(par["reference"]) + vae_query = map_to_existing_reference(adata_query, model_path=model_path) + model_name = _read_model_name_from_registry(model_path) + + # Save info about the used model + adata.uns["integration_method"] = model_name + + logger.info("Trying to write latent representation") + output_key = par["obsm_output"].format(model_name=model_name) + adata.obsm[output_key] = vae_query.get_latent_representation() + + # In addition to integrating the data, scANVI also performs label annotations + if model_name == "SCANVI": + adata.obs[par["obs_output_predictions"]] = vae_query.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_query.predict(soft=True), axis=1 + ) + + logger.info("Converting dtypes") + adata = _convert_object_dtypes_to_strings(adata) + + logger.info( + "Saving h5mu file to %s with compression %s", + par["output"], + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + logger.info("Saving model") + vae_query.save(par["model_output"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ]; then + VIASH_PAR_MODEL_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ] && [ ! -e "$VIASH_PAR_MODEL_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_MODEL_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/integrate/scarches/set_var_index.py b/target/executable/integrate/scarches/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/integrate/scarches/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/integrate/scarches/setup_logger.py b/target/executable/integrate/scarches/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/integrate/scarches/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/integrate/scvi/.config.vsh.yaml b/target/executable/integrate/scvi/.config.vsh.yaml new file mode 100644 index 00000000..a3acb06b --- /dev/null +++ b/target/executable/integrate/scvi/.config.vsh.yaml @@ -0,0 +1,638 @@ +name: "scvi" +namespace: "integrate" +version: "main" +authors: +- name: "Malte D. Luecken" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "malte.luecken@helmholtz-muenchen.de" + github: "LuckyMD" + orcid: "0000-0001-7464-7921" + linkedin: "malte-l%C3%BCcken-b8b21049" + twitter: "MDLuecken" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Group Leader" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Matthias Beyens" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_labels" + description: "Key in adata.obs for label information. Categories will automatically\ + \ be \nconverted into integer categories and saved to adata.obs['_scvi_labels'].\n\ + If None, assigns the same label to all the data.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_categorical_covariate" + description: "Keys in adata.obs that correspond to categorical data. These covariates\ + \ can be added in\naddition to the batch covariate and are also treated as nuisance\ + \ factors\n(i.e., the model tries to minimize their effects on the latent space).\n\ + Thus, these should not be used for biologically-relevant factors that you do\ + \ _not_ want to correct for.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--obs_continuous_covariate" + description: "Keys in adata.obs that correspond to continuous data. These covariates\ + \ can be added in\naddition to the batch covariate and are also treated as nuisance\ + \ factors\n(i.e., the model tries to minimize their effects on the latent space).\ + \ Thus, these should not be\nused for biologically-relevant factors that you\ + \ do _not_ want to correct for.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Folder where the state of the trained model will be saved to." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scvi_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "SCVI options" + arguments: + - type: "integer" + name: "--n_hidden_nodes" + description: "Number of nodes per hidden layer." + info: null + default: + - 128 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_dimensions_latent_space" + description: "Dimensionality of the latent space." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_hidden_layers" + description: "Number of hidden layers used for encoder and decoder neural-networks." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--dropout_rate" + description: "Dropout rate for the neural networks." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--dispersion" + description: "Set the behavior for the dispersion for negative binomial distributions:\n\ + - gene: dispersion parameter of negative binomial is constant per gene across\ + \ cells\n- gene-batch: dispersion can differ between different batches\n- gene-label:\ + \ dispersion can differ between different labels\n- gene-cell: dispersion can\ + \ differ for every gene in every cell\n" + info: null + default: + - "gene" + required: false + choices: + - "gene" + - "gene-batch" + - "gene-label" + - "gene-cell" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_likelihood" + description: "Model used to generate the expression data from a count-based likelihood\ + \ distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated\ + \ negative binomial distribution\n- poisson: Poisson distribution\n" + info: null + default: + - "nb" + required: false + choices: + - "nb" + - "zinb" + - "poisson" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variational auto-encoder model options" + arguments: + - type: "string" + name: "--use_layer_normalization" + description: "Neural networks for which to enable layer normalization. \n" + info: null + default: + - "both" + required: false + choices: + - "encoder" + - "decoder" + - "none" + - "both" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--use_batch_normalization" + description: "Neural networks for which to enable batch normalization. \n" + info: null + default: + - "none" + required: false + choices: + - "encoder" + - "decoder" + - "none" + - "both" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--encode_covariates" + description: "Whether to concatenate covariates to expression in encoder" + info: null + direction: "input" + - type: "boolean_true" + name: "--deeply_inject_covariates" + description: "Whether to concatenate covariates into output of hidden layers in\ + \ encoder/decoder. \nThis option only applies when n_layers > 1. The covariates\ + \ are concatenated to\nthe input of subsequent hidden layers.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--use_observed_lib_size" + description: "Use observed library size for RNA as scaling factor in mean of conditional\ + \ distribution.\n" + info: null + direction: "input" +- name: "Early stopping arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Data validition" + arguments: + - type: "integer" + name: "--n_obs_min_count" + description: "Minimum number of cells threshold ensuring that every obs_batch\ + \ category has sufficient observations (cells) for model training." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_var_min_count" + description: "Minimum number of genes threshold ensuring that every var_input\ + \ filter has sufficient observations (genes) for model training." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "jax[cuda]" + - "scvi-tools~=1.3.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scvi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integrate/scvi" + executable: "target/executable/integrate/scvi/scvi" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/integrate/scvi/compress_h5mu.py b/target/executable/integrate/scvi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/integrate/scvi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/integrate/scvi/nextflow_labels.config b/target/executable/integrate/scvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/integrate/scvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/integrate/scvi/scvi b/target/executable/integrate/scvi/scvi new file mode 100755 index 00000000..837b75ca --- /dev/null +++ b/target/executable/integrate/scvi/scvi @@ -0,0 +1,2142 @@ +#!/usr/bin/env bash + +# scvi main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Malte D. Luecken (author) +# * Dries Schaumont (maintainer) +# * Matthias Beyens (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scvi" +VIASH_META_FUNCTIONALITY_NAME="scvi" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:25.05-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "jax[cuda]" "scvi-tools~=1.3.1" + +LABEL org.opencontainers.image.authors="Malte D. Luecken, Dries Schaumont, Matthias Beyens" +LABEL org.opencontainers.image.description="Companion container for running component integrate scvi" +LABEL org.opencontainers.image.created="2025-08-22T07:23:11Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scvi main" + echo "" + echo "Performs scvi integration as done in the human lung cell atlas" + echo "https://github.com/LungCellAtlas/HLCA" + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. If None, X is used" + echo "" + echo " --obs_batch" + echo " type: string" + echo " default: sample_id" + echo " Column name discriminating between your batches." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " .var column containing gene names. By default, use the index." + echo "" + echo " --var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo " --obs_labels" + echo " type: string" + echo " Key in adata.obs for label information. Categories will automatically be" + echo " converted into integer categories and saved to" + echo " adata.obs['_scvi_labels']." + echo " If None, assigns the same label to all the data." + echo "" + echo " --obs_size_factor" + echo " type: string" + echo " Key in adata.obs for size factor information. Instead of using library" + echo " size as a size factor," + echo " the provided size factor column will be used as offset in the mean of" + echo " the likelihood." + echo " Assumed to be on linear scale." + echo "" + echo " --obs_categorical_covariate" + echo " type: string, multiple values allowed" + echo " Keys in adata.obs that correspond to categorical data. These covariates" + echo " can be added in" + echo " addition to the batch covariate and are also treated as nuisance factors" + echo " (i.e., the model tries to minimize their effects on the latent space)." + echo " Thus, these should not be used for biologically-relevant factors that" + echo " you do _not_ want to correct for." + echo "" + echo " --obs_continuous_covariate" + echo " type: string, multiple values allowed" + echo " Keys in adata.obs that correspond to continuous data. These covariates" + echo " can be added in" + echo " addition to the batch covariate and are also treated as nuisance factors" + echo " (i.e., the model tries to minimize their effects on the latent space)." + echo " Thus, these should not be" + echo " used for biologically-relevant factors that you do _not_ want to correct" + echo " for." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --output_model" + echo " type: file, output, file must exist" + echo " Folder where the state of the trained model will be saved to." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_scvi_integrated" + echo " In which .obsm slot to store the resulting integrated embedding." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "SCVI options:" + echo " --n_hidden_nodes" + echo " type: integer" + echo " default: 128" + echo " Number of nodes per hidden layer." + echo "" + echo " --n_dimensions_latent_space" + echo " type: integer" + echo " default: 30" + echo " Dimensionality of the latent space." + echo "" + echo " --n_hidden_layers" + echo " type: integer" + echo " default: 2" + echo " Number of hidden layers used for encoder and decoder neural-networks." + echo "" + echo " --dropout_rate" + echo " type: double" + echo " default: 0.1" + echo " Dropout rate for the neural networks." + echo "" + echo " --dispersion" + echo " type: string" + echo " default: gene" + echo " choices: [ gene, gene-batch, gene-label, gene-cell ]" + echo " Set the behavior for the dispersion for negative binomial distributions:" + echo " - gene: dispersion parameter of negative binomial is constant per gene" + echo " across cells" + echo " - gene-batch: dispersion can differ between different batches" + echo " - gene-label: dispersion can differ between different labels" + echo " - gene-cell: dispersion can differ for every gene in every cell" + echo "" + echo " --gene_likelihood" + echo " type: string" + echo " default: nb" + echo " choices: [ nb, zinb, poisson ]" + echo " Model used to generate the expression data from a count-based likelihood" + echo " distribution." + echo " - nb: Negative binomial distribution" + echo " - zinb: Zero-inflated negative binomial distribution" + echo " - poisson: Poisson distribution" + echo "" + echo "Variational auto-encoder model options:" + echo " --use_layer_normalization" + echo " type: string" + echo " default: both" + echo " choices: [ encoder, decoder, none, both ]" + echo " Neural networks for which to enable layer normalization." + echo "" + echo " --use_batch_normalization" + echo " type: string" + echo " default: none" + echo " choices: [ encoder, decoder, none, both ]" + echo " Neural networks for which to enable batch normalization." + echo "" + echo " --encode_covariates" + echo " type: boolean_false" + echo " Whether to concatenate covariates to expression in encoder" + echo "" + echo " --deeply_inject_covariates" + echo " type: boolean_true" + echo " Whether to concatenate covariates into output of hidden layers in" + echo " encoder/decoder." + echo " This option only applies when n_layers > 1. The covariates are" + echo " concatenated to" + echo " the input of subsequent hidden layers." + echo "" + echo " --use_observed_lib_size" + echo " type: boolean_true" + echo " Use observed library size for RNA as scaling factor in mean of" + echo " conditional distribution." + echo "" + echo "Early stopping arguments:" + echo " --early_stopping" + echo " type: boolean" + echo " Whether to perform early stopping with respect to the validation set." + echo "" + echo " --early_stopping_monitor" + echo " type: string" + echo " default: elbo_validation" + echo " choices: [ elbo_validation, reconstruction_loss_validation," + echo "kl_local_validation ]" + echo " Metric logged during validation set epoch." + echo "" + echo " --early_stopping_patience" + echo " type: integer" + echo " default: 45" + echo " min: 1" + echo " Number of validation epochs with no improvement after which training" + echo " will be stopped." + echo "" + echo " --early_stopping_min_delta" + echo " type: double" + echo " default: 0.0" + echo " min: 0.0" + echo " Minimum change in the monitored quantity to qualify as an improvement," + echo " i.e. an absolute change of less than min_delta, will count as no" + echo " improvement." + echo "" + echo "Learning parameters:" + echo " --max_epochs" + echo " type: integer" + echo " Number of passes through the dataset, defaults to (20000 / number of" + echo " cells) * 400 or 400; whichever is smallest." + echo "" + echo " --reduce_lr_on_plateau" + echo " type: boolean" + echo " default: true" + echo " Whether to monitor validation loss and reduce learning rate when" + echo " validation set \`lr_scheduler_metric\` plateaus." + echo "" + echo " --lr_factor" + echo " type: double" + echo " default: 0.6" + echo " min: 0.0" + echo " Factor to reduce learning rate." + echo "" + echo " --lr_patience" + echo " type: double" + echo " default: 30.0" + echo " min: 0.0" + echo " Number of epochs with no improvement after which learning rate will be" + echo " reduced." + echo "" + echo "Data validition:" + echo " --n_obs_min_count" + echo " type: integer" + echo " default: 0" + echo " Minimum number of cells threshold ensuring that every obs_batch category" + echo " has sufficient observations (cells) for model training." + echo "" + echo " --n_var_min_count" + echo " type: integer" + echo " default: 0" + echo " Minimum number of genes threshold ensuring that every var_input filter" + echo " has sufficient observations (genes) for model training." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scvi main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch=*) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch=*\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_labels) + [ -n "$VIASH_PAR_OBS_LABELS" ] && ViashError Bad arguments for option \'--obs_labels\': \'$VIASH_PAR_OBS_LABELS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABELS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_labels. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_labels=*) + [ -n "$VIASH_PAR_OBS_LABELS" ] && ViashError Bad arguments for option \'--obs_labels=*\': \'$VIASH_PAR_OBS_LABELS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_LABELS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_size_factor) + [ -n "$VIASH_PAR_OBS_SIZE_FACTOR" ] && ViashError Bad arguments for option \'--obs_size_factor\': \'$VIASH_PAR_OBS_SIZE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SIZE_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_size_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_size_factor=*) + [ -n "$VIASH_PAR_OBS_SIZE_FACTOR" ] && ViashError Bad arguments for option \'--obs_size_factor=*\': \'$VIASH_PAR_OBS_SIZE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_SIZE_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_categorical_covariate) + if [ -z "$VIASH_PAR_OBS_CATEGORICAL_COVARIATE" ]; then + VIASH_PAR_OBS_CATEGORICAL_COVARIATE="$2" + else + VIASH_PAR_OBS_CATEGORICAL_COVARIATE="$VIASH_PAR_OBS_CATEGORICAL_COVARIATE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_categorical_covariate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_categorical_covariate=*) + if [ -z "$VIASH_PAR_OBS_CATEGORICAL_COVARIATE" ]; then + VIASH_PAR_OBS_CATEGORICAL_COVARIATE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_CATEGORICAL_COVARIATE="$VIASH_PAR_OBS_CATEGORICAL_COVARIATE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --obs_continuous_covariate) + if [ -z "$VIASH_PAR_OBS_CONTINUOUS_COVARIATE" ]; then + VIASH_PAR_OBS_CONTINUOUS_COVARIATE="$2" + else + VIASH_PAR_OBS_CONTINUOUS_COVARIATE="$VIASH_PAR_OBS_CONTINUOUS_COVARIATE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_continuous_covariate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_continuous_covariate=*) + if [ -z "$VIASH_PAR_OBS_CONTINUOUS_COVARIATE" ]; then + VIASH_PAR_OBS_CONTINUOUS_COVARIATE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_CONTINUOUS_COVARIATE="$VIASH_PAR_OBS_CONTINUOUS_COVARIATE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_model) + [ -n "$VIASH_PAR_OUTPUT_MODEL" ] && ViashError Bad arguments for option \'--output_model\': \'$VIASH_PAR_OUTPUT_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_model=*) + [ -n "$VIASH_PAR_OUTPUT_MODEL" ] && ViashError Bad arguments for option \'--output_model=*\': \'$VIASH_PAR_OUTPUT_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_hidden_nodes) + [ -n "$VIASH_PAR_N_HIDDEN_NODES" ] && ViashError Bad arguments for option \'--n_hidden_nodes\': \'$VIASH_PAR_N_HIDDEN_NODES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HIDDEN_NODES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_hidden_nodes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_hidden_nodes=*) + [ -n "$VIASH_PAR_N_HIDDEN_NODES" ] && ViashError Bad arguments for option \'--n_hidden_nodes=*\': \'$VIASH_PAR_N_HIDDEN_NODES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HIDDEN_NODES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_dimensions_latent_space) + [ -n "$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE" ] && ViashError Bad arguments for option \'--n_dimensions_latent_space\': \'$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DIMENSIONS_LATENT_SPACE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_dimensions_latent_space. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_dimensions_latent_space=*) + [ -n "$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE" ] && ViashError Bad arguments for option \'--n_dimensions_latent_space=*\': \'$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_DIMENSIONS_LATENT_SPACE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_hidden_layers) + [ -n "$VIASH_PAR_N_HIDDEN_LAYERS" ] && ViashError Bad arguments for option \'--n_hidden_layers\': \'$VIASH_PAR_N_HIDDEN_LAYERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HIDDEN_LAYERS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_hidden_layers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_hidden_layers=*) + [ -n "$VIASH_PAR_N_HIDDEN_LAYERS" ] && ViashError Bad arguments for option \'--n_hidden_layers=*\': \'$VIASH_PAR_N_HIDDEN_LAYERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_HIDDEN_LAYERS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dropout_rate) + [ -n "$VIASH_PAR_DROPOUT_RATE" ] && ViashError Bad arguments for option \'--dropout_rate\': \'$VIASH_PAR_DROPOUT_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DROPOUT_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dropout_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dropout_rate=*) + [ -n "$VIASH_PAR_DROPOUT_RATE" ] && ViashError Bad arguments for option \'--dropout_rate=*\': \'$VIASH_PAR_DROPOUT_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DROPOUT_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dispersion) + [ -n "$VIASH_PAR_DISPERSION" ] && ViashError Bad arguments for option \'--dispersion\': \'$VIASH_PAR_DISPERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DISPERSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dispersion. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dispersion=*) + [ -n "$VIASH_PAR_DISPERSION" ] && ViashError Bad arguments for option \'--dispersion=*\': \'$VIASH_PAR_DISPERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DISPERSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_likelihood) + [ -n "$VIASH_PAR_GENE_LIKELIHOOD" ] && ViashError Bad arguments for option \'--gene_likelihood\': \'$VIASH_PAR_GENE_LIKELIHOOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_LIKELIHOOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_likelihood. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_likelihood=*) + [ -n "$VIASH_PAR_GENE_LIKELIHOOD" ] && ViashError Bad arguments for option \'--gene_likelihood=*\': \'$VIASH_PAR_GENE_LIKELIHOOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_LIKELIHOOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --use_layer_normalization) + [ -n "$VIASH_PAR_USE_LAYER_NORMALIZATION" ] && ViashError Bad arguments for option \'--use_layer_normalization\': \'$VIASH_PAR_USE_LAYER_NORMALIZATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_LAYER_NORMALIZATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_layer_normalization. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_layer_normalization=*) + [ -n "$VIASH_PAR_USE_LAYER_NORMALIZATION" ] && ViashError Bad arguments for option \'--use_layer_normalization=*\': \'$VIASH_PAR_USE_LAYER_NORMALIZATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_LAYER_NORMALIZATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --use_batch_normalization) + [ -n "$VIASH_PAR_USE_BATCH_NORMALIZATION" ] && ViashError Bad arguments for option \'--use_batch_normalization\': \'$VIASH_PAR_USE_BATCH_NORMALIZATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_BATCH_NORMALIZATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_batch_normalization. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_batch_normalization=*) + [ -n "$VIASH_PAR_USE_BATCH_NORMALIZATION" ] && ViashError Bad arguments for option \'--use_batch_normalization=*\': \'$VIASH_PAR_USE_BATCH_NORMALIZATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_BATCH_NORMALIZATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --encode_covariates) + [ -n "$VIASH_PAR_ENCODE_COVARIATES" ] && ViashError Bad arguments for option \'--encode_covariates\': \'$VIASH_PAR_ENCODE_COVARIATES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ENCODE_COVARIATES=false + shift 1 + ;; + --deeply_inject_covariates) + [ -n "$VIASH_PAR_DEEPLY_INJECT_COVARIATES" ] && ViashError Bad arguments for option \'--deeply_inject_covariates\': \'$VIASH_PAR_DEEPLY_INJECT_COVARIATES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DEEPLY_INJECT_COVARIATES=true + shift 1 + ;; + --use_observed_lib_size) + [ -n "$VIASH_PAR_USE_OBSERVED_LIB_SIZE" ] && ViashError Bad arguments for option \'--use_observed_lib_size\': \'$VIASH_PAR_USE_OBSERVED_LIB_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_OBSERVED_LIB_SIZE=true + shift 1 + ;; + --early_stopping) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping=*) + [ -n "$VIASH_PAR_EARLY_STOPPING" ] && ViashError Bad arguments for option \'--early_stopping=*\': \'$VIASH_PAR_EARLY_STOPPING\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_monitor) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_monitor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_monitor=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MONITOR" ] && ViashError Bad arguments for option \'--early_stopping_monitor=*\': \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MONITOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_patience) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_patience=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ] && ViashError Bad arguments for option \'--early_stopping_patience=*\': \'$VIASH_PAR_EARLY_STOPPING_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --early_stopping_min_delta) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --early_stopping_min_delta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --early_stopping_min_delta=*) + [ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ] && ViashError Bad arguments for option \'--early_stopping_min_delta=*\': \'$VIASH_PAR_EARLY_STOPPING_MIN_DELTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EARLY_STOPPING_MIN_DELTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_epochs) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_epochs=*) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs=*\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reduce_lr_on_plateau) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reduce_lr_on_plateau. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reduce_lr_on_plateau=*) + [ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ] && ViashError Bad arguments for option \'--reduce_lr_on_plateau=*\': \'$VIASH_PAR_REDUCE_LR_ON_PLATEAU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REDUCE_LR_ON_PLATEAU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_factor) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_factor=*) + [ -n "$VIASH_PAR_LR_FACTOR" ] && ViashError Bad arguments for option \'--lr_factor=*\': \'$VIASH_PAR_LR_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lr_patience) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lr_patience. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lr_patience=*) + [ -n "$VIASH_PAR_LR_PATIENCE" ] && ViashError Bad arguments for option \'--lr_patience=*\': \'$VIASH_PAR_LR_PATIENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LR_PATIENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_obs_min_count) + [ -n "$VIASH_PAR_N_OBS_MIN_COUNT" ] && ViashError Bad arguments for option \'--n_obs_min_count\': \'$VIASH_PAR_N_OBS_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_OBS_MIN_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_obs_min_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_obs_min_count=*) + [ -n "$VIASH_PAR_N_OBS_MIN_COUNT" ] && ViashError Bad arguments for option \'--n_obs_min_count=*\': \'$VIASH_PAR_N_OBS_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_OBS_MIN_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_var_min_count) + [ -n "$VIASH_PAR_N_VAR_MIN_COUNT" ] && ViashError Bad arguments for option \'--n_var_min_count\': \'$VIASH_PAR_N_VAR_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_VAR_MIN_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_var_min_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_var_min_count=*) + [ -n "$VIASH_PAR_N_VAR_MIN_COUNT" ] && ViashError Bad arguments for option \'--n_var_min_count=*\': \'$VIASH_PAR_N_VAR_MIN_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_VAR_MIN_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/integrate/scvi:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBS_BATCH+x} ]; then + VIASH_PAR_OBS_BATCH="sample_id" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_scvi_integrated" +fi +if [ -z ${VIASH_PAR_N_HIDDEN_NODES+x} ]; then + VIASH_PAR_N_HIDDEN_NODES="128" +fi +if [ -z ${VIASH_PAR_N_DIMENSIONS_LATENT_SPACE+x} ]; then + VIASH_PAR_N_DIMENSIONS_LATENT_SPACE="30" +fi +if [ -z ${VIASH_PAR_N_HIDDEN_LAYERS+x} ]; then + VIASH_PAR_N_HIDDEN_LAYERS="2" +fi +if [ -z ${VIASH_PAR_DROPOUT_RATE+x} ]; then + VIASH_PAR_DROPOUT_RATE="0.1" +fi +if [ -z ${VIASH_PAR_DISPERSION+x} ]; then + VIASH_PAR_DISPERSION="gene" +fi +if [ -z ${VIASH_PAR_GENE_LIKELIHOOD+x} ]; then + VIASH_PAR_GENE_LIKELIHOOD="nb" +fi +if [ -z ${VIASH_PAR_USE_LAYER_NORMALIZATION+x} ]; then + VIASH_PAR_USE_LAYER_NORMALIZATION="both" +fi +if [ -z ${VIASH_PAR_USE_BATCH_NORMALIZATION+x} ]; then + VIASH_PAR_USE_BATCH_NORMALIZATION="none" +fi +if [ -z ${VIASH_PAR_ENCODE_COVARIATES+x} ]; then + VIASH_PAR_ENCODE_COVARIATES="true" +fi +if [ -z ${VIASH_PAR_DEEPLY_INJECT_COVARIATES+x} ]; then + VIASH_PAR_DEEPLY_INJECT_COVARIATES="false" +fi +if [ -z ${VIASH_PAR_USE_OBSERVED_LIB_SIZE+x} ]; then + VIASH_PAR_USE_OBSERVED_LIB_SIZE="false" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR="elbo_validation" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then + VIASH_PAR_EARLY_STOPPING_PATIENCE="45" +fi +if [ -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then + VIASH_PAR_EARLY_STOPPING_MIN_DELTA="0.0" +fi +if [ -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then + VIASH_PAR_REDUCE_LR_ON_PLATEAU="true" +fi +if [ -z ${VIASH_PAR_LR_FACTOR+x} ]; then + VIASH_PAR_LR_FACTOR="0.6" +fi +if [ -z ${VIASH_PAR_LR_PATIENCE+x} ]; then + VIASH_PAR_LR_PATIENCE="30.0" +fi +if [ -z ${VIASH_PAR_N_OBS_MIN_COUNT+x} ]; then + VIASH_PAR_N_OBS_MIN_COUNT="0" +fi +if [ -z ${VIASH_PAR_N_VAR_MIN_COUNT+x} ]; then + VIASH_PAR_N_VAR_MIN_COUNT="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_HIDDEN_NODES" ]]; then + if ! [[ "$VIASH_PAR_N_HIDDEN_NODES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_hidden_nodes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE" ]]; then + if ! [[ "$VIASH_PAR_N_DIMENSIONS_LATENT_SPACE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_dimensions_latent_space' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_HIDDEN_LAYERS" ]]; then + if ! [[ "$VIASH_PAR_N_HIDDEN_LAYERS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_hidden_layers' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DROPOUT_RATE" ]]; then + if ! [[ "$VIASH_PAR_DROPOUT_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--dropout_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ENCODE_COVARIATES" ]]; then + if ! [[ "$VIASH_PAR_ENCODE_COVARIATES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--encode_covariates' has to be a boolean_false. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DEEPLY_INJECT_COVARIATES" ]]; then + if ! [[ "$VIASH_PAR_DEEPLY_INJECT_COVARIATES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--deeply_inject_covariates' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_OBSERVED_LIB_SIZE" ]]; then + if ! [[ "$VIASH_PAR_USE_OBSERVED_LIB_SIZE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_observed_lib_size' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--early_stopping' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_PATIENCE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--early_stopping_patience' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EARLY_STOPPING_PATIENCE -lt 1 ]]; then + ViashError '--early_stopping_patience' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" ]]; then + if ! [[ "$VIASH_PAR_EARLY_STOPPING_MIN_DELTA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--early_stopping_min_delta' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_EARLY_STOPPING_MIN_DELTA '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_EARLY_STOPPING_MIN_DELTA -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--early_stopping_min_delta' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--early_stopping_min_delta' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_MAX_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_MAX_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" ]]; then + if ! [[ "$VIASH_PAR_REDUCE_LR_ON_PLATEAU" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--reduce_lr_on_plateau' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LR_FACTOR" ]]; then + if ! [[ "$VIASH_PAR_LR_FACTOR" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_factor' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_FACTOR '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_FACTOR -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_factor' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_factor' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_LR_PATIENCE" ]]; then + if ! [[ "$VIASH_PAR_LR_PATIENCE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--lr_patience' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + if command -v bc &> /dev/null; then + if ! [[ `echo $VIASH_PAR_LR_PATIENCE '>=' 0.0 | bc` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + elif command -v awk &> /dev/null; then + if ! [[ `awk -v n1=$VIASH_PAR_LR_PATIENCE -v n2=0.0 'BEGIN { print (n1 >= n2) ? "1" : "0" }'` -eq 1 ]]; then + ViashError '--lr_patience' has be more than or equal to 0.0. Use "--help" to get more information on the parameters. + exit 1 + fi + else + ViashWarning '--lr_patience' specifies a minimum value but the value was not verified as neither \'bc\' or \`awk\` are present on the system. + fi +fi +if [[ -n "$VIASH_PAR_N_OBS_MIN_COUNT" ]]; then + if ! [[ "$VIASH_PAR_N_OBS_MIN_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_obs_min_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_VAR_MIN_COUNT" ]]; then + if ! [[ "$VIASH_PAR_N_VAR_MIN_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_var_min_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_DISPERSION" ]; then + VIASH_PAR_DISPERSION_CHOICES=("gene;gene-batch;gene-label;gene-cell") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_DISPERSION_CHOICES[*]};" =~ ";$VIASH_PAR_DISPERSION;" ]]; then + ViashError '--dispersion' specified value of \'$VIASH_PAR_DISPERSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_GENE_LIKELIHOOD" ]; then + VIASH_PAR_GENE_LIKELIHOOD_CHOICES=("nb;zinb;poisson") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_GENE_LIKELIHOOD_CHOICES[*]};" =~ ";$VIASH_PAR_GENE_LIKELIHOOD;" ]]; then + ViashError '--gene_likelihood' specified value of \'$VIASH_PAR_GENE_LIKELIHOOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_USE_LAYER_NORMALIZATION" ]; then + VIASH_PAR_USE_LAYER_NORMALIZATION_CHOICES=("encoder;decoder;none;both") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_USE_LAYER_NORMALIZATION_CHOICES[*]};" =~ ";$VIASH_PAR_USE_LAYER_NORMALIZATION;" ]]; then + ViashError '--use_layer_normalization' specified value of \'$VIASH_PAR_USE_LAYER_NORMALIZATION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_USE_BATCH_NORMALIZATION" ]; then + VIASH_PAR_USE_BATCH_NORMALIZATION_CHOICES=("encoder;decoder;none;both") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_USE_BATCH_NORMALIZATION_CHOICES[*]};" =~ ";$VIASH_PAR_USE_BATCH_NORMALIZATION;" ]]; then + ViashError '--use_batch_normalization' specified value of \'$VIASH_PAR_USE_BATCH_NORMALIZATION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_EARLY_STOPPING_MONITOR" ]; then + VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES=("elbo_validation;reconstruction_loss_validation;kl_local_validation") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_EARLY_STOPPING_MONITOR_CHOICES[*]};" =~ ";$VIASH_PAR_EARLY_STOPPING_MONITOR;" ]]; then + ViashError '--early_stopping_monitor' specified value of \'$VIASH_PAR_EARLY_STOPPING_MONITOR\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_MODEL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_MODEL")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_MODEL")" ) + VIASH_PAR_OUTPUT_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_MODEL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_MODEL" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scvi-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from scanpy._utils import check_nonnegative_integers +import mudata +import scvi + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_labels': $( if [ ! -z ${VIASH_PAR_OBS_LABELS+x} ]; then echo "r'${VIASH_PAR_OBS_LABELS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_size_factor': $( if [ ! -z ${VIASH_PAR_OBS_SIZE_FACTOR+x} ]; then echo "r'${VIASH_PAR_OBS_SIZE_FACTOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_categorical_covariate': $( if [ ! -z ${VIASH_PAR_OBS_CATEGORICAL_COVARIATE+x} ]; then echo "r'${VIASH_PAR_OBS_CATEGORICAL_COVARIATE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'obs_continuous_covariate': $( if [ ! -z ${VIASH_PAR_OBS_CONTINUOUS_COVARIATE+x} ]; then echo "r'${VIASH_PAR_OBS_CONTINUOUS_COVARIATE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_model': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_hidden_nodes': $( if [ ! -z ${VIASH_PAR_N_HIDDEN_NODES+x} ]; then echo "int(r'${VIASH_PAR_N_HIDDEN_NODES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_dimensions_latent_space': $( if [ ! -z ${VIASH_PAR_N_DIMENSIONS_LATENT_SPACE+x} ]; then echo "int(r'${VIASH_PAR_N_DIMENSIONS_LATENT_SPACE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_hidden_layers': $( if [ ! -z ${VIASH_PAR_N_HIDDEN_LAYERS+x} ]; then echo "int(r'${VIASH_PAR_N_HIDDEN_LAYERS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'dropout_rate': $( if [ ! -z ${VIASH_PAR_DROPOUT_RATE+x} ]; then echo "float(r'${VIASH_PAR_DROPOUT_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'dispersion': $( if [ ! -z ${VIASH_PAR_DISPERSION+x} ]; then echo "r'${VIASH_PAR_DISPERSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gene_likelihood': $( if [ ! -z ${VIASH_PAR_GENE_LIKELIHOOD+x} ]; then echo "r'${VIASH_PAR_GENE_LIKELIHOOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'use_layer_normalization': $( if [ ! -z ${VIASH_PAR_USE_LAYER_NORMALIZATION+x} ]; then echo "r'${VIASH_PAR_USE_LAYER_NORMALIZATION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'use_batch_normalization': $( if [ ! -z ${VIASH_PAR_USE_BATCH_NORMALIZATION+x} ]; then echo "r'${VIASH_PAR_USE_BATCH_NORMALIZATION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'encode_covariates': $( if [ ! -z ${VIASH_PAR_ENCODE_COVARIATES+x} ]; then echo "r'${VIASH_PAR_ENCODE_COVARIATES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'deeply_inject_covariates': $( if [ ! -z ${VIASH_PAR_DEEPLY_INJECT_COVARIATES+x} ]; then echo "r'${VIASH_PAR_DEEPLY_INJECT_COVARIATES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'use_observed_lib_size': $( if [ ! -z ${VIASH_PAR_USE_OBSERVED_LIB_SIZE+x} ]; then echo "r'${VIASH_PAR_USE_OBSERVED_LIB_SIZE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_obs_min_count': $( if [ ! -z ${VIASH_PAR_N_OBS_MIN_COUNT+x} ]; then echo "int(r'${VIASH_PAR_N_OBS_MIN_COUNT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_var_min_count': $( if [ ! -z ${VIASH_PAR_N_VAR_MIN_COUNT+x} ]; then echo "int(r'${VIASH_PAR_N_VAR_MIN_COUNT//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +# TODO: optionally, move to qa +# https://github.com/openpipelines-bio/openpipeline/issues/435 +def check_validity_anndata(adata, layer, obs_batch, n_obs_min_count, n_var_min_count): + assert check_nonnegative_integers(adata.layers[layer] if layer else adata.X), ( + "Make sure input adata contains raw_counts" + ) + + assert len(set(adata.var_names)) == len(adata.var_names), ( + "Dataset contains multiple genes with same gene name." + ) + + # Ensure every obs_batch category has sufficient observations + assert min(adata.obs[[obs_batch]].value_counts()) > n_obs_min_count, ( + f"Anndata has fewer than {n_obs_min_count} cells." + ) + + assert adata.n_vars > n_var_min_count, ( + f"Anndata has fewer than {n_var_min_count} genes." + ) + + +def main(): + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + check_validity_anndata( + adata_subset, + par["input_layer"], + par["obs_batch"], + par["n_obs_min_count"], + par["n_var_min_count"], + ) + + # Set up the data + scvi.model.SCVI.setup_anndata( + adata_subset, + batch_key=par["obs_batch"], + layer=par["input_layer"], + labels_key=par["obs_labels"], + size_factor_key=par["obs_size_factor"], + categorical_covariate_keys=par["obs_categorical_covariate"], + continuous_covariate_keys=par["obs_continuous_covariate"], + ) + + # Set up the model + vae_uns = scvi.model.SCVI( + adata_subset, + n_hidden=par["n_hidden_nodes"], + n_latent=par["n_dimensions_latent_space"], + n_layers=par["n_hidden_layers"], + dropout_rate=par["dropout_rate"], + dispersion=par["dispersion"], + gene_likelihood=par["gene_likelihood"], + use_layer_norm=par["use_layer_normalization"], + use_batch_norm=par["use_batch_normalization"], + encode_covariates=par[ + "encode_covariates" + ], # Default (True) is for better scArches performance -> maybe don't use this always? + deeply_inject_covariates=par[ + "deeply_inject_covariates" + ], # Default (False) for better scArches performance -> maybe don't use this always? + use_observed_lib_size=par[ + "use_observed_lib_size" + ], # When size_factors are not passed + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ]; then + VIASH_PAR_OUTPUT_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_MODEL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MODEL" ] && [ ! -e "$VIASH_PAR_OUTPUT_MODEL" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_MODEL' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/integrate/scvi/set_var_index.py b/target/executable/integrate/scvi/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/executable/integrate/scvi/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/executable/integrate/scvi/subset_vars.py b/target/executable/integrate/scvi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/integrate/scvi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/integrate/totalvi/.config.vsh.yaml b/target/executable/integrate/totalvi/.config.vsh.yaml new file mode 100644 index 00000000..96a3af63 --- /dev/null +++ b/target/executable/integrate/totalvi/.config.vsh.yaml @@ -0,0 +1,395 @@ +name: "totalvi" +namespace: "integrate" +version: "main" +authors: +- name: "Vladimir Shitov" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file with query data to integrate with reference." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "-r" + description: "Input h5mu file with reference data to train the TOTALVI model." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--force_retrain" + alternatives: + - "-f" + description: "If true, retrain the model and save it to reference_model_path" + info: null + direction: "input" + - type: "string" + name: "--query_modality" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--query_proteins_modality" + description: "Name of the modality in the input (query) h5mu file containing protein\ + \ data" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_modality" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_proteins_modality" + description: "Name of the modality containing proteins in the reference" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_integrated_totalvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_normalized_rna_output" + description: "In which .obsm slot to store the normalized RNA from TOTALVI." + info: null + default: + - "X_totalvi_normalized_rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_normalized_protein_output" + description: "In which .obsm slot to store the normalized protein data from TOTALVI." + info: null + default: + - "X_totalvi_normalized_protein" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference_model_path" + description: "Directory with the reference model. If not exists, trained model\ + \ will be saved there" + info: null + default: + - "totalvi_model_reference" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--query_model_path" + description: "Directory, where the query model will be saved" + info: null + default: + - "totalvi_model_query" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset" + info: null + default: + - 400 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_query_epochs" + description: "Number of passes through the dataset, when fine-tuning model for\ + \ query" + info: null + default: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--weight_decay" + description: "Weight decay, when fine-tuning model for query" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "jax[cuda]" + - "scvi-tools~=1.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/totalvi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integrate/totalvi" + executable: "target/executable/integrate/totalvi/totalvi" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/integrate/totalvi/nextflow_labels.config b/target/executable/integrate/totalvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/integrate/totalvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/integrate/totalvi/setup_logger.py b/target/executable/integrate/totalvi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/integrate/totalvi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/integrate/totalvi/totalvi b/target/executable/integrate/totalvi/totalvi new file mode 100755 index 00000000..aee5bb82 --- /dev/null +++ b/target/executable/integrate/totalvi/totalvi @@ -0,0 +1,1712 @@ +#!/usr/bin/env bash + +# totalvi main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="totalvi" +VIASH_META_FUNCTIONALITY_NAME="totalvi" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:25.05-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "jax[cuda]" "scvi-tools~=1.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component integrate totalvi" +LABEL org.opencontainers.image.created="2025-08-22T07:23:11Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "totalvi main" + echo "" + echo "Performs mapping to the reference by totalvi model:" + echo "https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI" + echo "" + echo "Inputs:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file with query data to integrate with reference." + echo "" + echo " -r, --reference" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file with reference data to train the TOTALVI model." + echo "" + echo " -f, --force_retrain" + echo " type: boolean_true" + echo " If true, retrain the model and save it to reference_model_path" + echo "" + echo " --query_modality" + echo " type: string" + echo " default: rna" + echo "" + echo " --query_proteins_modality" + echo " type: string" + echo " Name of the modality in the input (query) h5mu file containing protein" + echo " data" + echo "" + echo " --reference_modality" + echo " type: string" + echo " default: rna" + echo "" + echo " --reference_proteins_modality" + echo " type: string" + echo " default: prot" + echo " Name of the modality containing proteins in the reference" + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. If None, X is used" + echo "" + echo " --obs_batch" + echo " type: string" + echo " default: sample_id" + echo " Column name discriminating between your batches." + echo "" + echo " --var_input" + echo " type: string" + echo " .var column containing highly variable genes. By default, do not subset" + echo " genes." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --obsm_output" + echo " type: string" + echo " default: X_integrated_totalvi" + echo " In which .obsm slot to store the resulting integrated embedding." + echo "" + echo " --obsm_normalized_rna_output" + echo " type: string" + echo " default: X_totalvi_normalized_rna" + echo " In which .obsm slot to store the normalized RNA from TOTALVI." + echo "" + echo " --obsm_normalized_protein_output" + echo " type: string" + echo " default: X_totalvi_normalized_protein" + echo " In which .obsm slot to store the normalized protein data from TOTALVI." + echo "" + echo " --reference_model_path" + echo " type: file, output, file must exist" + echo " default: totalvi_model_reference" + echo " Directory with the reference model. If not exists, trained model will be" + echo " saved there" + echo "" + echo " --query_model_path" + echo " type: file, output, file must exist" + echo " default: totalvi_model_query" + echo " Directory, where the query model will be saved" + echo "" + echo "Learning parameters:" + echo " --max_epochs" + echo " type: integer" + echo " default: 400" + echo " Number of passes through the dataset" + echo "" + echo " --max_query_epochs" + echo " type: integer" + echo " default: 200" + echo " Number of passes through the dataset, when fine-tuning model for query" + echo "" + echo " --weight_decay" + echo " type: double" + echo " default: 0.0" + echo " Weight decay, when fine-tuning model for query" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "totalvi main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -r) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'-r\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -r. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_retrain) + [ -n "$VIASH_PAR_FORCE_RETRAIN" ] && ViashError Bad arguments for option \'--force_retrain\': \'$VIASH_PAR_FORCE_RETRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_RETRAIN=true + shift 1 + ;; + -f) + [ -n "$VIASH_PAR_FORCE_RETRAIN" ] && ViashError Bad arguments for option \'-f\': \'$VIASH_PAR_FORCE_RETRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_RETRAIN=true + shift 1 + ;; + --query_modality) + [ -n "$VIASH_PAR_QUERY_MODALITY" ] && ViashError Bad arguments for option \'--query_modality\': \'$VIASH_PAR_QUERY_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --query_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --query_modality=*) + [ -n "$VIASH_PAR_QUERY_MODALITY" ] && ViashError Bad arguments for option \'--query_modality=*\': \'$VIASH_PAR_QUERY_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --query_proteins_modality) + [ -n "$VIASH_PAR_QUERY_PROTEINS_MODALITY" ] && ViashError Bad arguments for option \'--query_proteins_modality\': \'$VIASH_PAR_QUERY_PROTEINS_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_PROTEINS_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --query_proteins_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --query_proteins_modality=*) + [ -n "$VIASH_PAR_QUERY_PROTEINS_MODALITY" ] && ViashError Bad arguments for option \'--query_proteins_modality=*\': \'$VIASH_PAR_QUERY_PROTEINS_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_PROTEINS_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_modality) + [ -n "$VIASH_PAR_REFERENCE_MODALITY" ] && ViashError Bad arguments for option \'--reference_modality\': \'$VIASH_PAR_REFERENCE_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_modality=*) + [ -n "$VIASH_PAR_REFERENCE_MODALITY" ] && ViashError Bad arguments for option \'--reference_modality=*\': \'$VIASH_PAR_REFERENCE_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_proteins_modality) + [ -n "$VIASH_PAR_REFERENCE_PROTEINS_MODALITY" ] && ViashError Bad arguments for option \'--reference_proteins_modality\': \'$VIASH_PAR_REFERENCE_PROTEINS_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_PROTEINS_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_proteins_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_proteins_modality=*) + [ -n "$VIASH_PAR_REFERENCE_PROTEINS_MODALITY" ] && ViashError Bad arguments for option \'--reference_proteins_modality=*\': \'$VIASH_PAR_REFERENCE_PROTEINS_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_PROTEINS_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch=*) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch=*\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_output=*) + [ -n "$VIASH_PAR_OBSM_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_output=*\': \'$VIASH_PAR_OBSM_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_normalized_rna_output) + [ -n "$VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_normalized_rna_output\': \'$VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_normalized_rna_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_normalized_rna_output=*) + [ -n "$VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_normalized_rna_output=*\': \'$VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_normalized_protein_output) + [ -n "$VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_normalized_protein_output\': \'$VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_normalized_protein_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_normalized_protein_output=*) + [ -n "$VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT" ] && ViashError Bad arguments for option \'--obsm_normalized_protein_output=*\': \'$VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_model_path) + [ -n "$VIASH_PAR_REFERENCE_MODEL_PATH" ] && ViashError Bad arguments for option \'--reference_model_path\': \'$VIASH_PAR_REFERENCE_MODEL_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_MODEL_PATH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_model_path. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_model_path=*) + [ -n "$VIASH_PAR_REFERENCE_MODEL_PATH" ] && ViashError Bad arguments for option \'--reference_model_path=*\': \'$VIASH_PAR_REFERENCE_MODEL_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_MODEL_PATH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --query_model_path) + [ -n "$VIASH_PAR_QUERY_MODEL_PATH" ] && ViashError Bad arguments for option \'--query_model_path\': \'$VIASH_PAR_QUERY_MODEL_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_MODEL_PATH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --query_model_path. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --query_model_path=*) + [ -n "$VIASH_PAR_QUERY_MODEL_PATH" ] && ViashError Bad arguments for option \'--query_model_path=*\': \'$VIASH_PAR_QUERY_MODEL_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUERY_MODEL_PATH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_epochs) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_epochs=*) + [ -n "$VIASH_PAR_MAX_EPOCHS" ] && ViashError Bad arguments for option \'--max_epochs=*\': \'$VIASH_PAR_MAX_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_query_epochs) + [ -n "$VIASH_PAR_MAX_QUERY_EPOCHS" ] && ViashError Bad arguments for option \'--max_query_epochs\': \'$VIASH_PAR_MAX_QUERY_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_QUERY_EPOCHS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_query_epochs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_query_epochs=*) + [ -n "$VIASH_PAR_MAX_QUERY_EPOCHS" ] && ViashError Bad arguments for option \'--max_query_epochs=*\': \'$VIASH_PAR_MAX_QUERY_EPOCHS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_QUERY_EPOCHS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --weight_decay) + [ -n "$VIASH_PAR_WEIGHT_DECAY" ] && ViashError Bad arguments for option \'--weight_decay\': \'$VIASH_PAR_WEIGHT_DECAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WEIGHT_DECAY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --weight_decay. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --weight_decay=*) + [ -n "$VIASH_PAR_WEIGHT_DECAY" ] && ViashError Bad arguments for option \'--weight_decay=*\': \'$VIASH_PAR_WEIGHT_DECAY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WEIGHT_DECAY=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/integrate/totalvi:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then + VIASH_PAR_FORCE_RETRAIN="false" +fi +if [ -z ${VIASH_PAR_QUERY_MODALITY+x} ]; then + VIASH_PAR_QUERY_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_REFERENCE_MODALITY+x} ]; then + VIASH_PAR_REFERENCE_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_REFERENCE_PROTEINS_MODALITY+x} ]; then + VIASH_PAR_REFERENCE_PROTEINS_MODALITY="prot" +fi +if [ -z ${VIASH_PAR_OBS_BATCH+x} ]; then + VIASH_PAR_OBS_BATCH="sample_id" +fi +if [ -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then + VIASH_PAR_OBSM_OUTPUT="X_integrated_totalvi" +fi +if [ -z ${VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT+x} ]; then + VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT="X_totalvi_normalized_rna" +fi +if [ -z ${VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT+x} ]; then + VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT="X_totalvi_normalized_protein" +fi +if [ -z ${VIASH_PAR_REFERENCE_MODEL_PATH+x} ]; then + VIASH_PAR_REFERENCE_MODEL_PATH="totalvi_model_reference" +fi +if [ -z ${VIASH_PAR_QUERY_MODEL_PATH+x} ]; then + VIASH_PAR_QUERY_MODEL_PATH="totalvi_model_query" +fi +if [ -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then + VIASH_PAR_MAX_EPOCHS="400" +fi +if [ -z ${VIASH_PAR_MAX_QUERY_EPOCHS+x} ]; then + VIASH_PAR_MAX_QUERY_EPOCHS="200" +fi +if [ -z ${VIASH_PAR_WEIGHT_DECAY+x} ]; then + VIASH_PAR_WEIGHT_DECAY="0.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_FORCE_RETRAIN" ]]; then + if ! [[ "$VIASH_PAR_FORCE_RETRAIN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--force_retrain' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_MAX_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_QUERY_EPOCHS" ]]; then + if ! [[ "$VIASH_PAR_MAX_QUERY_EPOCHS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_query_epochs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WEIGHT_DECAY" ]]; then + if ! [[ "$VIASH_PAR_WEIGHT_DECAY" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--weight_decay' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_REFERENCE_MODEL_PATH" ] && [ ! -d "$(dirname "$VIASH_PAR_REFERENCE_MODEL_PATH")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REFERENCE_MODEL_PATH")" +fi +if [ ! -z "$VIASH_PAR_QUERY_MODEL_PATH" ] && [ ! -d "$(dirname "$VIASH_PAR_QUERY_MODEL_PATH")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_QUERY_MODEL_PATH")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_REFERENCE_MODEL_PATH" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE_MODEL_PATH")" ) + VIASH_PAR_REFERENCE_MODEL_PATH=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE_MODEL_PATH") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REFERENCE_MODEL_PATH" ) +fi +if [ ! -z "$VIASH_PAR_QUERY_MODEL_PATH" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_QUERY_MODEL_PATH")" ) + VIASH_PAR_QUERY_MODEL_PATH=$(ViashDockerAutodetectMount "$VIASH_PAR_QUERY_MODEL_PATH") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_QUERY_MODEL_PATH" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-totalvi-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from typing import Tuple + +import os +import sys +import mudata +from anndata import AnnData # For type hints +from mudata import MuData # For type hints +import numpy as np +import scvi +from scipy.sparse import issparse + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'force_retrain': $( if [ ! -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then echo "r'${VIASH_PAR_FORCE_RETRAIN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'query_modality': $( if [ ! -z ${VIASH_PAR_QUERY_MODALITY+x} ]; then echo "r'${VIASH_PAR_QUERY_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'query_proteins_modality': $( if [ ! -z ${VIASH_PAR_QUERY_PROTEINS_MODALITY+x} ]; then echo "r'${VIASH_PAR_QUERY_PROTEINS_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_modality': $( if [ ! -z ${VIASH_PAR_REFERENCE_MODALITY+x} ]; then echo "r'${VIASH_PAR_REFERENCE_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_proteins_modality': $( if [ ! -z ${VIASH_PAR_REFERENCE_PROTEINS_MODALITY+x} ]; then echo "r'${VIASH_PAR_REFERENCE_PROTEINS_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_normalized_rna_output': $( if [ ! -z ${VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_normalized_protein_output': $( if [ ! -z ${VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_model_path': $( if [ ! -z ${VIASH_PAR_REFERENCE_MODEL_PATH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_MODEL_PATH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'query_model_path': $( if [ ! -z ${VIASH_PAR_QUERY_MODEL_PATH+x} ]; then echo "r'${VIASH_PAR_QUERY_MODEL_PATH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_query_epochs': $( if [ ! -z ${VIASH_PAR_MAX_QUERY_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_QUERY_EPOCHS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'weight_decay': $( if [ ! -z ${VIASH_PAR_WEIGHT_DECAY+x} ]; then echo "float(r'${VIASH_PAR_WEIGHT_DECAY//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def align_proteins_names( + adata_reference: AnnData, + mdata_query: MuData, + adata_query: AnnData, + reference_proteins_key: str, + query_proteins_key: str, +) -> AnnData: + """Make sure that proteins are located in the same .obsm slot in reference and query. Pad query proteins with zeros if they are absent""" + proteins_reference = adata_reference.obsm[reference_proteins_key] + + # If query has no protein data, put matrix of zeros + if not query_proteins_key or query_proteins_key not in mdata_query.mod: + adata_query.obsm[reference_proteins_key] = np.zeros( + (adata_query.n_obs, proteins_reference.shape[1]) + ) + else: + # Make sure that proteins expression has the same key in query and reference + adata_query.obsm[reference_proteins_key] = adata_query.obsm[query_proteins_key] + + return adata_query + + +def extract_proteins_to_anndata( + mdata: MuData, rna_modality_key, protein_modality_key, input_layer, hvg_var_key=None +) -> AnnData: + """TOTALVI requires data to be stored in AnnData format with protein counts in .obsm slot. This function performs the conversion""" + adata: AnnData = mdata.mod[rna_modality_key].copy() + + if hvg_var_key: + selected_genes = adata.var_names[adata.var[hvg_var_key]] + adata = adata[:, selected_genes].copy() + + if protein_modality_key in mdata.mod: + # Put the proteins modality into .obsm slot + proteins_reference_adata = mdata.mod[protein_modality_key].copy() + + if input_layer is None: + proteins = proteins_reference_adata.X + else: + proteins = proteins_reference_adata.obsm[input_layer] + + if issparse(proteins): + proteins = proteins.toarray() + + adata.obsm[protein_modality_key] = proteins + + return adata + + +def build_reference_model( + adata_reference: AnnData, max_train_epochs: int = 400 +) -> scvi.model.TOTALVI: + vae_reference = scvi.model.TOTALVI( + adata_reference, use_layer_norm="both", use_batch_norm="none" + ) + vae_reference.train(max_train_epochs) + + vae_reference.save(par["reference_model_path"]) + + return vae_reference + + +def is_retraining_model() -> bool: + """Decide, whether reference model should be trained. It happens when no model exists or force_retrain flag is on""" + + trained_model_exists = os.path.isdir(par["reference_model_path"]) and ( + "model.pt" in os.listdir(par["reference_model_path"]) + ) + return not trained_model_exists or par["force_retrain"] + + +def map_query_to_reference( + mdata_reference: MuData, mdata_query: MuData, adata_query: AnnData +) -> Tuple[scvi.model.TOTALVI, AnnData]: + """Build model on the provided reference if necessary, and map query to the reference""" + + adata_reference: AnnData = extract_proteins_to_anndata( + mdata_reference, + rna_modality_key=par["reference_modality"], + protein_modality_key=par["reference_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + scvi.model.TOTALVI.setup_anndata( + adata_reference, + batch_key=par["obs_batch"], + protein_expression_obsm_key=par["reference_proteins_modality"], + ) + + if is_retraining_model(): + vae_reference = build_reference_model( + adata_reference, max_train_epochs=par["max_epochs"] + ) + else: + vae_reference = scvi.model.TOTALVI.load( + dir_path=par["reference_model_path"], adata=adata_reference + ) + + adata_query: AnnData = align_proteins_names( + adata_reference, + mdata_query, + adata_query, + reference_proteins_key=par["reference_proteins_modality"], + query_proteins_key=par["query_proteins_modality"], + ) + + # Reorder genes and pad missing genes with 0s + scvi.model.TOTALVI.prepare_query_anndata(adata_query, vae_reference) + + # Train the model for query + vae_query = scvi.model.TOTALVI.load_query_data(adata_query, vae_reference) + vae_query.train( + par["max_query_epochs"], plan_kwargs=dict(weight_decay=par["weight_decay"]) + ) + + return vae_query, adata_query + + +def main(): + mdata_query = mudata.read(par["input"].strip()) + adata_query = extract_proteins_to_anndata( + mdata_query, + rna_modality_key=par["query_modality"], + protein_modality_key=par["query_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + if par["reference"].endswith(".h5mu"): + logger.info("Reading reference") + mdata_reference = mudata.read(par["reference"].strip()) + + logger.info("Mapping query to the reference") + vae_query, adata_query = map_query_to_reference( + mdata_reference, mdata_query, adata_query + ) + else: + raise ValueError("Incorrect format of reference, please provide a .h5mu file") + + adata_query.uns["integration_method"] = "totalvi" + + logger.info("Getting the latent representation of query") + mdata_query.mod[par["query_modality"]].obsm[par["obsm_output"]] = ( + vae_query.get_latent_representation() + ) + + norm_rna, norm_protein = vae_query.get_normalized_expression() + mdata_query.mod[par["query_modality"]].obsm[par["obsm_normalized_rna_output"]] = ( + norm_rna.to_numpy() + ) + + if par["query_proteins_modality"] in mdata_query.mod: + mdata_query.mod[par["query_proteins_modality"]].obsm[ + par["obsm_normalized_protein_output"] + ] = norm_protein.to_numpy() + + logger.info("Updating mdata") + mdata_query.update() + + logger.info("Saving updated query data") + mdata_query.write_h5mu(par["output"].strip()) + + logger.info("Saving query model") + vae_query.save(par["query_model_path"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE_MODEL_PATH" ]; then + VIASH_PAR_REFERENCE_MODEL_PATH=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE_MODEL_PATH") + fi + if [ ! -z "$VIASH_PAR_QUERY_MODEL_PATH" ]; then + VIASH_PAR_QUERY_MODEL_PATH=$(ViashDockerStripAutomount "$VIASH_PAR_QUERY_MODEL_PATH") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE_MODEL_PATH" ] && [ ! -e "$VIASH_PAR_REFERENCE_MODEL_PATH" ]; then + ViashError "Output file '$VIASH_PAR_REFERENCE_MODEL_PATH' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_QUERY_MODEL_PATH" ] && [ ! -e "$VIASH_PAR_QUERY_MODEL_PATH" ]; then + ViashError "Output file '$VIASH_PAR_QUERY_MODEL_PATH' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/interpret/lianapy/.config.vsh.yaml b/target/executable/interpret/lianapy/.config.vsh.yaml new file mode 100644 index 00000000..8137d883 --- /dev/null +++ b/target/executable/interpret/lianapy/.config.vsh.yaml @@ -0,0 +1,404 @@ +name: "lianapy" +namespace: "interpret" +version: "main" +authors: +- name: "Mauro Saporita" + roles: + - "author" + info: + role: "Contributor" + links: + email: "maurosaporita@gmail.com" + github: "mauro-saporita" + linkedin: "mauro-saporita-930b06a5" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Lead Nextflow Developer" +- name: "Povilas Gibas" + roles: + - "author" + info: + role: "Contributor" + links: + email: "povilasgibas@gmail.com" + github: "PoGibas" + linkedin: "povilas-gibas" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Bioinformatician" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer in anndata.AnnData.layers to use. If None, use mudata.mod[modality].X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--de_method" + description: "Differential expression method. `scanpy.tl.rank_genes_groups` is\ + \ used to rank genes according to 1vsRest." + info: null + default: + - "t-test" + required: false + choices: + - "logreg" + - "t-test" + - "wilcoxon" + - "t-test_overestim_var" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--groupby" + description: "The key of the observations grouping to consider." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--resource_name" + description: "Name of the resource to be loaded and use for ligand-receptor inference." + info: null + default: + - "consensus" + required: false + choices: + - "baccin2019" + - "cellcall" + - "cellchatdb" + - "cellinker" + - "cellphonedb" + - "celltalkdb" + - "connectomedb2020" + - "consensus" + - "embrace" + - "guide2pharma" + - "hpmr" + - "icellnet" + - "italk" + - "kirouac2010" + - "lrdb" + - "mouseconsensus" + - "ramilowski2015" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_symbol" + description: "Column name in var DataFrame in which gene symbol are stored." + info: null + default: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--expr_prop" + description: "Minimum expression proportion for the ligands/receptors (and their\ + \ subunits) in the corresponding cell identities. Set to '0', to return unfiltered\ + \ results." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "Minimum cells per cell identity ('groupby') to be considered for\ + \ downstream analysis." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--aggregate_method" + description: "Method aggregation approach, one of ['mean', 'rra'], where 'mean'\ + \ represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde et\ + \ al., 2014) of the interactions." + info: null + default: + - "rra" + required: false + choices: + - "mean" + - "rra" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--consensus_opts" + description: "Strategies to aggregate interactions across methods. Default is\ + \ None - i.e. ['Specificity', 'Magnitude'] and both specificity and magnitude\ + \ are aggregated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--return_all_lrs" + description: "Bool whether to return all LRs, or only those that surpass the 'expr_prop'\ + \ threshold. Those interactions that do not pass the 'expr_prop' threshold will\ + \ be assigned to the *worst* score of the ones that do. 'False' by default." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_perms" + description: "Number of permutations for the permutation test. Note that this\ + \ is relevant only for permutation-based methods - e.g. 'CellPhoneDB" + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs LIANA integration based as described in https://github.com/saezlab/liana-py" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "liana~=1.5.0" + - "scipy<1.16" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/interpret/lianapy/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/interpret/lianapy" + executable: "target/executable/interpret/lianapy/lianapy" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/interpret/lianapy/compress_h5mu.py b/target/executable/interpret/lianapy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/interpret/lianapy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/interpret/lianapy/lianapy b/target/executable/interpret/lianapy/lianapy new file mode 100755 index 00000000..54ff510e --- /dev/null +++ b/target/executable/interpret/lianapy/lianapy @@ -0,0 +1,1544 @@ +#!/usr/bin/env bash + +# lianapy main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Mauro Saporita (author) +# * Povilas Gibas (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="lianapy" +VIASH_META_FUNCTIONALITY_NAME="lianapy" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "liana~=1.5.0" "scipy<1.16" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Mauro Saporita, Povilas Gibas" +LABEL org.opencontainers.image.description="Companion container for running component interpret lianapy" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "lianapy main" + echo "" + echo "Performs LIANA integration based as described in" + echo "https://github.com/saezlab/liana-py" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " Layer in anndata.AnnData.layers to use. If None, use" + echo " mudata.mod[modality].X." + echo "" + echo " --de_method" + echo " type: string" + echo " default: t-test" + echo " choices: [ logreg, t-test, wilcoxon, t-test_overestim_var ]" + echo " Differential expression method. \`scanpy.tl.rank_genes_groups\` is used to" + echo " rank genes according to 1vsRest." + echo "" + echo " --groupby" + echo " type: string, required parameter" + echo " The key of the observations grouping to consider." + echo "" + echo " --resource_name" + echo " type: string" + echo " default: consensus" + echo " choices: [ baccin2019, cellcall, cellchatdb, cellinker, cellphonedb," + echo "celltalkdb, connectomedb2020, consensus, embrace, guide2pharma, hpmr, icellnet," + echo "italk, kirouac2010, lrdb, mouseconsensus, ramilowski2015 ]" + echo " Name of the resource to be loaded and use for ligand-receptor inference." + echo "" + echo " --gene_symbol" + echo " type: string" + echo " default: gene_symbol" + echo " Column name in var DataFrame in which gene symbol are stored." + echo "" + echo " --expr_prop" + echo " type: double" + echo " default: 0.1" + echo " Minimum expression proportion for the ligands/receptors (and their" + echo " subunits) in the corresponding cell identities. Set to '0', to return" + echo " unfiltered results." + echo "" + echo " --min_cells" + echo " type: integer" + echo " default: 5" + echo " Minimum cells per cell identity ('groupby') to be considered for" + echo " downstream analysis." + echo "" + echo " --aggregate_method" + echo " type: string" + echo " default: rra" + echo " choices: [ mean, rra ]" + echo " Method aggregation approach, one of ['mean', 'rra'], where 'mean'" + echo " represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde" + echo " et al., 2014) of the interactions." + echo "" + echo " --consensus_opts" + echo " type: string, multiple values allowed" + echo " Strategies to aggregate interactions across methods. Default is None -" + echo " i.e. ['Specificity', 'Magnitude'] and both specificity and magnitude are" + echo " aggregated." + echo "" + echo " --return_all_lrs" + echo " type: boolean" + echo " default: false" + echo " Bool whether to return all LRs, or only those that surpass the" + echo " 'expr_prop' threshold. Those interactions that do not pass the" + echo " 'expr_prop' threshold will be assigned to the *worst* score of the ones" + echo " that do. 'False' by default." + echo "" + echo " --n_perms" + echo " type: integer" + echo " default: 100" + echo " Number of permutations for the permutation test. Note that this is" + echo " relevant only for permutation-based methods - e.g. 'CellPhoneDB" + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "lianapy main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --de_method) + [ -n "$VIASH_PAR_DE_METHOD" ] && ViashError Bad arguments for option \'--de_method\': \'$VIASH_PAR_DE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --de_method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --de_method=*) + [ -n "$VIASH_PAR_DE_METHOD" ] && ViashError Bad arguments for option \'--de_method=*\': \'$VIASH_PAR_DE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DE_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --groupby) + [ -n "$VIASH_PAR_GROUPBY" ] && ViashError Bad arguments for option \'--groupby\': \'$VIASH_PAR_GROUPBY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUPBY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --groupby. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --groupby=*) + [ -n "$VIASH_PAR_GROUPBY" ] && ViashError Bad arguments for option \'--groupby=*\': \'$VIASH_PAR_GROUPBY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GROUPBY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --resource_name) + [ -n "$VIASH_PAR_RESOURCE_NAME" ] && ViashError Bad arguments for option \'--resource_name\': \'$VIASH_PAR_RESOURCE_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RESOURCE_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --resource_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --resource_name=*) + [ -n "$VIASH_PAR_RESOURCE_NAME" ] && ViashError Bad arguments for option \'--resource_name=*\': \'$VIASH_PAR_RESOURCE_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RESOURCE_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_symbol) + [ -n "$VIASH_PAR_GENE_SYMBOL" ] && ViashError Bad arguments for option \'--gene_symbol\': \'$VIASH_PAR_GENE_SYMBOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_SYMBOL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_symbol. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_symbol=*) + [ -n "$VIASH_PAR_GENE_SYMBOL" ] && ViashError Bad arguments for option \'--gene_symbol=*\': \'$VIASH_PAR_GENE_SYMBOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_SYMBOL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expr_prop) + [ -n "$VIASH_PAR_EXPR_PROP" ] && ViashError Bad arguments for option \'--expr_prop\': \'$VIASH_PAR_EXPR_PROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPR_PROP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expr_prop. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expr_prop=*) + [ -n "$VIASH_PAR_EXPR_PROP" ] && ViashError Bad arguments for option \'--expr_prop=*\': \'$VIASH_PAR_EXPR_PROP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPR_PROP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells=*) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells=*\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --aggregate_method) + [ -n "$VIASH_PAR_AGGREGATE_METHOD" ] && ViashError Bad arguments for option \'--aggregate_method\': \'$VIASH_PAR_AGGREGATE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AGGREGATE_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --aggregate_method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --aggregate_method=*) + [ -n "$VIASH_PAR_AGGREGATE_METHOD" ] && ViashError Bad arguments for option \'--aggregate_method=*\': \'$VIASH_PAR_AGGREGATE_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AGGREGATE_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --consensus_opts) + if [ -z "$VIASH_PAR_CONSENSUS_OPTS" ]; then + VIASH_PAR_CONSENSUS_OPTS="$2" + else + VIASH_PAR_CONSENSUS_OPTS="$VIASH_PAR_CONSENSUS_OPTS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --consensus_opts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --consensus_opts=*) + if [ -z "$VIASH_PAR_CONSENSUS_OPTS" ]; then + VIASH_PAR_CONSENSUS_OPTS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CONSENSUS_OPTS="$VIASH_PAR_CONSENSUS_OPTS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --return_all_lrs) + [ -n "$VIASH_PAR_RETURN_ALL_LRS" ] && ViashError Bad arguments for option \'--return_all_lrs\': \'$VIASH_PAR_RETURN_ALL_LRS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RETURN_ALL_LRS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --return_all_lrs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --return_all_lrs=*) + [ -n "$VIASH_PAR_RETURN_ALL_LRS" ] && ViashError Bad arguments for option \'--return_all_lrs=*\': \'$VIASH_PAR_RETURN_ALL_LRS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RETURN_ALL_LRS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_perms) + [ -n "$VIASH_PAR_N_PERMS" ] && ViashError Bad arguments for option \'--n_perms\': \'$VIASH_PAR_N_PERMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PERMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_perms. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_perms=*) + [ -n "$VIASH_PAR_N_PERMS" ] && ViashError Bad arguments for option \'--n_perms=*\': \'$VIASH_PAR_N_PERMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PERMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/interpret/lianapy:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_GROUPBY+x} ]; then + ViashError '--groupby' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_DE_METHOD+x} ]; then + VIASH_PAR_DE_METHOD="t-test" +fi +if [ -z ${VIASH_PAR_RESOURCE_NAME+x} ]; then + VIASH_PAR_RESOURCE_NAME="consensus" +fi +if [ -z ${VIASH_PAR_GENE_SYMBOL+x} ]; then + VIASH_PAR_GENE_SYMBOL="gene_symbol" +fi +if [ -z ${VIASH_PAR_EXPR_PROP+x} ]; then + VIASH_PAR_EXPR_PROP="0.1" +fi +if [ -z ${VIASH_PAR_MIN_CELLS+x} ]; then + VIASH_PAR_MIN_CELLS="5" +fi +if [ -z ${VIASH_PAR_AGGREGATE_METHOD+x} ]; then + VIASH_PAR_AGGREGATE_METHOD="rra" +fi +if [ -z ${VIASH_PAR_RETURN_ALL_LRS+x} ]; then + VIASH_PAR_RETURN_ALL_LRS="false" +fi +if [ -z ${VIASH_PAR_N_PERMS+x} ]; then + VIASH_PAR_N_PERMS="100" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EXPR_PROP" ]]; then + if ! [[ "$VIASH_PAR_EXPR_PROP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--expr_prop' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RETURN_ALL_LRS" ]]; then + if ! [[ "$VIASH_PAR_RETURN_ALL_LRS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--return_all_lrs' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_PERMS" ]]; then + if ! [[ "$VIASH_PAR_N_PERMS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_perms' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_DE_METHOD" ]; then + VIASH_PAR_DE_METHOD_CHOICES=("logreg;t-test;wilcoxon;t-test_overestim_var") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_DE_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_DE_METHOD;" ]]; then + ViashError '--de_method' specified value of \'$VIASH_PAR_DE_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_RESOURCE_NAME" ]; then + VIASH_PAR_RESOURCE_NAME_CHOICES=("baccin2019;cellcall;cellchatdb;cellinker;cellphonedb;celltalkdb;connectomedb2020;consensus;embrace;guide2pharma;hpmr;icellnet;italk;kirouac2010;lrdb;mouseconsensus;ramilowski2015") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_RESOURCE_NAME_CHOICES[*]};" =~ ";$VIASH_PAR_RESOURCE_NAME;" ]]; then + ViashError '--resource_name' specified value of \'$VIASH_PAR_RESOURCE_NAME\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_AGGREGATE_METHOD" ]; then + VIASH_PAR_AGGREGATE_METHOD_CHOICES=("mean;rra") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_AGGREGATE_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_AGGREGATE_METHOD;" ]]; then + ViashError '--aggregate_method' specified value of \'$VIASH_PAR_AGGREGATE_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-lianapy-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import liana +import mudata + +# TODO: Remove when grouping labels exist +# For sign/PCA/ +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'de_method': $( if [ ! -z ${VIASH_PAR_DE_METHOD+x} ]; then echo "r'${VIASH_PAR_DE_METHOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'groupby': $( if [ ! -z ${VIASH_PAR_GROUPBY+x} ]; then echo "r'${VIASH_PAR_GROUPBY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resource_name': $( if [ ! -z ${VIASH_PAR_RESOURCE_NAME+x} ]; then echo "r'${VIASH_PAR_RESOURCE_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gene_symbol': $( if [ ! -z ${VIASH_PAR_GENE_SYMBOL+x} ]; then echo "r'${VIASH_PAR_GENE_SYMBOL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'expr_prop': $( if [ ! -z ${VIASH_PAR_EXPR_PROP+x} ]; then echo "float(r'${VIASH_PAR_EXPR_PROP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'aggregate_method': $( if [ ! -z ${VIASH_PAR_AGGREGATE_METHOD+x} ]; then echo "r'${VIASH_PAR_AGGREGATE_METHOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'consensus_opts': $( if [ ! -z ${VIASH_PAR_CONSENSUS_OPTS+x} ]; then echo "r'${VIASH_PAR_CONSENSUS_OPTS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'return_all_lrs': $( if [ ! -z ${VIASH_PAR_RETURN_ALL_LRS+x} ]; then echo "r'${VIASH_PAR_RETURN_ALL_LRS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'n_perms': $( if [ ! -z ${VIASH_PAR_N_PERMS+x} ]; then echo "int(r'${VIASH_PAR_N_PERMS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + # Get input data + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + # Add dummy grouping labels when they do not exist + if par["groupby"] not in mod.obs: + raise ValueError( + f"Column {par['groupy']} does not exist in " + f".obs for modality {par['modality']}." + ) + mod_col = mod.obs[par["groupby"]] + original_groupby_col = mod_col.copy() + if not isinstance(mod_col, pd.CategoricalDtype): + mod.obs[par["groupby"]] = mod_col.astype(str).astype("category") + + # Solve gene labels + orig_gene_label = mod.var.index + mod.var_names = mod.var[par["gene_symbol"]].astype(str) + mod.var_names_make_unique() + + if not meta.get("cpus"): + meta["cpus"] = 1 + n_jobs = meta["cpus"] - 1 if meta["cpus"] > 2 else 1 + liana.mt.rank_aggregate( + adata=mod, + groupby=par["groupby"], + resource_name=par["resource_name"], + expr_prop=par["expr_prop"], + min_cells=par["min_cells"], + aggregate_method=par["aggregate_method"], + consensus_opts=par["consensus_opts"], + return_all_lrs=par["return_all_lrs"], + layer=par["layer"], + de_method=par["de_method"], + n_perms=par["n_perms"], + n_jobs=n_jobs, + verbose=True, + inplace=True, + use_raw=False, + ) + + # Return original gene labels + mod.var_names = orig_gene_label + + # Undo modifications to groupby column + mod.obs[par["groupby"]] = original_groupby_col + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/interpret/lianapy/nextflow_labels.config b/target/executable/interpret/lianapy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/interpret/lianapy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/labels_transfer/knn/.config.vsh.yaml b/target/executable/labels_transfer/knn/.config.vsh.yaml new file mode 100644 index 00000000..abd5304e --- /dev/null +++ b/target/executable/labels_transfer/knn/.config.vsh.yaml @@ -0,0 +1,502 @@ +name: "knn" +namespace: "labels_transfer" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The query data to transfer the labels to. Should be a .h5mu file." + info: + label: "Query" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's inference,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + required: false + description: "The embedding to use for the classifier's inference.\ + \ Override using the `--input_obsm_features` argument. If not provided,\ + \ the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g.\ + \ by the same model or preprocessing).\n" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to use." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's inference.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g. by the same\ + \ model or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train classifiers on." + info: + label: "Reference" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's training,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + description: "The embedding to use for the classifier's training.\ + \ Override using the `--reference_obsm_features` argument.\nMake\ + \ sure that embedding was obtained in the same way as the query\ + \ embedding (e.g. by the same model or preprocessing).\n" + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + description: "The target labels to transfer. Override using the `--reference_obs_targets`\ + \ argument." + required: true + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's training.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the query embedding (e.g. by the same model\ + \ or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_targets" + description: "The `.obs` key(s) of the target labels to tranfer." + info: null + default: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels transfered\ + \ from the reference." + info: + label: "Output data" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + obs: + - type: "string" + name: "predictions" + description: "The predicted labels. Override using the `--output_obs_predictions`\ + \ argument." + required: true + - type: "double" + name: "probability" + description: "The probability of the predicted labels. Override using\ + \ the `--output_obs_probability` argument." + required: false + obsm: + - type: "double" + name: "X_scvi" + description: "The embedding used for the classifier's inference. Could\ + \ have any name, specified by `input_obsm_features` argument.\"" + required: false + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Input dataset (query) arguments" + arguments: + - type: "string" + name: "--input_obsm_distances" + description: "The `.obsm` key of the input (query) dataset containing pre-calculated\ + \ distances. \nIf not provided, the distances will be calculated using PyNNDescent.\n\ + Make sure the distance matrix contains distances relative to the reference dataset\ + \ and were obtained in the same way as the reference embedding.\n" + info: null + example: + - "bbknn_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "string" + name: "--reference_obsm_distances" + description: "The `.obsm` key of the reference dataset containing pre-calculated\ + \ distances. \nIf not provided, the distances will be calculated using PyNNDescent.\n" + info: null + example: + - "bbknn_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "KNN label transfer arguments" + arguments: + - type: "string" + name: "--weights" + description: "Weight function used in prediction. Possible values are:\n- `uniform`\ + \ - all points in each neighborhood are weighted equally \n- `distance` - weight\ + \ points by the inverse of their distance\n- `gaussian` - weight points by the\ + \ sum of their Gaussian kernel similarities to each sample\n" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "distance" + - "gaussian" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "The number of neighbors to use in k-neighbor graph structure used\ + \ for fast approximate nearest neighbor search with PyNNDescent. \nLarger values\ + \ will result in more accurate search results at the cost of computation time.\n" + info: null + default: + - 15 + required: false + min: 5 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component performs label transfer from reference to query using\ + \ a K-Neirest Neighbors classifier.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "pynndescent~=0.5.10" + - "numpy<2" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/labels_transfer/knn/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/labels_transfer/knn" + executable: "target/executable/labels_transfer/knn/knn" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/labels_transfer/knn/compress_h5mu.py b/target/executable/labels_transfer/knn/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/labels_transfer/knn/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/labels_transfer/knn/helper.py b/target/executable/labels_transfer/knn/helper.py new file mode 100644 index 00000000..06d5d3a3 --- /dev/null +++ b/target/executable/labels_transfer/knn/helper.py @@ -0,0 +1,42 @@ +def check_arguments(par): + # check output .obs predictions + if not par["output_obs_predictions"]: + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + ) + + # check output .obs uncertainty + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + ) + + return par + + +def get_reference_features(adata_reference, par, logger): + if par["reference_obsm_features"] is None: + logger.info("Using .X of reference data") + train_data = adata_reference.X + else: + logger.info(f"Using .obsm[{par['reference_obsm_features']}] of reference data") + train_data = adata_reference.obsm[par["reference_obsm_features"]] + + return train_data + + +def get_query_features(adata, par, logger): + if par["input_obsm_features"] is None: + logger.info("Using .X of query data") + query_data = adata.X + else: + logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") + query_data = adata.obsm[par["input_obsm_features"]] + + return query_data diff --git a/target/executable/labels_transfer/knn/knn b/target/executable/labels_transfer/knn/knn new file mode 100755 index 00000000..e44b3b7c --- /dev/null +++ b/target/executable/labels_transfer/knn/knn @@ -0,0 +1,1586 @@ +#!/usr/bin/env bash + +# knn main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="knn" +VIASH_META_FUNCTIONALITY_NAME="knn" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps pkg-config libhdf5-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pynndescent~=0.5.10" "numpy<2" + +LABEL org.opencontainers.image.authors="Dorien Roosen, Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component labels_transfer knn" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "knn main" + echo "" + echo "This component performs label transfer from reference to query using a K-Neirest" + echo "Neighbors classifier." + echo "" + echo "Input dataset (query) arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " The query data to transfer the labels to. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to use." + echo "" + echo " --input_obsm_features" + echo " type: string" + echo " example: X_scvi" + echo " The \`.obsm\` key of the embedding to use for the classifier's inference." + echo " If not provided, the \`.X\` slot will be used instead." + echo " Make sure that embedding was obtained in the same way as the reference" + echo " embedding (e.g. by the same model or preprocessing)." + echo "" + echo "Reference dataset arguments:" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train classifiers on." + echo "" + echo " --reference_obsm_features" + echo " type: string" + echo " example: X_scvi" + echo " The \`.obsm\` key of the embedding to use for the classifier's training." + echo " If not provided, the \`.X\` slot will be used instead." + echo " Make sure that embedding was obtained in the same way as the query" + echo " embedding (e.g. by the same model or preprocessing)." + echo "" + echo " --reference_obs_targets" + echo " type: string, multiple values allowed" + echo " default:" + echo "ann_level_1;ann_level_2;ann_level_3;ann_level_4;ann_level_5;ann_finest_level" + echo " The \`.obs\` key(s) of the target labels to tranfer." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " The query data in .h5mu format with predicted labels transfered from the" + echo " reference." + echo "" + echo " --output_obs_predictions" + echo " type: string, multiple values allowed" + echo " In which \`.obs\` slots to store the predicted information." + echo " If provided, must have the same length as \`--reference_obs_targets\`." + echo " If empty, will default to the \`reference_obs_targets\` combined with the" + echo " \`\"_pred\"\` suffix." + echo "" + echo " --output_obs_probability" + echo " type: string, multiple values allowed" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo " If provided, must have the same length as \`--reference_obs_targets\`." + echo " If empty, will default to the \`reference_obs_targets\` combined with the" + echo " \`\"_probability\"\` suffix." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Input dataset (query) arguments:" + echo " --input_obsm_distances" + echo " type: string" + echo " example: bbknn_distances" + echo " The \`.obsm\` key of the input (query) dataset containing pre-calculated" + echo " distances." + echo " If not provided, the distances will be calculated using PyNNDescent." + echo " Make sure the distance matrix contains distances relative to the" + echo " reference dataset and were obtained in the same way as the reference" + echo " embedding." + echo "" + echo "Reference dataset arguments:" + echo " --reference_obsm_distances" + echo " type: string" + echo " example: bbknn_distances" + echo " The \`.obsm\` key of the reference dataset containing pre-calculated" + echo " distances." + echo " If not provided, the distances will be calculated using PyNNDescent." + echo "" + echo "KNN label transfer arguments:" + echo " --weights" + echo " type: string" + echo " default: uniform" + echo " choices: [ uniform, distance, gaussian ]" + echo " Weight function used in prediction. Possible values are:" + echo " - \`uniform\` - all points in each neighborhood are weighted equally" + echo " - \`distance\` - weight points by the inverse of their distance" + echo " - \`gaussian\` - weight points by the sum of their Gaussian kernel" + echo " similarities to each sample" + echo "" + echo " --n_neighbors" + echo " type: integer" + echo " default: 15" + echo " min: 5" + echo " The number of neighbors to use in k-neighbor graph structure used for" + echo " fast approximate nearest neighbor search with PyNNDescent." + echo " Larger values will result in more accurate search results at the cost of" + echo " computation time." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "knn main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obsm_features) + [ -n "$VIASH_PAR_INPUT_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--input_obsm_features\': \'$VIASH_PAR_INPUT_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obsm_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obsm_features=*) + [ -n "$VIASH_PAR_INPUT_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--input_obsm_features=*\': \'$VIASH_PAR_INPUT_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obsm_features) + [ -n "$VIASH_PAR_REFERENCE_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--reference_obsm_features\': \'$VIASH_PAR_REFERENCE_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obsm_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obsm_features=*) + [ -n "$VIASH_PAR_REFERENCE_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--reference_obsm_features=*\': \'$VIASH_PAR_REFERENCE_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_targets) + if [ -z "$VIASH_PAR_REFERENCE_OBS_TARGETS" ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS="$2" + else + VIASH_PAR_REFERENCE_OBS_TARGETS="$VIASH_PAR_REFERENCE_OBS_TARGETS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_targets. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_targets=*) + if [ -z "$VIASH_PAR_REFERENCE_OBS_TARGETS" ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_REFERENCE_OBS_TARGETS="$VIASH_PAR_REFERENCE_OBS_TARGETS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + else + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$VIASH_PAR_OUTPUT_OBS_PREDICTIONS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$VIASH_PAR_OUTPUT_OBS_PREDICTIONS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_obs_probability) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + else + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$VIASH_PAR_OUTPUT_OBS_PROBABILITY;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$VIASH_PAR_OUTPUT_OBS_PROBABILITY;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obsm_distances) + [ -n "$VIASH_PAR_INPUT_OBSM_DISTANCES" ] && ViashError Bad arguments for option \'--input_obsm_distances\': \'$VIASH_PAR_INPUT_OBSM_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_DISTANCES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obsm_distances. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obsm_distances=*) + [ -n "$VIASH_PAR_INPUT_OBSM_DISTANCES" ] && ViashError Bad arguments for option \'--input_obsm_distances=*\': \'$VIASH_PAR_INPUT_OBSM_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_DISTANCES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obsm_distances) + [ -n "$VIASH_PAR_REFERENCE_OBSM_DISTANCES" ] && ViashError Bad arguments for option \'--reference_obsm_distances\': \'$VIASH_PAR_REFERENCE_OBSM_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_DISTANCES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obsm_distances. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obsm_distances=*) + [ -n "$VIASH_PAR_REFERENCE_OBSM_DISTANCES" ] && ViashError Bad arguments for option \'--reference_obsm_distances=*\': \'$VIASH_PAR_REFERENCE_OBSM_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_DISTANCES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --weights) + [ -n "$VIASH_PAR_WEIGHTS" ] && ViashError Bad arguments for option \'--weights\': \'$VIASH_PAR_WEIGHTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WEIGHTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --weights. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --weights=*) + [ -n "$VIASH_PAR_WEIGHTS" ] && ViashError Bad arguments for option \'--weights=*\': \'$VIASH_PAR_WEIGHTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WEIGHTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_neighbors) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_neighbors=*) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors=*\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/labels_transfer/knn:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS="ann_level_1;ann_level_2;ann_level_3;ann_level_4;ann_level_5;ann_finest_level" +fi +if [ -z ${VIASH_PAR_WEIGHTS+x} ]; then + VIASH_PAR_WEIGHTS="uniform" +fi +if [ -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then + VIASH_PAR_N_NEIGHBORS="15" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_NEIGHBORS" ]]; then + if ! [[ "$VIASH_PAR_N_NEIGHBORS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_neighbors' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_N_NEIGHBORS -lt 5 ]]; then + ViashError '--n_neighbors' has be more than or equal to 5. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_WEIGHTS" ]; then + VIASH_PAR_WEIGHTS_CHOICES=("uniform;distance;gaussian") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_WEIGHTS_CHOICES[*]};" =~ ";$VIASH_PAR_WEIGHTS;" ]]; then + ViashError '--weights' specified value of \'$VIASH_PAR_WEIGHTS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-knn-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import mudata as mu +import numpy as np +import sys +from pynndescent import PyNNDescentTransformer +from sklearn.neighbors import KNeighborsClassifier + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obsm_features': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_FEATURES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obsm_features': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_FEATURES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_targets': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGETS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obsm_distances': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_DISTANCES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_DISTANCES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obsm_distances': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_DISTANCES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_DISTANCES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'weights': $( if [ ! -z ${VIASH_PAR_WEIGHTS+x} ]; then echo "r'${VIASH_PAR_WEIGHTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger + + +# END TEMPORARY WORKAROUND setup_logger + + +def distances_to_affinities(distances): + # Apply Gaussian kernel to distances + stds = np.std(distances, axis=1) + stds = (2.0 / stds) ** 2 + stds = stds.reshape(-1, 1) + distances_tilda = np.exp(-np.true_divide(distances, stds)) + + # normalize the distances_tilda + # if the sum of a row of the distances tilda equals 0, + # set normalized distances for that row to 1 + # else divide the row values by the value of the sum of the row + distances_tilda_normalized = np.where( + np.sum(distances_tilda, axis=1, keepdims=True) == 0, + 1, + distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True), + ) + return distances_tilda_normalized + + +logger = setup_logger() + +# Reading in data +logger.info( + f"Reading in query dataset {par['input']} and reference datasets {par['reference']}" +) +q_adata = mu.read_h5ad(par["input"], mod=par["modality"]) +r_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + +# check arguments +logger.info("Checking arguments") +par = check_arguments(par) + +if par["input_obsm_distances"] and par["reference_obsm_distances"]: + logger.info( + "Using pre-calculated distances for KNN classification as provided in \`--input_obsm_distances\` and \`--reference_obsm_distances\`." + ) + + assert par["input_obsm_distances"] in q_adata.obsm, ( + f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}." + ) + assert par["reference_obsm_distances"] in r_adata.obsm, ( + f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}." + ) + + query_neighbors = q_adata.obsm[par["input_obsm_distances"]] + reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]] + + if query_neighbors.shape[1] != reference_neighbors.shape[1]: + raise ValueError( + "The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset." + ) + + # Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors + # Otherwise reduce n_neighbors for KNN + smallest_neighbor_count = min( + np.diff(query_neighbors.indptr).min(), np.diff(reference_neighbors.indptr).min() + ) + if smallest_neighbor_count < par["n_neighbors"]: + logger.warning( + f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification" + ) + par["n_neighbors"] = smallest_neighbor_count + +elif par["input_obsm_distances"] or par["reference_obsm_distances"]: + raise ValueError( + "Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification." + ) + +elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]: + logger.info( + "No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm." + ) + # Generating training and inference data + train_X = get_reference_features(r_adata, par, logger) + inference_X = get_query_features(q_adata, par, logger) + + neighbors_transformer = PyNNDescentTransformer( + n_neighbors=par["n_neighbors"], + parallel_batch_queries=True, + ) + neighbors_transformer.fit(train_X) + + # Square sparse matrix with distances to n neighbors in reference data + query_neighbors = neighbors_transformer.transform(inference_X) + reference_neighbors = neighbors_transformer.transform(train_X) + +# For each target, train a classifier and predict labels +for obs_tar, obs_pred, obs_proba in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], +): + logger.info(f"Predicting labels for {obs_tar}") + + weights_dict = { + "uniform": "uniform", + "distance": "distance", + "gaussian": distances_to_affinities, + } + + logger.info(f"Using KNN classifier with {par['weights']} weights") + train_y = r_adata.obs[obs_tar].to_numpy() + classifier = KNeighborsClassifier( + n_neighbors=par["n_neighbors"], + metric="precomputed", + weights=weights_dict[par["weights"]], + ) + classifier.fit(X=reference_neighbors, y=train_y) + predicted_labels = classifier.predict(query_neighbors) + probabilities = classifier.predict_proba(query_neighbors).max(axis=1) + + # save_results + logger.info( + f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs" + ) + q_adata.obs[obs_pred] = predicted_labels + q_adata.obs[obs_proba] = probabilities + +logger.info(f"Saving output data to {par['output']}") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], q_adata, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/labels_transfer/knn/nextflow_labels.config b/target/executable/labels_transfer/knn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/labels_transfer/knn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/labels_transfer/xgboost/.config.vsh.yaml b/target/executable/labels_transfer/xgboost/.config.vsh.yaml new file mode 100644 index 00000000..03e8ef32 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/.config.vsh.yaml @@ -0,0 +1,651 @@ +name: "xgboost" +namespace: "labels_transfer" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The query data to transfer the labels to. Should be a .h5mu file." + info: + label: "Query" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's inference,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + required: false + description: "The embedding to use for the classifier's inference.\ + \ Override using the `--input_obsm_features` argument. If not provided,\ + \ the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g.\ + \ by the same model or preprocessing).\n" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to use." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's inference.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g. by the same\ + \ model or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train classifiers on." + info: + label: "Reference" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's training,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + description: "The embedding to use for the classifier's training.\ + \ Override using the `--reference_obsm_features` argument.\nMake\ + \ sure that embedding was obtained in the same way as the query\ + \ embedding (e.g. by the same model or preprocessing).\n" + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + description: "The target labels to transfer. Override using the `--reference_obs_targets`\ + \ argument." + required: true + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's training.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the query embedding (e.g. by the same model\ + \ or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_targets" + description: "The `.obs` key(s) of the target labels to tranfer." + info: null + default: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels transfered\ + \ from the reference." + info: + label: "Output data" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + obs: + - type: "string" + name: "predictions" + description: "The predicted labels. Override using the `--output_obs_predictions`\ + \ argument." + required: true + - type: "double" + name: "probability" + description: "The probability of the predicted labels. Override using\ + \ the `--output_obs_probability` argument." + required: false + obsm: + - type: "double" + name: "X_scvi" + description: "The embedding used for the classifier's inference. Could\ + \ have any name, specified by `input_obsm_features` argument.\"" + required: false + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Execution arguments" + arguments: + - type: "boolean_true" + name: "--force_retrain" + alternatives: + - "-f" + description: "Retrain models on the reference even if model_output directory already\ + \ has trained classifiers. WARNING! It will rewrite existing classifiers for\ + \ targets in the model_output directory!" + info: null + direction: "input" + - type: "boolean" + name: "--use_gpu" + description: "Use GPU during models training and inference (recommended)." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--verbosity" + alternatives: + - "-v" + description: "The verbosity level for evaluation of the classifier from the range\ + \ [0,2]" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_output" + description: "Output directory for model" + info: null + default: + - "model" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_uns_parameters" + description: "The key in `uns` slot of the output AnnData object to store the\ + \ parameters of the XGBoost classifier." + info: null + default: + - "xgboost_parameters" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "double" + name: "--learning_rate" + alternatives: + - "--eta" + description: "Step size shrinkage used in update to prevents overfitting. Range:\ + \ [0,1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_split_loss" + alternatives: + - "--gamma" + description: "Minimum loss reduction required to make a further partition on a\ + \ leaf node of the tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_depth" + alternatives: + - "-d" + description: "Maximum depth of a tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 6 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_child_weight" + description: "Minimum sum of instance weight (hessian) needed in a child. See\ + \ https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_delta_step" + description: "Maximum delta step we allow each leaf output to be. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--subsample" + description: "Subsample ratio of the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sampling_method" + description: "The method to use to sample the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "gradient_based" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bytree" + description: "Fraction of columns to be subsampled. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bylevel" + description: "Subsample ratio of columns for each level. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bynode" + description: "Subsample ratio of columns for each node (split). Range (0, 1].\ + \ See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--reg_lambda" + alternatives: + - "--lambda" + description: "L2 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--reg_alpha" + alternatives: + - "--alpha" + description: "L1 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scale_pos_weight" + description: "Control the balance of positive and negative weights, useful for\ + \ unbalanced classes. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs label transfer from reference to query using XGBoost classifier" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + method_id: "XGBClassifier" +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "apt" + packages: + - "libopenblas-dev" + - "liblapack-dev" + - "gfortran" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "xgboost~=2.1.3" + - "scikit-learn<1.6" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/labels_transfer/xgboost/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/labels_transfer/xgboost" + executable: "target/executable/labels_transfer/xgboost/xgboost" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/labels_transfer/xgboost/compress_h5mu.py b/target/executable/labels_transfer/xgboost/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/labels_transfer/xgboost/helper.py b/target/executable/labels_transfer/xgboost/helper.py new file mode 100644 index 00000000..06d5d3a3 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/helper.py @@ -0,0 +1,42 @@ +def check_arguments(par): + # check output .obs predictions + if not par["output_obs_predictions"]: + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + ) + + # check output .obs uncertainty + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + ) + + return par + + +def get_reference_features(adata_reference, par, logger): + if par["reference_obsm_features"] is None: + logger.info("Using .X of reference data") + train_data = adata_reference.X + else: + logger.info(f"Using .obsm[{par['reference_obsm_features']}] of reference data") + train_data = adata_reference.obsm[par["reference_obsm_features"]] + + return train_data + + +def get_query_features(adata, par, logger): + if par["input_obsm_features"] is None: + logger.info("Using .X of query data") + query_data = adata.X + else: + logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") + query_data = adata.obsm[par["input_obsm_features"]] + + return query_data diff --git a/target/executable/labels_transfer/xgboost/nextflow_labels.config b/target/executable/labels_transfer/xgboost/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/labels_transfer/xgboost/setup_logger.py b/target/executable/labels_transfer/xgboost/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/labels_transfer/xgboost/xgboost b/target/executable/labels_transfer/xgboost/xgboost new file mode 100755 index 00000000..4fe83c83 --- /dev/null +++ b/target/executable/labels_transfer/xgboost/xgboost @@ -0,0 +1,2279 @@ +#!/usr/bin/env bash + +# xgboost main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="xgboost" +VIASH_META_FUNCTIONALITY_NAME="xgboost" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libopenblas-dev liblapack-dev gfortran && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scanpy~=1.10.4" "xgboost~=2.1.3" "scikit-learn<1.6" + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component labels_transfer xgboost" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "xgboost main" + echo "" + echo "Performs label transfer from reference to query using XGBoost classifier" + echo "" + echo "Input dataset (query) arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " The query data to transfer the labels to. Should be a .h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality to use." + echo "" + echo " --input_obsm_features" + echo " type: string" + echo " example: X_scvi" + echo " The \`.obsm\` key of the embedding to use for the classifier's inference." + echo " If not provided, the \`.X\` slot will be used instead." + echo " Make sure that embedding was obtained in the same way as the reference" + echo " embedding (e.g. by the same model or preprocessing)." + echo "" + echo "Reference dataset arguments:" + echo " --reference" + echo " type: file, file must exist" + echo " example: reference.h5mu" + echo " The reference data to train classifiers on." + echo "" + echo " --reference_obsm_features" + echo " type: string" + echo " example: X_scvi" + echo " The \`.obsm\` key of the embedding to use for the classifier's training." + echo " If not provided, the \`.X\` slot will be used instead." + echo " Make sure that embedding was obtained in the same way as the query" + echo " embedding (e.g. by the same model or preprocessing)." + echo "" + echo " --reference_obs_targets" + echo " type: string, multiple values allowed" + echo " default:" + echo "ann_level_1;ann_level_2;ann_level_3;ann_level_4;ann_level_5;ann_finest_level" + echo " The \`.obs\` key(s) of the target labels to tranfer." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " The query data in .h5mu format with predicted labels transfered from the" + echo " reference." + echo "" + echo " --output_obs_predictions" + echo " type: string, multiple values allowed" + echo " In which \`.obs\` slots to store the predicted information." + echo " If provided, must have the same length as \`--reference_obs_targets\`." + echo " If empty, will default to the \`reference_obs_targets\` combined with the" + echo " \`\"_pred\"\` suffix." + echo "" + echo " --output_obs_probability" + echo " type: string, multiple values allowed" + echo " In which \`.obs\` slots to store the probability of the predictions." + echo " If provided, must have the same length as \`--reference_obs_targets\`." + echo " If empty, will default to the \`reference_obs_targets\` combined with the" + echo " \`\"_probability\"\` suffix." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Execution arguments:" + echo " -f, --force_retrain" + echo " type: boolean_true" + echo " Retrain models on the reference even if model_output directory already" + echo " has trained classifiers. WARNING! It will rewrite existing classifiers" + echo " for targets in the model_output directory!" + echo "" + echo " --use_gpu" + echo " type: boolean" + echo " default: false" + echo " Use GPU during models training and inference (recommended)." + echo "" + echo " -v, --verbosity" + echo " type: integer" + echo " default: 1" + echo " The verbosity level for evaluation of the classifier from the range" + echo " [0,2]" + echo "" + echo " --model_output" + echo " type: file, output, file must exist" + echo " default: model" + echo " Output directory for model" + echo "" + echo " --output_uns_parameters" + echo " type: string" + echo " default: xgboost_parameters" + echo " The key in \`uns\` slot of the output AnnData object to store the" + echo " parameters of the XGBoost classifier." + echo "" + echo "Learning parameters:" + echo " --eta, --learning_rate" + echo " type: double" + echo " default: 0.3" + echo " Step size shrinkage used in update to prevents overfitting. Range:" + echo " [0,1]. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --gamma, --min_split_loss" + echo " type: double" + echo " default: 0.0" + echo " Minimum loss reduction required to make a further partition on a leaf" + echo " node of the tree. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " -d, --max_depth" + echo " type: integer" + echo " default: 6" + echo " Maximum depth of a tree. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --min_child_weight" + echo " type: integer" + echo " default: 1" + echo " Minimum sum of instance weight (hessian) needed in a child. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --max_delta_step" + echo " type: double" + echo " default: 0.0" + echo " Maximum delta step we allow each leaf output to be. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --subsample" + echo " type: double" + echo " default: 1.0" + echo " Subsample ratio of the training instances. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --sampling_method" + echo " type: string" + echo " default: uniform" + echo " choices: [ uniform, gradient_based ]" + echo " The method to use to sample the training instances. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --colsample_bytree" + echo " type: double" + echo " default: 1.0" + echo " Fraction of columns to be subsampled. Range (0, 1]. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --colsample_bylevel" + echo " type: double" + echo " default: 1.0" + echo " Subsample ratio of columns for each level. Range (0, 1]. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --colsample_bynode" + echo " type: double" + echo " default: 1.0" + echo " Subsample ratio of columns for each node (split). Range (0, 1]. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --lambda, --reg_lambda" + echo " type: double" + echo " default: 1.0" + echo " L2 regularization term on weights. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --alpha, --reg_alpha" + echo " type: double" + echo " default: 0.0" + echo " L1 regularization term on weights. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo " --scale_pos_weight" + echo " type: double" + echo " default: 1.0" + echo " Control the balance of positive and negative weights, useful for" + echo " unbalanced classes. See" + echo " " + echo "https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster" + echo " for the reference" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "xgboost main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obsm_features) + [ -n "$VIASH_PAR_INPUT_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--input_obsm_features\': \'$VIASH_PAR_INPUT_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obsm_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obsm_features=*) + [ -n "$VIASH_PAR_INPUT_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--input_obsm_features=*\': \'$VIASH_PAR_INPUT_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obsm_features) + [ -n "$VIASH_PAR_REFERENCE_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--reference_obsm_features\': \'$VIASH_PAR_REFERENCE_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_FEATURES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obsm_features. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obsm_features=*) + [ -n "$VIASH_PAR_REFERENCE_OBSM_FEATURES" ] && ViashError Bad arguments for option \'--reference_obsm_features=*\': \'$VIASH_PAR_REFERENCE_OBSM_FEATURES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_OBSM_FEATURES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_obs_targets) + if [ -z "$VIASH_PAR_REFERENCE_OBS_TARGETS" ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS="$2" + else + VIASH_PAR_REFERENCE_OBS_TARGETS="$VIASH_PAR_REFERENCE_OBS_TARGETS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_obs_targets. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_obs_targets=*) + if [ -z "$VIASH_PAR_REFERENCE_OBS_TARGETS" ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_REFERENCE_OBS_TARGETS="$VIASH_PAR_REFERENCE_OBS_TARGETS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + else + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$VIASH_PAR_OUTPUT_OBS_PREDICTIONS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$VIASH_PAR_OUTPUT_OBS_PREDICTIONS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_obs_probability) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + else + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$VIASH_PAR_OUTPUT_OBS_PROBABILITY;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + if [ -z "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$VIASH_PAR_OUTPUT_OBS_PROBABILITY;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_retrain) + [ -n "$VIASH_PAR_FORCE_RETRAIN" ] && ViashError Bad arguments for option \'--force_retrain\': \'$VIASH_PAR_FORCE_RETRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_RETRAIN=true + shift 1 + ;; + -f) + [ -n "$VIASH_PAR_FORCE_RETRAIN" ] && ViashError Bad arguments for option \'-f\': \'$VIASH_PAR_FORCE_RETRAIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_RETRAIN=true + shift 1 + ;; + --use_gpu) + [ -n "$VIASH_PAR_USE_GPU" ] && ViashError Bad arguments for option \'--use_gpu\': \'$VIASH_PAR_USE_GPU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_GPU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --use_gpu. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --use_gpu=*) + [ -n "$VIASH_PAR_USE_GPU" ] && ViashError Bad arguments for option \'--use_gpu=*\': \'$VIASH_PAR_USE_GPU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_USE_GPU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --verbosity) + [ -n "$VIASH_PAR_VERBOSITY" ] && ViashError Bad arguments for option \'--verbosity\': \'$VIASH_PAR_VERBOSITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --verbosity. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --verbosity=*) + [ -n "$VIASH_PAR_VERBOSITY" ] && ViashError Bad arguments for option \'--verbosity=*\': \'$VIASH_PAR_VERBOSITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + -v) + [ -n "$VIASH_PAR_VERBOSITY" ] && ViashError Bad arguments for option \'-v\': \'$VIASH_PAR_VERBOSITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -v. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_output) + [ -n "$VIASH_PAR_MODEL_OUTPUT" ] && ViashError Bad arguments for option \'--model_output\': \'$VIASH_PAR_MODEL_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_output=*) + [ -n "$VIASH_PAR_MODEL_OUTPUT" ] && ViashError Bad arguments for option \'--model_output=*\': \'$VIASH_PAR_MODEL_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_uns_parameters) + [ -n "$VIASH_PAR_OUTPUT_UNS_PARAMETERS" ] && ViashError Bad arguments for option \'--output_uns_parameters\': \'$VIASH_PAR_OUTPUT_UNS_PARAMETERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_UNS_PARAMETERS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_uns_parameters. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_uns_parameters=*) + [ -n "$VIASH_PAR_OUTPUT_UNS_PARAMETERS" ] && ViashError Bad arguments for option \'--output_uns_parameters=*\': \'$VIASH_PAR_OUTPUT_UNS_PARAMETERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_UNS_PARAMETERS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --learning_rate) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --learning_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --learning_rate=*) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--learning_rate=*\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --eta) + [ -n "$VIASH_PAR_LEARNING_RATE" ] && ViashError Bad arguments for option \'--eta\': \'$VIASH_PAR_LEARNING_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LEARNING_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --eta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_split_loss) + [ -n "$VIASH_PAR_MIN_SPLIT_LOSS" ] && ViashError Bad arguments for option \'--min_split_loss\': \'$VIASH_PAR_MIN_SPLIT_LOSS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SPLIT_LOSS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_split_loss. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_split_loss=*) + [ -n "$VIASH_PAR_MIN_SPLIT_LOSS" ] && ViashError Bad arguments for option \'--min_split_loss=*\': \'$VIASH_PAR_MIN_SPLIT_LOSS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SPLIT_LOSS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gamma) + [ -n "$VIASH_PAR_MIN_SPLIT_LOSS" ] && ViashError Bad arguments for option \'--gamma\': \'$VIASH_PAR_MIN_SPLIT_LOSS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SPLIT_LOSS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gamma. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_depth) + [ -n "$VIASH_PAR_MAX_DEPTH" ] && ViashError Bad arguments for option \'--max_depth\': \'$VIASH_PAR_MAX_DEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DEPTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_depth. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_depth=*) + [ -n "$VIASH_PAR_MAX_DEPTH" ] && ViashError Bad arguments for option \'--max_depth=*\': \'$VIASH_PAR_MAX_DEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DEPTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + -d) + [ -n "$VIASH_PAR_MAX_DEPTH" ] && ViashError Bad arguments for option \'-d\': \'$VIASH_PAR_MAX_DEPTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DEPTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -d. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_child_weight) + [ -n "$VIASH_PAR_MIN_CHILD_WEIGHT" ] && ViashError Bad arguments for option \'--min_child_weight\': \'$VIASH_PAR_MIN_CHILD_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CHILD_WEIGHT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_child_weight. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_child_weight=*) + [ -n "$VIASH_PAR_MIN_CHILD_WEIGHT" ] && ViashError Bad arguments for option \'--min_child_weight=*\': \'$VIASH_PAR_MIN_CHILD_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CHILD_WEIGHT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_delta_step) + [ -n "$VIASH_PAR_MAX_DELTA_STEP" ] && ViashError Bad arguments for option \'--max_delta_step\': \'$VIASH_PAR_MAX_DELTA_STEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DELTA_STEP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_delta_step. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_delta_step=*) + [ -n "$VIASH_PAR_MAX_DELTA_STEP" ] && ViashError Bad arguments for option \'--max_delta_step=*\': \'$VIASH_PAR_MAX_DELTA_STEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_DELTA_STEP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --subsample) + [ -n "$VIASH_PAR_SUBSAMPLE" ] && ViashError Bad arguments for option \'--subsample\': \'$VIASH_PAR_SUBSAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSAMPLE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --subsample. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --subsample=*) + [ -n "$VIASH_PAR_SUBSAMPLE" ] && ViashError Bad arguments for option \'--subsample=*\': \'$VIASH_PAR_SUBSAMPLE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSAMPLE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sampling_method) + [ -n "$VIASH_PAR_SAMPLING_METHOD" ] && ViashError Bad arguments for option \'--sampling_method\': \'$VIASH_PAR_SAMPLING_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLING_METHOD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sampling_method. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sampling_method=*) + [ -n "$VIASH_PAR_SAMPLING_METHOD" ] && ViashError Bad arguments for option \'--sampling_method=*\': \'$VIASH_PAR_SAMPLING_METHOD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLING_METHOD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --colsample_bytree) + [ -n "$VIASH_PAR_COLSAMPLE_BYTREE" ] && ViashError Bad arguments for option \'--colsample_bytree\': \'$VIASH_PAR_COLSAMPLE_BYTREE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYTREE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --colsample_bytree. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --colsample_bytree=*) + [ -n "$VIASH_PAR_COLSAMPLE_BYTREE" ] && ViashError Bad arguments for option \'--colsample_bytree=*\': \'$VIASH_PAR_COLSAMPLE_BYTREE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYTREE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --colsample_bylevel) + [ -n "$VIASH_PAR_COLSAMPLE_BYLEVEL" ] && ViashError Bad arguments for option \'--colsample_bylevel\': \'$VIASH_PAR_COLSAMPLE_BYLEVEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYLEVEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --colsample_bylevel. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --colsample_bylevel=*) + [ -n "$VIASH_PAR_COLSAMPLE_BYLEVEL" ] && ViashError Bad arguments for option \'--colsample_bylevel=*\': \'$VIASH_PAR_COLSAMPLE_BYLEVEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYLEVEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --colsample_bynode) + [ -n "$VIASH_PAR_COLSAMPLE_BYNODE" ] && ViashError Bad arguments for option \'--colsample_bynode\': \'$VIASH_PAR_COLSAMPLE_BYNODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYNODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --colsample_bynode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --colsample_bynode=*) + [ -n "$VIASH_PAR_COLSAMPLE_BYNODE" ] && ViashError Bad arguments for option \'--colsample_bynode=*\': \'$VIASH_PAR_COLSAMPLE_BYNODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COLSAMPLE_BYNODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reg_lambda) + [ -n "$VIASH_PAR_REG_LAMBDA" ] && ViashError Bad arguments for option \'--reg_lambda\': \'$VIASH_PAR_REG_LAMBDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_LAMBDA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reg_lambda. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reg_lambda=*) + [ -n "$VIASH_PAR_REG_LAMBDA" ] && ViashError Bad arguments for option \'--reg_lambda=*\': \'$VIASH_PAR_REG_LAMBDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_LAMBDA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lambda) + [ -n "$VIASH_PAR_REG_LAMBDA" ] && ViashError Bad arguments for option \'--lambda\': \'$VIASH_PAR_REG_LAMBDA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_LAMBDA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lambda. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reg_alpha) + [ -n "$VIASH_PAR_REG_ALPHA" ] && ViashError Bad arguments for option \'--reg_alpha\': \'$VIASH_PAR_REG_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reg_alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reg_alpha=*) + [ -n "$VIASH_PAR_REG_ALPHA" ] && ViashError Bad arguments for option \'--reg_alpha=*\': \'$VIASH_PAR_REG_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_ALPHA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alpha) + [ -n "$VIASH_PAR_REG_ALPHA" ] && ViashError Bad arguments for option \'--alpha\': \'$VIASH_PAR_REG_ALPHA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REG_ALPHA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alpha. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scale_pos_weight) + [ -n "$VIASH_PAR_SCALE_POS_WEIGHT" ] && ViashError Bad arguments for option \'--scale_pos_weight\': \'$VIASH_PAR_SCALE_POS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_POS_WEIGHT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scale_pos_weight. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scale_pos_weight=*) + [ -n "$VIASH_PAR_SCALE_POS_WEIGHT" ] && ViashError Bad arguments for option \'--scale_pos_weight=*\': \'$VIASH_PAR_SCALE_POS_WEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_POS_WEIGHT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/labels_transfer/xgboost:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then + VIASH_PAR_REFERENCE_OBS_TARGETS="ann_level_1;ann_level_2;ann_level_3;ann_level_4;ann_level_5;ann_finest_level" +fi +if [ -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then + VIASH_PAR_FORCE_RETRAIN="false" +fi +if [ -z ${VIASH_PAR_USE_GPU+x} ]; then + VIASH_PAR_USE_GPU="false" +fi +if [ -z ${VIASH_PAR_VERBOSITY+x} ]; then + VIASH_PAR_VERBOSITY="1" +fi +if [ -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then + VIASH_PAR_MODEL_OUTPUT="model" +fi +if [ -z ${VIASH_PAR_OUTPUT_UNS_PARAMETERS+x} ]; then + VIASH_PAR_OUTPUT_UNS_PARAMETERS="xgboost_parameters" +fi +if [ -z ${VIASH_PAR_LEARNING_RATE+x} ]; then + VIASH_PAR_LEARNING_RATE="0.3" +fi +if [ -z ${VIASH_PAR_MIN_SPLIT_LOSS+x} ]; then + VIASH_PAR_MIN_SPLIT_LOSS="0.0" +fi +if [ -z ${VIASH_PAR_MAX_DEPTH+x} ]; then + VIASH_PAR_MAX_DEPTH="6" +fi +if [ -z ${VIASH_PAR_MIN_CHILD_WEIGHT+x} ]; then + VIASH_PAR_MIN_CHILD_WEIGHT="1" +fi +if [ -z ${VIASH_PAR_MAX_DELTA_STEP+x} ]; then + VIASH_PAR_MAX_DELTA_STEP="0.0" +fi +if [ -z ${VIASH_PAR_SUBSAMPLE+x} ]; then + VIASH_PAR_SUBSAMPLE="1.0" +fi +if [ -z ${VIASH_PAR_SAMPLING_METHOD+x} ]; then + VIASH_PAR_SAMPLING_METHOD="uniform" +fi +if [ -z ${VIASH_PAR_COLSAMPLE_BYTREE+x} ]; then + VIASH_PAR_COLSAMPLE_BYTREE="1.0" +fi +if [ -z ${VIASH_PAR_COLSAMPLE_BYLEVEL+x} ]; then + VIASH_PAR_COLSAMPLE_BYLEVEL="1.0" +fi +if [ -z ${VIASH_PAR_COLSAMPLE_BYNODE+x} ]; then + VIASH_PAR_COLSAMPLE_BYNODE="1.0" +fi +if [ -z ${VIASH_PAR_REG_LAMBDA+x} ]; then + VIASH_PAR_REG_LAMBDA="1.0" +fi +if [ -z ${VIASH_PAR_REG_ALPHA+x} ]; then + VIASH_PAR_REG_ALPHA="0.0" +fi +if [ -z ${VIASH_PAR_SCALE_POS_WEIGHT+x} ]; then + VIASH_PAR_SCALE_POS_WEIGHT="1.0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_FORCE_RETRAIN" ]]; then + if ! [[ "$VIASH_PAR_FORCE_RETRAIN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--force_retrain' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_USE_GPU" ]]; then + if ! [[ "$VIASH_PAR_USE_GPU" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--use_gpu' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VERBOSITY" ]]; then + if ! [[ "$VIASH_PAR_VERBOSITY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--verbosity' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LEARNING_RATE" ]]; then + if ! [[ "$VIASH_PAR_LEARNING_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--learning_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SPLIT_LOSS" ]]; then + if ! [[ "$VIASH_PAR_MIN_SPLIT_LOSS" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_split_loss' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_DEPTH" ]]; then + if ! [[ "$VIASH_PAR_MAX_DEPTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_depth' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CHILD_WEIGHT" ]]; then + if ! [[ "$VIASH_PAR_MIN_CHILD_WEIGHT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_child_weight' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_DELTA_STEP" ]]; then + if ! [[ "$VIASH_PAR_MAX_DELTA_STEP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--max_delta_step' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SUBSAMPLE" ]]; then + if ! [[ "$VIASH_PAR_SUBSAMPLE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--subsample' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COLSAMPLE_BYTREE" ]]; then + if ! [[ "$VIASH_PAR_COLSAMPLE_BYTREE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--colsample_bytree' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COLSAMPLE_BYLEVEL" ]]; then + if ! [[ "$VIASH_PAR_COLSAMPLE_BYLEVEL" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--colsample_bylevel' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COLSAMPLE_BYNODE" ]]; then + if ! [[ "$VIASH_PAR_COLSAMPLE_BYNODE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--colsample_bynode' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REG_LAMBDA" ]]; then + if ! [[ "$VIASH_PAR_REG_LAMBDA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--reg_lambda' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_REG_ALPHA" ]]; then + if ! [[ "$VIASH_PAR_REG_ALPHA" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--reg_alpha' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCALE_POS_WEIGHT" ]]; then + if ! [[ "$VIASH_PAR_SCALE_POS_WEIGHT" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--scale_pos_weight' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SAMPLING_METHOD" ]; then + VIASH_PAR_SAMPLING_METHOD_CHOICES=("uniform;gradient_based") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SAMPLING_METHOD_CHOICES[*]};" =~ ";$VIASH_PAR_SAMPLING_METHOD;" ]]; then + ViashError '--sampling_method' specified value of \'$VIASH_PAR_SAMPLING_METHOD\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_MODEL_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_MODEL_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_OUTPUT")" ) + VIASH_PAR_MODEL_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_MODEL_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-xgboost-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import json +import os +from typing import Optional +import yaml +from pathlib import Path + +import mudata +import numpy as np +import pandas as pd +import xgboost as xgb +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.preprocessing import LabelEncoder + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obsm_features': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_FEATURES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obsm_features': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_FEATURES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_obs_targets': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGETS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'force_retrain': $( if [ ! -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then echo "r'${VIASH_PAR_FORCE_RETRAIN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'use_gpu': $( if [ ! -z ${VIASH_PAR_USE_GPU+x} ]; then echo "r'${VIASH_PAR_USE_GPU//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'verbosity': $( if [ ! -z ${VIASH_PAR_VERBOSITY+x} ]; then echo "int(r'${VIASH_PAR_VERBOSITY//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'model_output': $( if [ ! -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then echo "r'${VIASH_PAR_MODEL_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_uns_parameters': $( if [ ! -z ${VIASH_PAR_OUTPUT_UNS_PARAMETERS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_UNS_PARAMETERS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_split_loss': $( if [ ! -z ${VIASH_PAR_MIN_SPLIT_LOSS+x} ]; then echo "float(r'${VIASH_PAR_MIN_SPLIT_LOSS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_depth': $( if [ ! -z ${VIASH_PAR_MAX_DEPTH+x} ]; then echo "int(r'${VIASH_PAR_MAX_DEPTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_child_weight': $( if [ ! -z ${VIASH_PAR_MIN_CHILD_WEIGHT+x} ]; then echo "int(r'${VIASH_PAR_MIN_CHILD_WEIGHT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_delta_step': $( if [ ! -z ${VIASH_PAR_MAX_DELTA_STEP+x} ]; then echo "float(r'${VIASH_PAR_MAX_DELTA_STEP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'subsample': $( if [ ! -z ${VIASH_PAR_SUBSAMPLE+x} ]; then echo "float(r'${VIASH_PAR_SUBSAMPLE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sampling_method': $( if [ ! -z ${VIASH_PAR_SAMPLING_METHOD+x} ]; then echo "r'${VIASH_PAR_SAMPLING_METHOD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'colsample_bytree': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYTREE+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYTREE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'colsample_bylevel': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYLEVEL+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYLEVEL//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'colsample_bynode': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYNODE+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYNODE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reg_lambda': $( if [ ! -z ${VIASH_PAR_REG_LAMBDA+x} ]; then echo "float(r'${VIASH_PAR_REG_LAMBDA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'reg_alpha': $( if [ ! -z ${VIASH_PAR_REG_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_REG_ALPHA//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scale_pos_weight': $( if [ ! -z ${VIASH_PAR_SCALE_POS_WEIGHT+x} ]; then echo "float(r'${VIASH_PAR_SCALE_POS_WEIGHT//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +# look for training params for method +argument_groups = {grp["name"]: grp["arguments"] for grp in config["argument_groups"]} +training_arg_names = [ + arg["name"].replace("--", "") for arg in argument_groups["Learning parameters"] +] +training_params = {arg_name: par[arg_name] for arg_name in training_arg_names} + + +def encode_labels(y): + labels_encoder = LabelEncoder() + labels_encoder.fit(y) + + return labels_encoder.transform(y), labels_encoder + + +def get_model_eval(xgb_model, X_test, y_test, labels_encoder): + preds = xgb_model.predict(X_test) + + cr = classification_report( + labels_encoder.inverse_transform(y_test), + labels_encoder.inverse_transform(preds), + output_dict=True, + ) + cr_df = pd.DataFrame(cr).transpose() + + return cr_df + + +def train_test_split_adata(adata, labels): + train_data = pd.DataFrame(data=adata.X, index=adata.obs_names) + + X_train, X_test, y_train, y_test = train_test_split( + train_data, labels, test_size=0.2, random_state=42, stratify=labels + ) + + return X_train, X_test, y_train, y_test + + +def train_xgb_model(X_train, y_train, gpu=True) -> xgb.XGBClassifier: + n_classes = len(np.unique(y_train)) + objective = "binary:logistic" if n_classes == 2 else "multi:softprob" + + tree_method = "gpu_hist" if gpu else "hist" + xgbc = xgb.XGBClassifier( + tree_method=tree_method, objective=objective, **training_params + ) + xgbc.fit(X_train, y_train) + + return xgbc + + +def build_classifier( + X, y, labels_encoder, label_key, eval_verbosity: Optional[int] = 1, gpu=True +) -> xgb.XGBClassifier: + # Adata prep + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, stratify=y + ) + # Note: Do we need a new train-test split for each classifier? + + # Model training + xgb_model = train_xgb_model(X_train, y_train, gpu=gpu) + + # Model eval + if eval_verbosity != 0: + cr_df = get_model_eval(xgb_model, X_test, y_test, labels_encoder) + + if eval_verbosity == 2: + print(cr_df) + + else: + overall_accuracy = cr_df["support"]["accuracy"] + low_prec_key = cr_df.precision.idxmin() + low_prec_val = cr_df.precision.min() + low_rec_key = cr_df.recall.idxmin() + low_rec_val = cr_df.recall.min() + low_f1_key = cr_df["f1-score"].idxmin() + low_f1_val = cr_df["f1-score"].min() + + print("") + print(f"Summary stats for {label_key} model:") + print(f"Overall accuracy: {overall_accuracy}") + print(f"Min. precision: {low_prec_key}: {low_prec_val}") + print(f"Min. Recall: {low_rec_key}: {low_rec_val}") + print(f"Min. F1-score: {low_f1_key}: {low_f1_val}") + print("") + + return xgb_model + + +def build_ref_classifiers( + adata_reference, + targets, + model_path, + eval_verbosity: Optional[int] = 1, + gpu: Optional[bool] = True, +) -> None: + """ + This function builds xgboost classifiers on a reference embedding for a designated number of + adata_reference.obs columns. Classifier .xgb files and a model_info.json file is written to the \`model_path\` + directory. Model evaluation is printed to stdout. + + Inputs: + * \`adata_reference\`: The AnnData object that was used to train the reference model + * \`model_path\`: The reference model directory where the classifiers will also be stored + * \`eval_verbosity\`: The verbosity level for evaluation of the classifier from the range [0;2]. + * \`gpu\`: Boolean indicating whether a gpu is available for classifier training + + + Example: + \`\`\` + >>> adata + AnnData object with n_obs x n_vars = 700 x 765 + obs: "ann_finest_level", "ann_level_1" + + >>> os.listdir("/path/to/model") + model_params.pt* + + >>> build_ref_classifiers(adata, "path/to/model", eval_verbosity=1, gpu=True) + >>> os.listdir("/path/to/model") + classifier_ann_finest_level.xgb* model_info.json* + classifier_ann_level_1.xgb* model_params.pt* + \`\`\` + """ + + # Check inputs + if not isinstance(eval_verbosity, int): + raise TypeError("\`eval_verbosity\` should be an integer between 0 and 2.") + + if eval_verbosity < 0 or eval_verbosity > 2: + raise ValueError("\`eval_verbosity\` should be an integer between 0 and 2.") + + train_data = get_reference_features(adata_reference, par, logger) + + if not os.path.exists(model_path): + os.makedirs(model_path, exist_ok=True) + + # Map from name of classifier to file names + classifiers = dict() + + for label, obs_pred in zip(targets, par["output_obs_predictions"]): + if label not in adata_reference.obs: + raise ValueError(f"{label} is not in the \`adata\` object passed!") + + filename = "classifier_" + label + ".xgb" + + labels, labels_encoder = encode_labels(adata_reference.obs[label]) + logger.info(f"Classes: {labels_encoder.classes_}") + + logger.info(f"Building classifier for {label}...") + xgb_model = build_classifier( + X=train_data, + y=labels, + labels_encoder=labels_encoder, + label_key=label, + eval_verbosity=eval_verbosity, + gpu=gpu, + ) + + # Save classifier + logger.info("Saving model") + xgb_model.save_model(os.path.join(model_path, filename)) + + # Store classifier info + classifiers[label] = { + "filename": filename, + "labels": labels_encoder.classes_.tolist(), + "obs_column": obs_pred, + "model_params": training_params, + } + + # Store model_info.json file + model_info = {"classifier_info": classifiers} + + logger.info("Writing model_info to the file") + # Read previous file if it exists + if os.path.exists(model_path + "/model_info.json"): + logger.info("Old model_info file found, updating") + with open(model_path + "/model_info.json", "r") as f: + old_model_info = json.loads(f.read()) + + for key in old_model_info: + if key in model_info: + old_model_info[key].update(model_info[key]) + json_string = json.dumps(old_model_info, indent=4) + + else: + logger.info("Creating a new file") + json_string = json.dumps(model_info, indent=4) + + with open(model_path + "/model_info.json", "w") as f: + f.write(json_string) + + +def project_labels( + query_dataset, + cell_type_classifier_model: xgb.XGBClassifier, + annotation_column_name="label_pred", + probability_column_name="label_probability", + probability_thresh=None, # Note: currently not passed to predict function +): + """ + A function that projects predicted labels onto the query dataset, along with probability estimations. + Performs in-place update of the adata object, adding columns to the \`obs\` DataFrame. + + Input: + * \`query_dataset\`: The query \`AnnData\` object + * \`model_file\`: Path to the classification model file + * \`prediction_key\`: Column name in \`adata.obs\` where to store the predicted labels + * \`probability_key\`: Column name in \`adata.obs\` where to store the labels probabilities + * \`probability_thresh\`: The probability threshold below which we call a cell 'Unknown' + + Output: + Nothing is output, the passed anndata is modified inplace + + """ + + if (probability_thresh is not None) and ( + probability_thresh < 0 or probability_thresh > 1 + ): + raise ValueError("\`probability_thresh\` must be \`None\` or between 0 and 1.") + + query_data = get_query_features(query_dataset, par, logger) + + # Predict labels and probabilities + query_dataset.obs[annotation_column_name] = cell_type_classifier_model.predict( + query_data + ) + + logger.info("Predicting probabilities") + probs = cell_type_classifier_model.predict_proba(query_data) + + # Format probabilities + df_probs = pd.DataFrame( + probs, + columns=cell_type_classifier_model.classes_, + index=query_dataset.obs_names, + ) + query_dataset.obs[probability_column_name] = df_probs.max(1) + + # Note: this is here in case we want to propose a set of values for the user to accept to seed the + # manual curation of predicted labels + if probability_thresh is not None: + logger.info("Marking uncertain predictions") + query_dataset.obs[annotation_column_name + "_filtered"] = [ + val + if query_dataset.obs[probability_column_name][i] >= probability_thresh + else "Unknown" + for i, val in enumerate(query_dataset.obs[annotation_column_name]) + ] + + return query_dataset + + +def predict( + query_dataset, + cell_type_classifier_model_path, + annotation_column_name: str, + prediction_column_name: str, + probability_column_name: str, + models_info, + use_gpu: bool = False, +) -> pd.DataFrame: + """ + Returns \`obs\` DataFrame with prediction columns appended + """ + + tree_method = "gpu_hist" if use_gpu else "hist" + + labels = models_info["classifier_info"][annotation_column_name]["labels"] + + objective = "binary:logistic" if len(labels) == 2 else "multi:softprob" + cell_type_classifier_model = xgb.XGBClassifier( + tree_method=tree_method, objective=objective + ) + + logger.info("Loading model") + cell_type_classifier_model.load_model(fname=cell_type_classifier_model_path) + + logger.info("Predicting labels") + project_labels( + query_dataset, + cell_type_classifier_model, + annotation_column_name=prediction_column_name, + probability_column_name=probability_column_name, + ) + + logger.info("Converting labels from numbers to classes") + labels_encoder = LabelEncoder() + labels_encoder.classes_ = np.array(labels) + query_dataset.obs[prediction_column_name] = labels_encoder.inverse_transform( + query_dataset.obs[prediction_column_name] + ) + + return query_dataset + + +def main(par): + logger.info("Checking arguments") + par = check_arguments(par) + + adata_query = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_reference = mudata.read_h5ad(par["reference"], mod=par["modality"]) + + # If classifiers for targets are in the model_output directory, simply open them and run (unless \`retrain\` != True) + # If some classifiers are missing, train and save them first + # Predict and save the query data + + targets_to_train = [] + + for obs_target in par["reference_obs_targets"]: + if ( + not os.path.exists(par["model_output"]) + or f"classifier_{obs_target}.xgb" not in os.listdir(par["model_output"]) + or par["force_retrain"] + ): + logger.info(f"Classifier for {obs_target} added to a training schedule") + targets_to_train.append(obs_target) + else: + logger.info(f"Found classifier for {obs_target}, no retraining required") + + build_ref_classifiers( + adata_reference, + targets_to_train, + model_path=par["model_output"], + gpu=par["use_gpu"], + eval_verbosity=par["verbosity"], + ) + + output_uns_parameters = adata_query.uns.get(par["output_uns_parameters"], {}) + + with open(par["model_output"] + "/model_info.json", "r") as f: + models_info = json.loads(f.read()) + + for obs_target, obs_pred, obs_unc in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], + ): + logger.info(f"Predicting {obs_target}") + + adata_query = predict( + query_dataset=adata_query, + cell_type_classifier_model_path=os.path.join( + par["model_output"], "classifier_" + obs_target + ".xgb" + ), + annotation_column_name=obs_target, + prediction_column_name=obs_pred, + probability_column_name=obs_unc, + models_info=models_info, + use_gpu=par["use_gpu"], + ) + + if obs_target in targets_to_train: + # Save information about the transfer to .uns + output_uns_parameters[obs_target] = { + "method": "XGBClassifier", + **training_params, + } + + adata_query.uns[par["output_uns_parameters"]] = output_uns_parameters + + logger.info("Writing output to %s", par["output"]) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata_query, None + ) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ]; then + VIASH_PAR_MODEL_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_OUTPUT" ] && [ ! -e "$VIASH_PAR_MODEL_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_MODEL_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/bd_rhapsody/.config.vsh.yaml b/target/executable/mapping/bd_rhapsody/.config.vsh.yaml new file mode 100644 index 00000000..62169422 --- /dev/null +++ b/target/executable/mapping/bd_rhapsody/.config.vsh.yaml @@ -0,0 +1,1174 @@ +name: "bd_rhapsody" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--reads" + description: "Reads (optional) - Path to your FASTQ.GZ formatted read files from\ + \ libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample\ + \ Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n" + info: + config_key: "Reads" + example: + - "WTALibrary_S1_L001_R1_001.fastq.gz" + - "WTALibrary_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reads_atac" + description: "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\n\ + You may specify as many R1/R2/I2 files as you want.\n" + info: + config_key: "Reads_ATAC" + example: + - "ATACLibrary_S2_L001_R1_001.fastq.gz" + - "ATACLibrary_S2_L001_R2_001.fastq.gz" + - "ATACLibrary_S2_L001_I2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "References" + description: "Assay type will be inferred from the provided reference(s).\nDo not\ + \ provide both reference_archive and targeted_reference at the same time.\n \n\ + Valid reference input combinations:\n - reference_archive: WTA only\n - reference_archive\ + \ & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference:\ + \ WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference:\ + \ WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n\ + \ - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n\ + \ - targeted_reference: Targeted only\n - targeted_reference & abseq_reference:\ + \ Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can\ + \ be generated with the bd_rhapsody_make_reference component.\nAlternatively,\ + \ BD also provides standard references which can be downloaded from these locations:\n\ + \n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n\ + \ - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n" + arguments: + - type: "file" + name: "--reference_archive" + description: "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure\ + \ of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level\ + \ folder\n - `star_index/`: sub-folder containing STAR index, that is files\ + \ created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation\ + \ e.g. \"gencode.v43.primary_assembly.annotation.gtf\"\n" + info: + config_key: "Reference_Archive" + example: + - "RhapRef_Human_WTA_2023-02.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targeted_reference" + description: "Path to the targeted reference file in FASTA format.\n" + info: + config_key: "Targeted_Reference" + example: + - "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abseq_reference" + description: "Path to the AbSeq reference file in FASTA format. Only needed if\ + \ BD AbSeq Ab-Oligos are used." + info: + config_key: "AbSeq_Reference" + example: + - "AbSeq_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--supplemental_reference" + alternatives: + - "-s" + description: "Path to the supplemental reference file in FASTA format. Only needed\ + \ if there are additional transgene sequences to be aligned against in a WTA\ + \ assay experiment." + info: + config_key: "Supplemental_Reference" + example: + - "supplemental_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + description: "Outputs for all pipeline runs" + arguments: + - type: "file" + name: "--output_dir" + alternatives: + - "-o" + description: "The unprocessed output directory containing all the outputs from\ + \ the pipeline." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_seurat" + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing\ + \ RSEC molecules data table and all cell annotation metadata." + info: + template: "sample_Seurat.rds" + example: + - "output_seurat.rds" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_mudata" + info: + template: "sample.h5mu" + example: + - "output_mudata.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--metrics_summary" + description: "Metrics Summary. Report containing sequencing, molecules, and cell\ + \ metrics." + info: + template: "sample_Metrics_Summary.csv" + example: + - "metrics_summary.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--pipeline_report" + description: "Pipeline Report. Summary report containing the results from the\ + \ sequencing analysis pipeline run." + info: + template: "sample_Pipeline_Report.html" + example: + - "pipeline_report.html" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--rsec_mols_per_cell" + description: "Molecules per bioproduct per cell bassed on RSEC" + info: + template: "sample_RSEC_MolsPerCell_MEX.zip" + example: + - "RSEC_MolsPerCell_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dbec_mols_per_cell" + description: "Molecules per bioproduct per cell bassed on DBEC. DBEC data table\ + \ is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + info: + template: "sample_DBEC_MolsPerCell_MEX.zip" + example: + - "DBEC_MolsPerCell_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--rsec_mols_per_cell_unfiltered" + description: "Unfiltered tables containing all cell labels with 10 reads." + info: + template: "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip" + example: + - "RSEC_MolsPerCell_Unfiltered_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + info: + template: "sample_Bioproduct.bam" + example: + - "BioProduct.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_index" + description: "Index file for the alignment file." + info: + template: "sample_Bioproduct.bam.bai" + example: + - "BioProduct.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bioproduct_stats" + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier\ + \ adjustment algorithms on a per-bioproduct basis." + info: + template: "sample_Bioproduct_Stats.csv" + example: + - "Bioproduct_Stats.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dimred_tsne" + description: "t-SNE dimensionality reduction coordinates per cell index" + info: + template: "sample_assay_tSNE_coordinates.csv" + example: + - "tSNE_coordinates.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dimred_umap" + description: "UMAP dimensionality reduction coordinates per cell index" + info: + template: "sample_assay_UMAP_coordinates.csv" + example: + - "UMAP_coordinates.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--immune_cell_classification" + description: "Immune Cell Classification. Cell type classification based on the\ + \ expression of immune cell markers." + info: + template: "sample_assay_cell_type_experimental.csv" + example: + - "Immune_Cell_Classification.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Multiplex outputs" + description: "Outputs when multiplex option is selected" + arguments: + - type: "file" + name: "--sample_tag_metrics" + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + info: + template: "sample_Sample_Tag_Metrics.csv" + example: + - "Sample_Tag_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_tag_calls" + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + info: + template: "sample_Sample_Tag_Calls.csv" + example: + - "Sample_Tag_Calls.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_tag_counts" + description: "Sample Tag Counts. Separate data tables and metric summary for cells\ + \ assigned to each sample tag. Note: For putative cells that could not be assigned\ + \ a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." + info: + template: "sample_Sample_Tag.zip" + example: + - "Sample_Tag1.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sample_tag_counts_unassigned" + description: "Sample Tag Counts Unassigned. Data table and metric summary for\ + \ cells that could not be assigned a specific Sample Tag." + info: + template: "sample_Multiplet_and_Undetermined.zip" + example: + - "Multiplet_and_Undetermined.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "VDJ Outputs" + description: "Outputs when VDJ option selected" + arguments: + - type: "file" + name: "--vdj_metrics" + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + info: + template: "sample_VDJ_Metrics.csv" + example: + - "VDJ_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_per_cell" + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments,\ + \ CDR3 sequences, paired chains, and cell type." + info: + template: "sample_VDJ_perCell.csv" + example: + - "VDJ_perCell.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_per_cell_uncorrected" + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts,\ + \ VDJ gene segments, CDR3 sequences, paired chains, and cell type." + info: + template: "sample_VDJ_perCell_uncorrected.csv" + example: + - "VDJ_perCell_uncorrected.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_dominant_contigs" + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain\ + \ type combination (putative cells only)." + info: + template: "sample_VDJ_Dominant_Contigs_AIRR.csv" + example: + - "VDJ_Dominant_Contigs_AIRR.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_unfiltered_contigs" + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated\ + \ successfully (all cells)." + info: + template: "sample_VDJ_Unfiltered_Contigs_AIRR.csv" + example: + - "VDJ_Unfiltered_Contigs_AIRR.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "ATAC-Seq outputs" + description: "Outputs when ATAC-Seq option selected" + arguments: + - type: "file" + name: "--atac_metrics" + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + info: + template: "sample_ATAC_Metrics.csv" + example: + - "ATAC_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_metrics_json" + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in\ + \ JSON format." + info: + template: "sample_ATAC_Metrics.json" + example: + - "ATAC_Metrics.json" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_fragments" + description: "ATAC Fragments. Chromosomal location, cell index, and read support\ + \ for each fragment detected" + info: + template: "sample_ATAC_Fragments.bed.gz" + example: + - "ATAC_Fragments.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_fragments_index" + description: "Index of ATAC Fragments." + info: + template: "sample_ATAC_Fragments.bed.gz.tbi" + example: + - "ATAC_Fragments.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_transposase_sites" + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read\ + \ support for each transposase site detected" + info: + template: "sample_ATAC_Transposase_Sites.bed.gz" + example: + - "ATAC_Transposase_Sites.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_transposase_sites_index" + description: "Index of ATAC Transposase Sites." + info: + template: "sample_ATAC_Transposase_Sites.bed.gz.tbi" + example: + - "ATAC_Transposase_Sites.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peaks" + description: "ATAC Peaks. Peak regions of transposase activity" + info: + template: "sample_ATAC_Peaks.bed.gz" + example: + - "ATAC_Peaks.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peaks_index" + description: "Index of ATAC Peaks." + info: + template: "sample_ATAC_Peaks.bed.gz.tbi" + example: + - "ATAC_Peaks.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peak_annotation" + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + info: + template: "sample_peak_annotation.tsv.gz" + example: + - "peak_annotation.tsv.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_cell_by_peak" + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + info: + template: "sample_ATAC_Cell_by_Peak_MEX.zip" + example: + - "ATAC_Cell_by_Peak_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_cell_by_peak_unfiltered" + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell\ + \ labels with >=1 transposase sites in peaks." + info: + template: "sample_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + example: + - "ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_bam" + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations\ + \ for ATAC-Seq. Only output if the BAM generation flag is set to true." + info: + template: "sample_ATAC.bam" + example: + - "ATAC.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_bam_index" + description: "Index of ATAC BAM." + info: + template: "sample_ATAC.bam.bai" + example: + - "ATAC.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "AbSeq Cell Calling outputs" + description: "Outputs when Cell Calling Abseq is selected" + arguments: + - type: "file" + name: "--protein_aggregates_experimental" + description: "Protein Aggregates Experimental" + info: + template: "sample_Protein_Aggregates_Experimental.csv" + example: + - "Protein_Aggregates_Experimental.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Putative Cell Calling Settings" + arguments: + - type: "string" + name: "--cell_calling_data" + description: "Specify the dataset to be used for putative cell calling: mRNA,\ + \ AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset,\ + \ please provide an AbSeq_Reference fasta file above.\n \nFor putative cell\ + \ calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive\ + \ file above.\n \nThe default data for putative cell calling, will be determined\ + \ the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n\ + - If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n" + info: + config_key: "Cell_Calling_Data" + example: + - "mRNA" + required: false + choices: + - "mRNA" + - "AbSeq" + - "ATAC" + - "mRNA_and_ATAC" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_bioproduct_algorithm" + description: "Specify the bioproduct algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_Bioproduct_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_atac_algorithm" + description: "Specify the ATAC-seq algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_ATAC_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--exact_cell_count" + description: "Set a specific number of cells as putative, based on those with\ + \ the highest error-corrected read count\n" + info: + config_key: "Exact_Cell_Count" + example: + - 10000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--expected_cell_count" + description: "Guide the basic putative cell calling algorithm by providing an\ + \ estimate of the number of cells expected. Usually this can be the number\ + \ of cells loaded into the Rhapsody cartridge. If there are multiple inflection\ + \ points on the second derivative cumulative curve, this will ensure the one\ + \ selected is near the expected. \n" + info: + config_key: "Expected_Cell_Count" + example: + - 20000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Intronic Reads Settings" + arguments: + - type: "boolean" + name: "--exclude_intronic_reads" + description: "By default, the flag is false, and reads aligned to exons and introns\ + \ are considered and represented in molecule counts. When the flag is set to\ + \ true, intronic reads will be excluded.\nThe value can be true or false.\n" + info: + config_key: "Exclude_Intronic_Reads" + example: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Multiplex Settings" + arguments: + - type: "string" + name: "--sample_tags_version" + description: "Specify the version of the Sample Tags used in the run:\n\n* If\ + \ Sample Tag Multiplexing was done, specify the appropriate version: human,\ + \ mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK +\ + \ Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not\ + \ an SMK + ATAC-Seq only run), choose the \"nuclei_includes_mrna\" option.\n\ + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)),\ + \ choose the \"nuclei_atac_only\" option.\n" + info: + config_key: "Sample_Tags_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "flex" + - "nuclei_includes_mrna" + - "nuclei_atac_only" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_names" + description: "Specify the tag number followed by '-' and the desired sample name\ + \ to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n" + info: + config_key: "Tag_Names" + example: + - "4-mySample" + - "9-myOtherSample" + - "6-alsoThisSample" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "VDJ arguments" + arguments: + - type: "string" + name: "--vdj_version" + description: "If VDJ was done, specify the appropriate option: human, mouse, humanBCR,\ + \ humanTCR, mouseBCR, mouseTCR\n" + info: + config_key: "VDJ_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "humanBCR" + - "humanTCR" + - "mouseBCR" + - "mouseTCR" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "ATAC options" + arguments: + - type: "file" + name: "--predefined_atac_peaks" + description: "An optional BED file containing pre-established chromatin accessibility\ + \ peak regions for generating the ATAC cell-by-peak matrix." + info: + config_key: "Predefined_ATAC_Peaks" + example: + - "predefined_peaks.bed" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Additional options" + arguments: + - type: "string" + name: "--run_name" + description: "Specify a run name to use as the output file base name. Use only\ + \ letters, numbers, or hyphens. Do not use special characters or spaces.\n" + info: + config_key: "Run_Name" + default: + - "sample" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Specify whether to create the BAM file output\n" + info: + config_key: "Generate_Bam" + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--long_reads" + description: "Use STARlong (default: undefined - i.e. autodetects based on read\ + \ lengths) - Specify if the STARlong aligner should be used instead of STAR.\ + \ Set to true if the reads are longer than 650bp.\n" + info: + config_key: "Long_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Advanced options" + description: "NOTE: Only change these if you are really sure about what you are\ + \ doing\n" + arguments: + - type: "string" + name: "--custom_star_params" + description: "Modify STAR alignment parameters - Set this parameter to fully override\ + \ default STAR mapping parameters used in the pipeline.\nFor reference this\ + \ is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread\ + \ 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq\ + \ AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin\ + \ 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax\ + \ 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT\ + \ set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`,\ + \ `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n" + info: + config_key: "Custom_STAR_Params" + example: + - "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed\ + \ 2000000" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--custom_bwa_mem2_params" + description: "Modify bwa-mem2 alignment parameters - Set this parameter to fully\ + \ override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does\ + \ not specify any custom mapping params to bwa-mem2 so program default values\ + \ are used\nThis applies to fastqs provided in the Reads_ATAC user input \n\ + Do NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2\ + \ version 2.2.1\n" + info: + config_key: "Custom_bwa_mem2_Params" + example: + - "-k 16 -w 200 -r" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "CWL-runner arguments" + arguments: + - type: "boolean" + name: "--parallel" + description: "Run jobs in parallel." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--timestamps" + description: "Add timestamps to the errors, warnings, and notifications." + info: null + direction: "input" +- name: "Undocumented arguments" + arguments: + - type: "integer" + name: "--abseq_umi" + info: + config_key: "AbSeq_UMI" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--target_analysis" + info: + config_key: "Target_analysis" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_jgene_evalue" + description: "e-value threshold for J gene. The e-value threshold for J gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_JGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_vgene_evalue" + description: "e-value threshold for V gene. The e-value threshold for V gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_VGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--write_filtered_reads" + description: "Output processed FASTQ reads." + info: + config_key: "Write_Filtered_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "rhapsody_pipeline_2.2.1_nodocker.cwl" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n \nThis pipeline\ + \ performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\n\ + sequencing libraries are those generated by the BD Rhapsody assay kits, including:\ + \ Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell\ + \ Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning\ + \ 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement'\ + \ from the YAML.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "rhapsody_cell_label.py" +- type: "file" + path: "reference.fa.gz" +- type: "file" + path: "reference.gtf.gz" +- type: "file" + path: "reference_bd_rhapsody.tar.gz" +- type: "file" + path: "raw" +info: + keywords: + - "rna-seq" + - "single-cell" + - "multiomic" + - "atac-seq" + - "targeted" + - "abseq" + - "tcr" + - "bcr" + links: + repository: "https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1" + documentation: "https://bd-rhapsody-bioinfo-docs.genomics.bd.com" + license: "Unknown" +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bdgenomics/rhapsody:2.2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "cwlref-runner" + - "cwl-runner" + - "ruamel.yaml" + - "biopython" + - "gffutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/bd_rhapsody/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/bd_rhapsody" + executable: "target/executable/mapping/bd_rhapsody/bd_rhapsody" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/bd_rhapsody/bd_rhapsody b/target/executable/mapping/bd_rhapsody/bd_rhapsody new file mode 100755 index 00000000..1f3fb7a0 --- /dev/null +++ b/target/executable/mapping/bd_rhapsody/bd_rhapsody @@ -0,0 +1,3518 @@ +#!/usr/bin/env bash + +# bd_rhapsody main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bd_rhapsody" +VIASH_META_FUNCTIONALITY_NAME="bd_rhapsody" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM bdgenomics/rhapsody:2.2.1 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "cwlref-runner" "cwl-runner" "ruamel.yaml" "biopython" "gffutils" + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component mapping bd_rhapsody" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bd_rhapsody main" + echo "" + echo "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1" + echo "This pipeline performs analysis of single-cell multiomic sequence read (FASTQ)" + echo "data. The supported" + echo "sequencing libraries are those generated by the BD Rhapsody assay kits," + echo "including: Whole Transcriptome" + echo "mRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing," + echo "TCR/BCR, and" + echo "ATAC-Seq" + echo "" + echo "The CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl'" + echo "and removing all objects with class 'DockerRequirement' from the YAML." + echo "" + echo "Inputs:" + echo " --reads" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "WTALibrary_S1_L001_R1_001.fastq.gz;WTALibrary_S1_L001_R2_001.fastq.gz" + echo " Reads (optional) - Path to your FASTQ.GZ formatted read files from" + echo " libraries that may include:" + echo " - WTA mRNA" + echo " - Targeted mRNA" + echo " - AbSeq" + echo " - Sample Multiplexing" + echo " - VDJ" + echo " You may specify as many R1/R2 read pairs as you want." + echo "" + echo " --reads_atac" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "ATACLibrary_S2_L001_R1_001.fastq.gz;ATACLibrary_S2_L001_R2_001.fastq.gz;ATACLibrary_S2_L001_I2_001.fastq.gz" + echo " Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries." + echo " You may specify as many R1/R2/I2 files as you want." + echo "" + echo "References:" + echo " Assay type will be inferred from the provided reference(s)." + echo " Do not provide both reference_archive and targeted_reference at the same" + echo " time." + echo " Valid reference input combinations:" + echo " - reference_archive: WTA only" + echo " - reference_archive & abseq_reference: WTA + AbSeq" + echo " - reference_archive & supplemental_reference: WTA + extra transgenes" + echo " - reference_archive & abseq_reference & supplemental_reference: WTA +" + echo " AbSeq + extra transgenes" + echo " - reference_archive: WTA + ATAC or ATAC only" + echo " - reference_archive & supplemental_reference: WTA + ATAC + extra" + echo " transgenes" + echo " - targeted_reference: Targeted only" + echo " - targeted_reference & abseq_reference: Targeted + AbSeq" + echo " - abseq_reference: AbSeq only" + echo " The reference_archive can be generated with the bd_rhapsody_make_reference" + echo " component." + echo " Alternatively, BD also provides standard references which can be downloaded" + echo " from these locations:" + echo " - Human:" + echo " " + echo "https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz" + echo " - Mouse:" + echo " " + echo "https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz" + echo "" + echo " --reference_archive" + echo " type: file, file must exist" + echo " example: RhapRef_Human_WTA_2023-02.tar.gz" + echo " Path to Rhapsody WTA Reference in the tar.gz format." + echo " Structure of the reference archive:" + echo " - \`BD_Rhapsody_Reference_Files/\`: top level folder" + echo " - \`star_index/\`: sub-folder containing STAR index, that is files" + echo " created with \`STAR --runMode genomeGenerate\`" + echo " - GTF for gene-transcript-annotation e.g." + echo " \"gencode.v43.primary_assembly.annotation.gtf\"" + echo "" + echo " --targeted_reference" + echo " type: file, multiple values allowed, file must exist" + echo " example: BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + echo " Path to the targeted reference file in FASTA format." + echo "" + echo " --abseq_reference" + echo " type: file, multiple values allowed, file must exist" + echo " example: AbSeq_reference.fasta" + echo " Path to the AbSeq reference file in FASTA format. Only needed if BD" + echo " AbSeq Ab-Oligos are used." + echo "" + echo " -s, --supplemental_reference" + echo " type: file, multiple values allowed, file must exist" + echo " example: supplemental_reference.fasta" + echo " Path to the supplemental reference file in FASTA format. Only needed if" + echo " there are additional transgene sequences to be aligned against in a WTA" + echo " assay experiment." + echo "" + echo "Outputs:" + echo " Outputs for all pipeline runs" + echo "" + echo " -o, --output_dir" + echo " type: file, required parameter, output, file must exist" + echo " example: output_dir" + echo " The unprocessed output directory containing all the outputs from the" + echo " pipeline." + echo "" + echo " --output_seurat" + echo " type: file, output, file must exist" + echo " example: output_seurat.rds" + echo " Single-cell analysis tool inputs. Seurat (.rds) input file containing" + echo " RSEC molecules data table and all cell annotation metadata." + echo "" + echo " --output_mudata" + echo " type: file, output, file must exist" + echo " example: output_mudata.h5mu" + echo "" + echo " --metrics_summary" + echo " type: file, output, file must exist" + echo " example: metrics_summary.csv" + echo " Metrics Summary. Report containing sequencing, molecules, and cell" + echo " metrics." + echo "" + echo " --pipeline_report" + echo " type: file, output, file must exist" + echo " example: pipeline_report.html" + echo " Pipeline Report. Summary report containing the results from the" + echo " sequencing analysis pipeline run." + echo "" + echo " --rsec_mols_per_cell" + echo " type: file, output, file must exist" + echo " example: RSEC_MolsPerCell_MEX.zip" + echo " Molecules per bioproduct per cell bassed on RSEC" + echo "" + echo " --dbec_mols_per_cell" + echo " type: file, output, file must exist" + echo " example: DBEC_MolsPerCell_MEX.zip" + echo " Molecules per bioproduct per cell bassed on DBEC. DBEC data table is" + echo " only output if the experiment includes targeted mRNA or AbSeq" + echo " bioproducts." + echo "" + echo " --rsec_mols_per_cell_unfiltered" + echo " type: file, output, file must exist" + echo " example: RSEC_MolsPerCell_Unfiltered_MEX.zip" + echo " Unfiltered tables containing all cell labels with 10 reads." + echo "" + echo " --bam" + echo " type: file, output, file must exist" + echo " example: BioProduct.bam" + echo " Alignment file of R2 with associated R1 annotations for Bioproduct." + echo "" + echo " --bam_index" + echo " type: file, output, file must exist" + echo " example: BioProduct.bam.bai" + echo " Index file for the alignment file." + echo "" + echo " --bioproduct_stats" + echo " type: file, output, file must exist" + echo " example: Bioproduct_Stats.csv" + echo " Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier" + echo " adjustment algorithms on a per-bioproduct basis." + echo "" + echo " --dimred_tsne" + echo " type: file, output, file must exist" + echo " example: tSNE_coordinates.csv" + echo " t-SNE dimensionality reduction coordinates per cell index" + echo "" + echo " --dimred_umap" + echo " type: file, output, file must exist" + echo " example: UMAP_coordinates.csv" + echo " UMAP dimensionality reduction coordinates per cell index" + echo "" + echo " --immune_cell_classification" + echo " type: file, output, file must exist" + echo " example: Immune_Cell_Classification.csv" + echo " Immune Cell Classification. Cell type classification based on the" + echo " expression of immune cell markers." + echo "" + echo "Multiplex outputs:" + echo " Outputs when multiplex option is selected" + echo "" + echo " --sample_tag_metrics" + echo " type: file, output, file must exist" + echo " example: Sample_Tag_Metrics.csv" + echo " Sample Tag Metrics. Metrics from the sample determination algorithm." + echo "" + echo " --sample_tag_calls" + echo " type: file, output, file must exist" + echo " example: Sample_Tag_Calls.csv" + echo " Sample Tag Calls. Assigned Sample Tag for each putative cell" + echo "" + echo " --sample_tag_counts" + echo " type: file, multiple values allowed, output, file must exist" + echo " example: Sample_Tag1.zip" + echo " Sample Tag Counts. Separate data tables and metric summary for cells" + echo " assigned to each sample tag. Note: For putative cells that could not be" + echo " assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is" + echo " also output." + echo "" + echo " --sample_tag_counts_unassigned" + echo " type: file, output, file must exist" + echo " example: Multiplet_and_Undetermined.zip" + echo " Sample Tag Counts Unassigned. Data table and metric summary for cells" + echo " that could not be assigned a specific Sample Tag." + echo "" + echo "VDJ Outputs:" + echo " Outputs when VDJ option selected" + echo "" + echo " --vdj_metrics" + echo " type: file, output, file must exist" + echo " example: VDJ_Metrics.csv" + echo " VDJ Metrics. Overall metrics from the VDJ analysis." + echo "" + echo " --vdj_per_cell" + echo " type: file, output, file must exist" + echo " example: VDJ_perCell.csv" + echo " VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments," + echo " CDR3 sequences, paired chains, and cell type." + echo "" + echo " --vdj_per_cell_uncorrected" + echo " type: file, output, file must exist" + echo " example: VDJ_perCell_uncorrected.csv" + echo " VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ" + echo " gene segments, CDR3 sequences, paired chains, and cell type." + echo "" + echo " --vdj_dominant_contigs" + echo " type: file, output, file must exist" + echo " example: VDJ_Dominant_Contigs_AIRR.csv" + echo " VDJ Dominant Contigs. Dominant contig for each cell label chain type" + echo " combination (putative cells only)." + echo "" + echo " --vdj_unfiltered_contigs" + echo " type: file, output, file must exist" + echo " example: VDJ_Unfiltered_Contigs_AIRR.csv" + echo " VDJ Unfiltered Contigs. All contigs that were assembled and annotated" + echo " successfully (all cells)." + echo "" + echo "ATAC-Seq outputs:" + echo " Outputs when ATAC-Seq option selected" + echo "" + echo " --atac_metrics" + echo " type: file, output, file must exist" + echo " example: ATAC_Metrics.csv" + echo " ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + echo "" + echo " --atac_metrics_json" + echo " type: file, output, file must exist" + echo " example: ATAC_Metrics.json" + echo " ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON" + echo " format." + echo "" + echo " --atac_fragments" + echo " type: file, output, file must exist" + echo " example: ATAC_Fragments.bed.gz" + echo " ATAC Fragments. Chromosomal location, cell index, and read support for" + echo " each fragment detected" + echo "" + echo " --atac_fragments_index" + echo " type: file, output, file must exist" + echo " example: ATAC_Fragments.bed.gz.tbi" + echo " Index of ATAC Fragments." + echo "" + echo " --atac_transposase_sites" + echo " type: file, output, file must exist" + echo " example: ATAC_Transposase_Sites.bed.gz" + echo " ATAC Transposase Sites. Chromosomal location, cell index, and read" + echo " support for each transposase site detected" + echo "" + echo " --atac_transposase_sites_index" + echo " type: file, output, file must exist" + echo " example: ATAC_Transposase_Sites.bed.gz.tbi" + echo " Index of ATAC Transposase Sites." + echo "" + echo " --atac_peaks" + echo " type: file, output, file must exist" + echo " example: ATAC_Peaks.bed.gz" + echo " ATAC Peaks. Peak regions of transposase activity" + echo "" + echo " --atac_peaks_index" + echo " type: file, output, file must exist" + echo " example: ATAC_Peaks.bed.gz.tbi" + echo " Index of ATAC Peaks." + echo "" + echo " --atac_peak_annotation" + echo " type: file, output, file must exist" + echo " example: peak_annotation.tsv.gz" + echo " ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + echo "" + echo " --atac_cell_by_peak" + echo " type: file, output, file must exist" + echo " example: ATAC_Cell_by_Peak_MEX.zip" + echo " ATAC Cell by Peak. Peak regions of transposase activity per cell" + echo "" + echo " --atac_cell_by_peak_unfiltered" + echo " type: file, output, file must exist" + echo " example: ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + echo " ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels" + echo " with >=1 transposase sites in peaks." + echo "" + echo " --atac_bam" + echo " type: file, output, file must exist" + echo " example: ATAC.bam" + echo " ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations" + echo " for ATAC-Seq. Only output if the BAM generation flag is set to true." + echo "" + echo " --atac_bam_index" + echo " type: file, output, file must exist" + echo " example: ATAC.bam.bai" + echo " Index of ATAC BAM." + echo "" + echo "AbSeq Cell Calling outputs:" + echo " Outputs when Cell Calling Abseq is selected" + echo "" + echo " --protein_aggregates_experimental" + echo " type: file, output, file must exist" + echo " example: Protein_Aggregates_Experimental.csv" + echo " Protein Aggregates Experimental" + echo "" + echo "Putative Cell Calling Settings:" + echo " --cell_calling_data" + echo " type: string" + echo " example: mRNA" + echo " choices: [ mRNA, AbSeq, ATAC, mRNA_and_ATAC ]" + echo " Specify the dataset to be used for putative cell calling: mRNA, AbSeq," + echo " ATAC, mRNA_and_ATAC" + echo " For putative cell calling using an AbSeq dataset, please provide an" + echo " AbSeq_Reference fasta file above." + echo " For putative cell calling using an ATAC dataset, please provide a" + echo " WTA+ATAC-Seq Reference_Archive file above." + echo " The default data for putative cell calling, will be determined the" + echo " following way:" + echo " - If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC" + echo " - If only ATAC Reads exist: ATAC" + echo " - Otherwise: mRNA" + echo "" + echo " --cell_calling_bioproduct_algorithm" + echo " type: string" + echo " example: Basic" + echo " choices: [ Basic, Refined ]" + echo " Specify the bioproduct algorithm to be used for putative cell calling:" + echo " Basic or Refined" + echo " By default, the Basic algorithm will be used for putative cell calling." + echo "" + echo " --cell_calling_atac_algorithm" + echo " type: string" + echo " example: Basic" + echo " choices: [ Basic, Refined ]" + echo " Specify the ATAC-seq algorithm to be used for putative cell calling:" + echo " Basic or Refined" + echo " By default, the Basic algorithm will be used for putative cell calling." + echo "" + echo " --exact_cell_count" + echo " type: integer" + echo " example: 10000" + echo " min: 1" + echo " Set a specific number of cells as putative, based on those with the" + echo " highest error-corrected read count" + echo "" + echo " --expected_cell_count" + echo " type: integer" + echo " example: 20000" + echo " min: 1" + echo " Guide the basic putative cell calling algorithm by providing an estimate" + echo " of the number of cells expected. Usually this can be the number of" + echo " cells loaded into the Rhapsody cartridge. If there are multiple" + echo " inflection points on the second derivative cumulative curve, this will" + echo " ensure the one selected is near the expected." + echo "" + echo "Intronic Reads Settings:" + echo " --exclude_intronic_reads" + echo " type: boolean" + echo " example: false" + echo " By default, the flag is false, and reads aligned to exons and introns" + echo " are considered and represented in molecule counts. When the flag is set" + echo " to true, intronic reads will be excluded." + echo " The value can be true or false." + echo "" + echo "Multiplex Settings:" + echo " --sample_tags_version" + echo " type: string" + echo " example: human" + echo " choices: [ human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only ]" + echo " Specify the version of the Sample Tags used in the run:" + echo " * If Sample Tag Multiplexing was done, specify the appropriate version:" + echo " human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only" + echo " * If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq" + echo " (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the" + echo " \"nuclei_includes_mrna\" option." + echo " * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic" + echo " ATAC-Seq (WTA+ATAC-Seq)), choose the \"nuclei_atac_only\" option." + echo "" + echo " --tag_names" + echo " type: string, multiple values allowed" + echo " example: 4-mySample;9-myOtherSample;6-alsoThisSample" + echo " Specify the tag number followed by '-' and the desired sample name to" + echo " appear in Sample_Tag_Metrics.csv" + echo " Do not use the special characters." + echo "" + echo "VDJ arguments:" + echo " --vdj_version" + echo " type: string" + echo " example: human" + echo " choices: [ human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR ]" + echo " If VDJ was done, specify the appropriate option: human, mouse, humanBCR," + echo " humanTCR, mouseBCR, mouseTCR" + echo "" + echo "ATAC options:" + echo " --predefined_atac_peaks" + echo " type: file, file must exist" + echo " example: predefined_peaks.bed" + echo " An optional BED file containing pre-established chromatin accessibility" + echo " peak regions for generating the ATAC cell-by-peak matrix." + echo "" + echo "Additional options:" + echo " --run_name" + echo " type: string" + echo " default: sample" + echo " Specify a run name to use as the output file base name. Use only" + echo " letters, numbers, or hyphens. Do not use special characters or spaces." + echo "" + echo " --generate_bam" + echo " type: boolean" + echo " default: false" + echo " Specify whether to create the BAM file output" + echo "" + echo " --long_reads" + echo " type: boolean" + echo " Use STARlong (default: undefined - i.e. autodetects based on read" + echo " lengths) - Specify if the STARlong aligner should be used instead of" + echo " STAR. Set to true if the reads are longer than 650bp." + echo "" + echo "Advanced options:" + echo " NOTE: Only change these if you are really sure about what you are doing" + echo "" + echo " --custom_star_params" + echo " type: string" + echo " example: --alignIntronMax 6000 --outFilterScoreMinOverLread 0.1" + echo "--limitOutSJcollapsed 2000000" + echo " Modify STAR alignment parameters - Set this parameter to fully override" + echo " default STAR mapping parameters used in the pipeline." + echo " For reference this is the default that is used:" + echo " Short Reads: \`--outFilterScoreMinOverLread 0" + echo " --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0" + echo " --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + echo " --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed" + echo " 2000000\`" + echo " Long Reads: Same as Short Reads + \`--seedPerReadNmax 10000\`" + echo " This applies to fastqs provided in the Reads user input" + echo " Do NOT set any non-mapping related params like \`--genomeDir\`," + echo " \`--outSAMtype\`, \`--outSAMunmapped\`, \`--readFilesIn\`, \`--runThreadN\`," + echo " etc." + echo " We use STAR version 2.7.10b" + echo "" + echo " --custom_bwa_mem2_params" + echo " type: string" + echo " example: -k 16 -w 200 -r" + echo " Modify bwa-mem2 alignment parameters - Set this parameter to fully" + echo " override bwa-mem2 mapping parameters used in the pipeline" + echo " The pipeline does not specify any custom mapping params to bwa-mem2 so" + echo " program default values are used" + echo " This applies to fastqs provided in the Reads_ATAC user input" + echo " Do NOT set any non-mapping related params like \`-C\`, \`-t\`, etc." + echo " We use bwa-mem2 version 2.2.1" + echo "" + echo "CWL-runner arguments:" + echo " --parallel" + echo " type: boolean" + echo " default: true" + echo " Run jobs in parallel." + echo "" + echo " --timestamps" + echo " type: boolean_true" + echo " Add timestamps to the errors, warnings, and notifications." + echo "" + echo "Undocumented arguments:" + echo " --abseq_umi" + echo " type: integer" + echo "" + echo " --target_analysis" + echo " type: boolean" + echo "" + echo " --vdj_jgene_evalue" + echo " type: double" + echo " e-value threshold for J gene. The e-value threshold for J gene call by" + echo " IgBlast/PyIR, default is set as 0.001" + echo "" + echo " --vdj_vgene_evalue" + echo " type: double" + echo " e-value threshold for V gene. The e-value threshold for V gene call by" + echo " IgBlast/PyIR, default is set as 0.001" + echo "" + echo " --write_filtered_reads" + echo " type: boolean" + echo " Output processed FASTQ reads." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bd_rhapsody main" + exit + ;; + --reads) + if [ -z "$VIASH_PAR_READS" ]; then + VIASH_PAR_READS="$2" + else + VIASH_PAR_READS="$VIASH_PAR_READS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reads=*) + if [ -z "$VIASH_PAR_READS" ]; then + VIASH_PAR_READS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READS="$VIASH_PAR_READS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reads_atac) + if [ -z "$VIASH_PAR_READS_ATAC" ]; then + VIASH_PAR_READS_ATAC="$2" + else + VIASH_PAR_READS_ATAC="$VIASH_PAR_READS_ATAC;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reads_atac. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reads_atac=*) + if [ -z "$VIASH_PAR_READS_ATAC" ]; then + VIASH_PAR_READS_ATAC=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READS_ATAC="$VIASH_PAR_READS_ATAC;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference_archive) + [ -n "$VIASH_PAR_REFERENCE_ARCHIVE" ] && ViashError Bad arguments for option \'--reference_archive\': \'$VIASH_PAR_REFERENCE_ARCHIVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ARCHIVE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_archive. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_archive=*) + [ -n "$VIASH_PAR_REFERENCE_ARCHIVE" ] && ViashError Bad arguments for option \'--reference_archive=*\': \'$VIASH_PAR_REFERENCE_ARCHIVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --targeted_reference) + if [ -z "$VIASH_PAR_TARGETED_REFERENCE" ]; then + VIASH_PAR_TARGETED_REFERENCE="$2" + else + VIASH_PAR_TARGETED_REFERENCE="$VIASH_PAR_TARGETED_REFERENCE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --targeted_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --targeted_reference=*) + if [ -z "$VIASH_PAR_TARGETED_REFERENCE" ]; then + VIASH_PAR_TARGETED_REFERENCE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_TARGETED_REFERENCE="$VIASH_PAR_TARGETED_REFERENCE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --abseq_reference) + if [ -z "$VIASH_PAR_ABSEQ_REFERENCE" ]; then + VIASH_PAR_ABSEQ_REFERENCE="$2" + else + VIASH_PAR_ABSEQ_REFERENCE="$VIASH_PAR_ABSEQ_REFERENCE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --abseq_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --abseq_reference=*) + if [ -z "$VIASH_PAR_ABSEQ_REFERENCE" ]; then + VIASH_PAR_ABSEQ_REFERENCE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ABSEQ_REFERENCE="$VIASH_PAR_ABSEQ_REFERENCE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --supplemental_reference) + if [ -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$2" + else + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$VIASH_PAR_SUPPLEMENTAL_REFERENCE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --supplemental_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --supplemental_reference=*) + if [ -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + VIASH_PAR_SUPPLEMENTAL_REFERENCE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$VIASH_PAR_SUPPLEMENTAL_REFERENCE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -s) + if [ -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$2" + else + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$VIASH_PAR_SUPPLEMENTAL_REFERENCE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_dir) + [ -n "$VIASH_PAR_OUTPUT_DIR" ] && ViashError Bad arguments for option \'--output_dir\': \'$VIASH_PAR_OUTPUT_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_dir=*) + [ -n "$VIASH_PAR_OUTPUT_DIR" ] && ViashError Bad arguments for option \'--output_dir=*\': \'$VIASH_PAR_OUTPUT_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT_DIR" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_seurat) + [ -n "$VIASH_PAR_OUTPUT_SEURAT" ] && ViashError Bad arguments for option \'--output_seurat\': \'$VIASH_PAR_OUTPUT_SEURAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SEURAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_seurat. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_seurat=*) + [ -n "$VIASH_PAR_OUTPUT_SEURAT" ] && ViashError Bad arguments for option \'--output_seurat=*\': \'$VIASH_PAR_OUTPUT_SEURAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SEURAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_mudata) + [ -n "$VIASH_PAR_OUTPUT_MUDATA" ] && ViashError Bad arguments for option \'--output_mudata\': \'$VIASH_PAR_OUTPUT_MUDATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MUDATA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_mudata. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_mudata=*) + [ -n "$VIASH_PAR_OUTPUT_MUDATA" ] && ViashError Bad arguments for option \'--output_mudata=*\': \'$VIASH_PAR_OUTPUT_MUDATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MUDATA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --metrics_summary) + [ -n "$VIASH_PAR_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--metrics_summary\': \'$VIASH_PAR_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRICS_SUMMARY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --metrics_summary. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --metrics_summary=*) + [ -n "$VIASH_PAR_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--metrics_summary=*\': \'$VIASH_PAR_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRICS_SUMMARY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pipeline_report) + [ -n "$VIASH_PAR_PIPELINE_REPORT" ] && ViashError Bad arguments for option \'--pipeline_report\': \'$VIASH_PAR_PIPELINE_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PIPELINE_REPORT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pipeline_report. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pipeline_report=*) + [ -n "$VIASH_PAR_PIPELINE_REPORT" ] && ViashError Bad arguments for option \'--pipeline_report=*\': \'$VIASH_PAR_PIPELINE_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PIPELINE_REPORT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rsec_mols_per_cell) + [ -n "$VIASH_PAR_RSEC_MOLS_PER_CELL" ] && ViashError Bad arguments for option \'--rsec_mols_per_cell\': \'$VIASH_PAR_RSEC_MOLS_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RSEC_MOLS_PER_CELL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rsec_mols_per_cell. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rsec_mols_per_cell=*) + [ -n "$VIASH_PAR_RSEC_MOLS_PER_CELL" ] && ViashError Bad arguments for option \'--rsec_mols_per_cell=*\': \'$VIASH_PAR_RSEC_MOLS_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RSEC_MOLS_PER_CELL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dbec_mols_per_cell) + [ -n "$VIASH_PAR_DBEC_MOLS_PER_CELL" ] && ViashError Bad arguments for option \'--dbec_mols_per_cell\': \'$VIASH_PAR_DBEC_MOLS_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DBEC_MOLS_PER_CELL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dbec_mols_per_cell. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dbec_mols_per_cell=*) + [ -n "$VIASH_PAR_DBEC_MOLS_PER_CELL" ] && ViashError Bad arguments for option \'--dbec_mols_per_cell=*\': \'$VIASH_PAR_DBEC_MOLS_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DBEC_MOLS_PER_CELL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --rsec_mols_per_cell_unfiltered) + [ -n "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ] && ViashError Bad arguments for option \'--rsec_mols_per_cell_unfiltered\': \'$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --rsec_mols_per_cell_unfiltered. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --rsec_mols_per_cell_unfiltered=*) + [ -n "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ] && ViashError Bad arguments for option \'--rsec_mols_per_cell_unfiltered=*\': \'$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam_index) + [ -n "$VIASH_PAR_BAM_INDEX" ] && ViashError Bad arguments for option \'--bam_index\': \'$VIASH_PAR_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam_index=*) + [ -n "$VIASH_PAR_BAM_INDEX" ] && ViashError Bad arguments for option \'--bam_index=*\': \'$VIASH_PAR_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bioproduct_stats) + [ -n "$VIASH_PAR_BIOPRODUCT_STATS" ] && ViashError Bad arguments for option \'--bioproduct_stats\': \'$VIASH_PAR_BIOPRODUCT_STATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BIOPRODUCT_STATS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bioproduct_stats. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bioproduct_stats=*) + [ -n "$VIASH_PAR_BIOPRODUCT_STATS" ] && ViashError Bad arguments for option \'--bioproduct_stats=*\': \'$VIASH_PAR_BIOPRODUCT_STATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BIOPRODUCT_STATS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dimred_tsne) + [ -n "$VIASH_PAR_DIMRED_TSNE" ] && ViashError Bad arguments for option \'--dimred_tsne\': \'$VIASH_PAR_DIMRED_TSNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIMRED_TSNE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dimred_tsne. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dimred_tsne=*) + [ -n "$VIASH_PAR_DIMRED_TSNE" ] && ViashError Bad arguments for option \'--dimred_tsne=*\': \'$VIASH_PAR_DIMRED_TSNE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIMRED_TSNE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dimred_umap) + [ -n "$VIASH_PAR_DIMRED_UMAP" ] && ViashError Bad arguments for option \'--dimred_umap\': \'$VIASH_PAR_DIMRED_UMAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIMRED_UMAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dimred_umap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dimred_umap=*) + [ -n "$VIASH_PAR_DIMRED_UMAP" ] && ViashError Bad arguments for option \'--dimred_umap=*\': \'$VIASH_PAR_DIMRED_UMAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIMRED_UMAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --immune_cell_classification) + [ -n "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ] && ViashError Bad arguments for option \'--immune_cell_classification\': \'$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IMMUNE_CELL_CLASSIFICATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --immune_cell_classification. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --immune_cell_classification=*) + [ -n "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ] && ViashError Bad arguments for option \'--immune_cell_classification=*\': \'$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_IMMUNE_CELL_CLASSIFICATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_tag_metrics) + [ -n "$VIASH_PAR_SAMPLE_TAG_METRICS" ] && ViashError Bad arguments for option \'--sample_tag_metrics\': \'$VIASH_PAR_SAMPLE_TAG_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_METRICS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_tag_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_tag_metrics=*) + [ -n "$VIASH_PAR_SAMPLE_TAG_METRICS" ] && ViashError Bad arguments for option \'--sample_tag_metrics=*\': \'$VIASH_PAR_SAMPLE_TAG_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_METRICS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_tag_calls) + [ -n "$VIASH_PAR_SAMPLE_TAG_CALLS" ] && ViashError Bad arguments for option \'--sample_tag_calls\': \'$VIASH_PAR_SAMPLE_TAG_CALLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_CALLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_tag_calls. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_tag_calls=*) + [ -n "$VIASH_PAR_SAMPLE_TAG_CALLS" ] && ViashError Bad arguments for option \'--sample_tag_calls=*\': \'$VIASH_PAR_SAMPLE_TAG_CALLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_CALLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_tag_counts) + [ -n "$VIASH_PAR_SAMPLE_TAG_COUNTS" ] && ViashError Bad arguments for option \'--sample_tag_counts\': \'$VIASH_PAR_SAMPLE_TAG_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_tag_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_tag_counts=*) + [ -n "$VIASH_PAR_SAMPLE_TAG_COUNTS" ] && ViashError Bad arguments for option \'--sample_tag_counts=*\': \'$VIASH_PAR_SAMPLE_TAG_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_tag_counts_unassigned) + [ -n "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ] && ViashError Bad arguments for option \'--sample_tag_counts_unassigned\': \'$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_tag_counts_unassigned. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_tag_counts_unassigned=*) + [ -n "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ] && ViashError Bad arguments for option \'--sample_tag_counts_unassigned=*\': \'$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_metrics) + [ -n "$VIASH_PAR_VDJ_METRICS" ] && ViashError Bad arguments for option \'--vdj_metrics\': \'$VIASH_PAR_VDJ_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_METRICS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_metrics=*) + [ -n "$VIASH_PAR_VDJ_METRICS" ] && ViashError Bad arguments for option \'--vdj_metrics=*\': \'$VIASH_PAR_VDJ_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_METRICS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_per_cell) + [ -n "$VIASH_PAR_VDJ_PER_CELL" ] && ViashError Bad arguments for option \'--vdj_per_cell\': \'$VIASH_PAR_VDJ_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_PER_CELL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_per_cell. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_per_cell=*) + [ -n "$VIASH_PAR_VDJ_PER_CELL" ] && ViashError Bad arguments for option \'--vdj_per_cell=*\': \'$VIASH_PAR_VDJ_PER_CELL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_PER_CELL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_per_cell_uncorrected) + [ -n "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ] && ViashError Bad arguments for option \'--vdj_per_cell_uncorrected\': \'$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_PER_CELL_UNCORRECTED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_per_cell_uncorrected. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_per_cell_uncorrected=*) + [ -n "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ] && ViashError Bad arguments for option \'--vdj_per_cell_uncorrected=*\': \'$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_PER_CELL_UNCORRECTED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_dominant_contigs) + [ -n "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ] && ViashError Bad arguments for option \'--vdj_dominant_contigs\': \'$VIASH_PAR_VDJ_DOMINANT_CONTIGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_DOMINANT_CONTIGS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_dominant_contigs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_dominant_contigs=*) + [ -n "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ] && ViashError Bad arguments for option \'--vdj_dominant_contigs=*\': \'$VIASH_PAR_VDJ_DOMINANT_CONTIGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_DOMINANT_CONTIGS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_unfiltered_contigs) + [ -n "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ] && ViashError Bad arguments for option \'--vdj_unfiltered_contigs\': \'$VIASH_PAR_VDJ_UNFILTERED_CONTIGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_UNFILTERED_CONTIGS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_unfiltered_contigs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_unfiltered_contigs=*) + [ -n "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ] && ViashError Bad arguments for option \'--vdj_unfiltered_contigs=*\': \'$VIASH_PAR_VDJ_UNFILTERED_CONTIGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_UNFILTERED_CONTIGS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_metrics) + [ -n "$VIASH_PAR_ATAC_METRICS" ] && ViashError Bad arguments for option \'--atac_metrics\': \'$VIASH_PAR_ATAC_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_METRICS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_metrics=*) + [ -n "$VIASH_PAR_ATAC_METRICS" ] && ViashError Bad arguments for option \'--atac_metrics=*\': \'$VIASH_PAR_ATAC_METRICS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_METRICS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_metrics_json) + [ -n "$VIASH_PAR_ATAC_METRICS_JSON" ] && ViashError Bad arguments for option \'--atac_metrics_json\': \'$VIASH_PAR_ATAC_METRICS_JSON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_METRICS_JSON="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_metrics_json. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_metrics_json=*) + [ -n "$VIASH_PAR_ATAC_METRICS_JSON" ] && ViashError Bad arguments for option \'--atac_metrics_json=*\': \'$VIASH_PAR_ATAC_METRICS_JSON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_METRICS_JSON=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_fragments) + [ -n "$VIASH_PAR_ATAC_FRAGMENTS" ] && ViashError Bad arguments for option \'--atac_fragments\': \'$VIASH_PAR_ATAC_FRAGMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_FRAGMENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_fragments. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_fragments=*) + [ -n "$VIASH_PAR_ATAC_FRAGMENTS" ] && ViashError Bad arguments for option \'--atac_fragments=*\': \'$VIASH_PAR_ATAC_FRAGMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_FRAGMENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_fragments_index) + [ -n "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ] && ViashError Bad arguments for option \'--atac_fragments_index\': \'$VIASH_PAR_ATAC_FRAGMENTS_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_FRAGMENTS_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_fragments_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_fragments_index=*) + [ -n "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ] && ViashError Bad arguments for option \'--atac_fragments_index=*\': \'$VIASH_PAR_ATAC_FRAGMENTS_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_FRAGMENTS_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_transposase_sites) + [ -n "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ] && ViashError Bad arguments for option \'--atac_transposase_sites\': \'$VIASH_PAR_ATAC_TRANSPOSASE_SITES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_TRANSPOSASE_SITES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_transposase_sites. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_transposase_sites=*) + [ -n "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ] && ViashError Bad arguments for option \'--atac_transposase_sites=*\': \'$VIASH_PAR_ATAC_TRANSPOSASE_SITES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_TRANSPOSASE_SITES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_transposase_sites_index) + [ -n "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ] && ViashError Bad arguments for option \'--atac_transposase_sites_index\': \'$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_transposase_sites_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_transposase_sites_index=*) + [ -n "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ] && ViashError Bad arguments for option \'--atac_transposase_sites_index=*\': \'$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_peaks) + [ -n "$VIASH_PAR_ATAC_PEAKS" ] && ViashError Bad arguments for option \'--atac_peaks\': \'$VIASH_PAR_ATAC_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAKS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_peaks. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_peaks=*) + [ -n "$VIASH_PAR_ATAC_PEAKS" ] && ViashError Bad arguments for option \'--atac_peaks=*\': \'$VIASH_PAR_ATAC_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAKS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_peaks_index) + [ -n "$VIASH_PAR_ATAC_PEAKS_INDEX" ] && ViashError Bad arguments for option \'--atac_peaks_index\': \'$VIASH_PAR_ATAC_PEAKS_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAKS_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_peaks_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_peaks_index=*) + [ -n "$VIASH_PAR_ATAC_PEAKS_INDEX" ] && ViashError Bad arguments for option \'--atac_peaks_index=*\': \'$VIASH_PAR_ATAC_PEAKS_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAKS_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_peak_annotation) + [ -n "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ] && ViashError Bad arguments for option \'--atac_peak_annotation\': \'$VIASH_PAR_ATAC_PEAK_ANNOTATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAK_ANNOTATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_peak_annotation. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_peak_annotation=*) + [ -n "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ] && ViashError Bad arguments for option \'--atac_peak_annotation=*\': \'$VIASH_PAR_ATAC_PEAK_ANNOTATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_PEAK_ANNOTATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_cell_by_peak) + [ -n "$VIASH_PAR_ATAC_CELL_BY_PEAK" ] && ViashError Bad arguments for option \'--atac_cell_by_peak\': \'$VIASH_PAR_ATAC_CELL_BY_PEAK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_CELL_BY_PEAK="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_cell_by_peak. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_cell_by_peak=*) + [ -n "$VIASH_PAR_ATAC_CELL_BY_PEAK" ] && ViashError Bad arguments for option \'--atac_cell_by_peak=*\': \'$VIASH_PAR_ATAC_CELL_BY_PEAK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_CELL_BY_PEAK=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_cell_by_peak_unfiltered) + [ -n "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ] && ViashError Bad arguments for option \'--atac_cell_by_peak_unfiltered\': \'$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_cell_by_peak_unfiltered. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_cell_by_peak_unfiltered=*) + [ -n "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ] && ViashError Bad arguments for option \'--atac_cell_by_peak_unfiltered=*\': \'$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_bam) + [ -n "$VIASH_PAR_ATAC_BAM" ] && ViashError Bad arguments for option \'--atac_bam\': \'$VIASH_PAR_ATAC_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_bam=*) + [ -n "$VIASH_PAR_ATAC_BAM" ] && ViashError Bad arguments for option \'--atac_bam=*\': \'$VIASH_PAR_ATAC_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --atac_bam_index) + [ -n "$VIASH_PAR_ATAC_BAM_INDEX" ] && ViashError Bad arguments for option \'--atac_bam_index\': \'$VIASH_PAR_ATAC_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_BAM_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --atac_bam_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --atac_bam_index=*) + [ -n "$VIASH_PAR_ATAC_BAM_INDEX" ] && ViashError Bad arguments for option \'--atac_bam_index=*\': \'$VIASH_PAR_ATAC_BAM_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ATAC_BAM_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --protein_aggregates_experimental) + [ -n "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ] && ViashError Bad arguments for option \'--protein_aggregates_experimental\': \'$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --protein_aggregates_experimental. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --protein_aggregates_experimental=*) + [ -n "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ] && ViashError Bad arguments for option \'--protein_aggregates_experimental=*\': \'$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_calling_data) + [ -n "$VIASH_PAR_CELL_CALLING_DATA" ] && ViashError Bad arguments for option \'--cell_calling_data\': \'$VIASH_PAR_CELL_CALLING_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_DATA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_calling_data. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_calling_data=*) + [ -n "$VIASH_PAR_CELL_CALLING_DATA" ] && ViashError Bad arguments for option \'--cell_calling_data=*\': \'$VIASH_PAR_CELL_CALLING_DATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_DATA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_calling_bioproduct_algorithm) + [ -n "$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM" ] && ViashError Bad arguments for option \'--cell_calling_bioproduct_algorithm\': \'$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_calling_bioproduct_algorithm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_calling_bioproduct_algorithm=*) + [ -n "$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM" ] && ViashError Bad arguments for option \'--cell_calling_bioproduct_algorithm=*\': \'$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_calling_atac_algorithm) + [ -n "$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM" ] && ViashError Bad arguments for option \'--cell_calling_atac_algorithm\': \'$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_calling_atac_algorithm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_calling_atac_algorithm=*) + [ -n "$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM" ] && ViashError Bad arguments for option \'--cell_calling_atac_algorithm=*\': \'$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exact_cell_count) + [ -n "$VIASH_PAR_EXACT_CELL_COUNT" ] && ViashError Bad arguments for option \'--exact_cell_count\': \'$VIASH_PAR_EXACT_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXACT_CELL_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exact_cell_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exact_cell_count=*) + [ -n "$VIASH_PAR_EXACT_CELL_COUNT" ] && ViashError Bad arguments for option \'--exact_cell_count=*\': \'$VIASH_PAR_EXACT_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXACT_CELL_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expected_cell_count) + [ -n "$VIASH_PAR_EXPECTED_CELL_COUNT" ] && ViashError Bad arguments for option \'--expected_cell_count\': \'$VIASH_PAR_EXPECTED_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELL_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expected_cell_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expected_cell_count=*) + [ -n "$VIASH_PAR_EXPECTED_CELL_COUNT" ] && ViashError Bad arguments for option \'--expected_cell_count=*\': \'$VIASH_PAR_EXPECTED_CELL_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECTED_CELL_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exclude_intronic_reads) + [ -n "$VIASH_PAR_EXCLUDE_INTRONIC_READS" ] && ViashError Bad arguments for option \'--exclude_intronic_reads\': \'$VIASH_PAR_EXCLUDE_INTRONIC_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_INTRONIC_READS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --exclude_intronic_reads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --exclude_intronic_reads=*) + [ -n "$VIASH_PAR_EXCLUDE_INTRONIC_READS" ] && ViashError Bad arguments for option \'--exclude_intronic_reads=*\': \'$VIASH_PAR_EXCLUDE_INTRONIC_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_INTRONIC_READS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_tags_version) + [ -n "$VIASH_PAR_SAMPLE_TAGS_VERSION" ] && ViashError Bad arguments for option \'--sample_tags_version\': \'$VIASH_PAR_SAMPLE_TAGS_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAGS_VERSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_tags_version. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_tags_version=*) + [ -n "$VIASH_PAR_SAMPLE_TAGS_VERSION" ] && ViashError Bad arguments for option \'--sample_tags_version=*\': \'$VIASH_PAR_SAMPLE_TAGS_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SAMPLE_TAGS_VERSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tag_names) + if [ -z "$VIASH_PAR_TAG_NAMES" ]; then + VIASH_PAR_TAG_NAMES="$2" + else + VIASH_PAR_TAG_NAMES="$VIASH_PAR_TAG_NAMES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tag_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tag_names=*) + if [ -z "$VIASH_PAR_TAG_NAMES" ]; then + VIASH_PAR_TAG_NAMES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_TAG_NAMES="$VIASH_PAR_TAG_NAMES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --vdj_version) + [ -n "$VIASH_PAR_VDJ_VERSION" ] && ViashError Bad arguments for option \'--vdj_version\': \'$VIASH_PAR_VDJ_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_VERSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_version. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_version=*) + [ -n "$VIASH_PAR_VDJ_VERSION" ] && ViashError Bad arguments for option \'--vdj_version=*\': \'$VIASH_PAR_VDJ_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_VERSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --predefined_atac_peaks) + [ -n "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ] && ViashError Bad arguments for option \'--predefined_atac_peaks\': \'$VIASH_PAR_PREDEFINED_ATAC_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PREDEFINED_ATAC_PEAKS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --predefined_atac_peaks. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --predefined_atac_peaks=*) + [ -n "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ] && ViashError Bad arguments for option \'--predefined_atac_peaks=*\': \'$VIASH_PAR_PREDEFINED_ATAC_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PREDEFINED_ATAC_PEAKS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --run_name) + [ -n "$VIASH_PAR_RUN_NAME" ] && ViashError Bad arguments for option \'--run_name\': \'$VIASH_PAR_RUN_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_NAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --run_name. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_name=*) + [ -n "$VIASH_PAR_RUN_NAME" ] && ViashError Bad arguments for option \'--run_name=*\': \'$VIASH_PAR_RUN_NAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_NAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --generate_bam) + [ -n "$VIASH_PAR_GENERATE_BAM" ] && ViashError Bad arguments for option \'--generate_bam\': \'$VIASH_PAR_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENERATE_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --generate_bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --generate_bam=*) + [ -n "$VIASH_PAR_GENERATE_BAM" ] && ViashError Bad arguments for option \'--generate_bam=*\': \'$VIASH_PAR_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENERATE_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --long_reads) + [ -n "$VIASH_PAR_LONG_READS" ] && ViashError Bad arguments for option \'--long_reads\': \'$VIASH_PAR_LONG_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LONG_READS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --long_reads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --long_reads=*) + [ -n "$VIASH_PAR_LONG_READS" ] && ViashError Bad arguments for option \'--long_reads=*\': \'$VIASH_PAR_LONG_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LONG_READS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --custom_star_params) + [ -n "$VIASH_PAR_CUSTOM_STAR_PARAMS" ] && ViashError Bad arguments for option \'--custom_star_params\': \'$VIASH_PAR_CUSTOM_STAR_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUSTOM_STAR_PARAMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --custom_star_params. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --custom_star_params=*) + [ -n "$VIASH_PAR_CUSTOM_STAR_PARAMS" ] && ViashError Bad arguments for option \'--custom_star_params=*\': \'$VIASH_PAR_CUSTOM_STAR_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUSTOM_STAR_PARAMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --custom_bwa_mem2_params) + [ -n "$VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS" ] && ViashError Bad arguments for option \'--custom_bwa_mem2_params\': \'$VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --custom_bwa_mem2_params. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --custom_bwa_mem2_params=*) + [ -n "$VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS" ] && ViashError Bad arguments for option \'--custom_bwa_mem2_params=*\': \'$VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --parallel) + [ -n "$VIASH_PAR_PARALLEL" ] && ViashError Bad arguments for option \'--parallel\': \'$VIASH_PAR_PARALLEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PARALLEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --parallel. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --parallel=*) + [ -n "$VIASH_PAR_PARALLEL" ] && ViashError Bad arguments for option \'--parallel=*\': \'$VIASH_PAR_PARALLEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PARALLEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --timestamps) + [ -n "$VIASH_PAR_TIMESTAMPS" ] && ViashError Bad arguments for option \'--timestamps\': \'$VIASH_PAR_TIMESTAMPS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TIMESTAMPS=true + shift 1 + ;; + --abseq_umi) + [ -n "$VIASH_PAR_ABSEQ_UMI" ] && ViashError Bad arguments for option \'--abseq_umi\': \'$VIASH_PAR_ABSEQ_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ABSEQ_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --abseq_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --abseq_umi=*) + [ -n "$VIASH_PAR_ABSEQ_UMI" ] && ViashError Bad arguments for option \'--abseq_umi=*\': \'$VIASH_PAR_ABSEQ_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ABSEQ_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --target_analysis) + [ -n "$VIASH_PAR_TARGET_ANALYSIS" ] && ViashError Bad arguments for option \'--target_analysis\': \'$VIASH_PAR_TARGET_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGET_ANALYSIS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --target_analysis. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --target_analysis=*) + [ -n "$VIASH_PAR_TARGET_ANALYSIS" ] && ViashError Bad arguments for option \'--target_analysis=*\': \'$VIASH_PAR_TARGET_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGET_ANALYSIS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_jgene_evalue) + [ -n "$VIASH_PAR_VDJ_JGENE_EVALUE" ] && ViashError Bad arguments for option \'--vdj_jgene_evalue\': \'$VIASH_PAR_VDJ_JGENE_EVALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_JGENE_EVALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_jgene_evalue. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_jgene_evalue=*) + [ -n "$VIASH_PAR_VDJ_JGENE_EVALUE" ] && ViashError Bad arguments for option \'--vdj_jgene_evalue=*\': \'$VIASH_PAR_VDJ_JGENE_EVALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_JGENE_EVALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_vgene_evalue) + [ -n "$VIASH_PAR_VDJ_VGENE_EVALUE" ] && ViashError Bad arguments for option \'--vdj_vgene_evalue\': \'$VIASH_PAR_VDJ_VGENE_EVALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_VGENE_EVALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_vgene_evalue. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_vgene_evalue=*) + [ -n "$VIASH_PAR_VDJ_VGENE_EVALUE" ] && ViashError Bad arguments for option \'--vdj_vgene_evalue=*\': \'$VIASH_PAR_VDJ_VGENE_EVALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_VGENE_EVALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --write_filtered_reads) + [ -n "$VIASH_PAR_WRITE_FILTERED_READS" ] && ViashError Bad arguments for option \'--write_filtered_reads\': \'$VIASH_PAR_WRITE_FILTERED_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WRITE_FILTERED_READS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --write_filtered_reads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --write_filtered_reads=*) + [ -n "$VIASH_PAR_WRITE_FILTERED_READS" ] && ViashError Bad arguments for option \'--write_filtered_reads=*\': \'$VIASH_PAR_WRITE_FILTERED_READS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WRITE_FILTERED_READS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/bd_rhapsody:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_OUTPUT_DIR+x} ]; then + ViashError '--output_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_RUN_NAME+x} ]; then + VIASH_PAR_RUN_NAME="sample" +fi +if [ -z ${VIASH_PAR_GENERATE_BAM+x} ]; then + VIASH_PAR_GENERATE_BAM="false" +fi +if [ -z ${VIASH_PAR_PARALLEL+x} ]; then + VIASH_PAR_PARALLEL="true" +fi +if [ -z ${VIASH_PAR_TIMESTAMPS+x} ]; then + VIASH_PAR_TIMESTAMPS="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_READS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_READS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_READS_ATAC" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_READS_ATAC; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ] && [ ! -e "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE_ARCHIVE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TARGETED_REFERENCE" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_TARGETED_REFERENCE; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_ABSEQ_REFERENCE" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_ABSEQ_REFERENCE; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_SUPPLEMENTAL_REFERENCE; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ] && [ ! -e "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ]; then + ViashError "Input file '$VIASH_PAR_PREDEFINED_ATAC_PEAKS' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_SAMPLE_TAG_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_SAMPLE_TAG_COUNTS" =~ \* ]]; then + ViashError '--sample_tag_counts' has to be a path containing a wildcard, e.g. 'output_*.txt'. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXACT_CELL_COUNT" ]]; then + if ! [[ "$VIASH_PAR_EXACT_CELL_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--exact_cell_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EXACT_CELL_COUNT -lt 1 ]]; then + ViashError '--exact_cell_count' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXPECTED_CELL_COUNT" ]]; then + if ! [[ "$VIASH_PAR_EXPECTED_CELL_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--expected_cell_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EXPECTED_CELL_COUNT -lt 1 ]]; then + ViashError '--expected_cell_count' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXCLUDE_INTRONIC_READS" ]]; then + if ! [[ "$VIASH_PAR_EXCLUDE_INTRONIC_READS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--exclude_intronic_reads' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENERATE_BAM" ]]; then + if ! [[ "$VIASH_PAR_GENERATE_BAM" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--generate_bam' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LONG_READS" ]]; then + if ! [[ "$VIASH_PAR_LONG_READS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--long_reads' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PARALLEL" ]]; then + if ! [[ "$VIASH_PAR_PARALLEL" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--parallel' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TIMESTAMPS" ]]; then + if ! [[ "$VIASH_PAR_TIMESTAMPS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--timestamps' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ABSEQ_UMI" ]]; then + if ! [[ "$VIASH_PAR_ABSEQ_UMI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--abseq_umi' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TARGET_ANALYSIS" ]]; then + if ! [[ "$VIASH_PAR_TARGET_ANALYSIS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--target_analysis' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VDJ_JGENE_EVALUE" ]]; then + if ! [[ "$VIASH_PAR_VDJ_JGENE_EVALUE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--vdj_jgene_evalue' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VDJ_VGENE_EVALUE" ]]; then + if ! [[ "$VIASH_PAR_VDJ_VGENE_EVALUE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--vdj_vgene_evalue' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WRITE_FILTERED_READS" ]]; then + if ! [[ "$VIASH_PAR_WRITE_FILTERED_READS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--write_filtered_reads' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_CELL_CALLING_DATA" ]; then + VIASH_PAR_CELL_CALLING_DATA_CHOICES=("mRNA;AbSeq;ATAC;mRNA_and_ATAC") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CELL_CALLING_DATA_CHOICES[*]};" =~ ";$VIASH_PAR_CELL_CALLING_DATA;" ]]; then + ViashError '--cell_calling_data' specified value of \'$VIASH_PAR_CELL_CALLING_DATA\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM" ]; then + VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM_CHOICES=("Basic;Refined") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM_CHOICES[*]};" =~ ";$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM;" ]]; then + ViashError '--cell_calling_bioproduct_algorithm' specified value of \'$VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM" ]; then + VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM_CHOICES=("Basic;Refined") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM_CHOICES[*]};" =~ ";$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM;" ]]; then + ViashError '--cell_calling_atac_algorithm' specified value of \'$VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SAMPLE_TAGS_VERSION" ]; then + VIASH_PAR_SAMPLE_TAGS_VERSION_CHOICES=("human;mouse;flex;nuclei_includes_mrna;nuclei_atac_only") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SAMPLE_TAGS_VERSION_CHOICES[*]};" =~ ";$VIASH_PAR_SAMPLE_TAGS_VERSION;" ]]; then + ViashError '--sample_tags_version' specified value of \'$VIASH_PAR_SAMPLE_TAGS_VERSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_VDJ_VERSION" ]; then + VIASH_PAR_VDJ_VERSION_CHOICES=("human;mouse;humanBCR;humanTCR;mouseBCR;mouseTCR") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_VDJ_VERSION_CHOICES[*]};" =~ ";$VIASH_PAR_VDJ_VERSION;" ]]; then + ViashError '--vdj_version' specified value of \'$VIASH_PAR_VDJ_VERSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_DIR")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SEURAT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_SEURAT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_SEURAT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MUDATA" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_MUDATA")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_MUDATA")" +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ] && [ ! -d "$(dirname "$VIASH_PAR_METRICS_SUMMARY")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_METRICS_SUMMARY")" +fi +if [ ! -z "$VIASH_PAR_PIPELINE_REPORT" ] && [ ! -d "$(dirname "$VIASH_PAR_PIPELINE_REPORT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_PIPELINE_REPORT")" +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL" ] && [ ! -d "$(dirname "$VIASH_PAR_RSEC_MOLS_PER_CELL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_RSEC_MOLS_PER_CELL")" +fi +if [ ! -z "$VIASH_PAR_DBEC_MOLS_PER_CELL" ] && [ ! -d "$(dirname "$VIASH_PAR_DBEC_MOLS_PER_CELL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_DBEC_MOLS_PER_CELL")" +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ] && [ ! -d "$(dirname "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED")" +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -d "$(dirname "$VIASH_PAR_BAM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_BAM")" +fi +if [ ! -z "$VIASH_PAR_BAM_INDEX" ] && [ ! -d "$(dirname "$VIASH_PAR_BAM_INDEX")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_BAM_INDEX")" +fi +if [ ! -z "$VIASH_PAR_BIOPRODUCT_STATS" ] && [ ! -d "$(dirname "$VIASH_PAR_BIOPRODUCT_STATS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_BIOPRODUCT_STATS")" +fi +if [ ! -z "$VIASH_PAR_DIMRED_TSNE" ] && [ ! -d "$(dirname "$VIASH_PAR_DIMRED_TSNE")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_DIMRED_TSNE")" +fi +if [ ! -z "$VIASH_PAR_DIMRED_UMAP" ] && [ ! -d "$(dirname "$VIASH_PAR_DIMRED_UMAP")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_DIMRED_UMAP")" +fi +if [ ! -z "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ] && [ ! -d "$(dirname "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION")" +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_METRICS" ] && [ ! -d "$(dirname "$VIASH_PAR_SAMPLE_TAG_METRICS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_SAMPLE_TAG_METRICS")" +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_CALLS" ] && [ ! -d "$(dirname "$VIASH_PAR_SAMPLE_TAG_CALLS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_SAMPLE_TAG_CALLS")" +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS" ] && [ ! -d "$(dirname "$VIASH_PAR_SAMPLE_TAG_COUNTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_SAMPLE_TAG_COUNTS")" +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ] && [ ! -d "$(dirname "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED")" +fi +if [ ! -z "$VIASH_PAR_VDJ_METRICS" ] && [ ! -d "$(dirname "$VIASH_PAR_VDJ_METRICS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_VDJ_METRICS")" +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL" ] && [ ! -d "$(dirname "$VIASH_PAR_VDJ_PER_CELL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_VDJ_PER_CELL")" +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ] && [ ! -d "$(dirname "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED")" +fi +if [ ! -z "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ] && [ ! -d "$(dirname "$VIASH_PAR_VDJ_DOMINANT_CONTIGS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_VDJ_DOMINANT_CONTIGS")" +fi +if [ ! -z "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ] && [ ! -d "$(dirname "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS")" +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_METRICS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_METRICS")" +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS_JSON" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_METRICS_JSON")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_METRICS_JSON")" +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_FRAGMENTS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_FRAGMENTS")" +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_FRAGMENTS_INDEX")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_FRAGMENTS_INDEX")" +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_TRANSPOSASE_SITES")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_TRANSPOSASE_SITES")" +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX")" +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_PEAKS")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_PEAKS")" +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS_INDEX" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_PEAKS_INDEX")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_PEAKS_INDEX")" +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_PEAK_ANNOTATION")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_PEAK_ANNOTATION")" +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_CELL_BY_PEAK")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_CELL_BY_PEAK")" +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED")" +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_BAM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_BAM")" +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM_INDEX" ] && [ ! -d "$(dirname "$VIASH_PAR_ATAC_BAM_INDEX")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ATAC_BAM_INDEX")" +fi +if [ ! -z "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ] && [ ! -d "$(dirname "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_READS" ]; then + VIASH_TEST_READS=() + IFS=';' + for var in $VIASH_PAR_READS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_READS+=( "$var" ) + done + VIASH_PAR_READS=$(IFS=';' ; echo "${VIASH_TEST_READS[*]}") +fi +if [ ! -z "$VIASH_PAR_READS_ATAC" ]; then + VIASH_TEST_READS_ATAC=() + IFS=';' + for var in $VIASH_PAR_READS_ATAC; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_READS_ATAC+=( "$var" ) + done + VIASH_PAR_READS_ATAC=$(IFS=';' ; echo "${VIASH_TEST_READS_ATAC[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE_ARCHIVE")" ) + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE_ARCHIVE") +fi +if [ ! -z "$VIASH_PAR_TARGETED_REFERENCE" ]; then + VIASH_TEST_TARGETED_REFERENCE=() + IFS=';' + for var in $VIASH_PAR_TARGETED_REFERENCE; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_TARGETED_REFERENCE+=( "$var" ) + done + VIASH_PAR_TARGETED_REFERENCE=$(IFS=';' ; echo "${VIASH_TEST_TARGETED_REFERENCE[*]}") +fi +if [ ! -z "$VIASH_PAR_ABSEQ_REFERENCE" ]; then + VIASH_TEST_ABSEQ_REFERENCE=() + IFS=';' + for var in $VIASH_PAR_ABSEQ_REFERENCE; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_ABSEQ_REFERENCE+=( "$var" ) + done + VIASH_PAR_ABSEQ_REFERENCE=$(IFS=';' ; echo "${VIASH_TEST_ABSEQ_REFERENCE[*]}") +fi +if [ ! -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + VIASH_TEST_SUPPLEMENTAL_REFERENCE=() + IFS=';' + for var in $VIASH_PAR_SUPPLEMENTAL_REFERENCE; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_SUPPLEMENTAL_REFERENCE+=( "$var" ) + done + VIASH_PAR_SUPPLEMENTAL_REFERENCE=$(IFS=';' ; echo "${VIASH_TEST_SUPPLEMENTAL_REFERENCE[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_DIR")" ) + VIASH_PAR_OUTPUT_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_DIR" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SEURAT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_SEURAT")" ) + VIASH_PAR_OUTPUT_SEURAT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_SEURAT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_SEURAT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MUDATA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_MUDATA")" ) + VIASH_PAR_OUTPUT_MUDATA=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_MUDATA") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_MUDATA" ) +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_METRICS_SUMMARY")" ) + VIASH_PAR_METRICS_SUMMARY=$(ViashDockerAutodetectMount "$VIASH_PAR_METRICS_SUMMARY") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_METRICS_SUMMARY" ) +fi +if [ ! -z "$VIASH_PAR_PIPELINE_REPORT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PIPELINE_REPORT")" ) + VIASH_PAR_PIPELINE_REPORT=$(ViashDockerAutodetectMount "$VIASH_PAR_PIPELINE_REPORT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_PIPELINE_REPORT" ) +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_RSEC_MOLS_PER_CELL")" ) + VIASH_PAR_RSEC_MOLS_PER_CELL=$(ViashDockerAutodetectMount "$VIASH_PAR_RSEC_MOLS_PER_CELL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_RSEC_MOLS_PER_CELL" ) +fi +if [ ! -z "$VIASH_PAR_DBEC_MOLS_PER_CELL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_DBEC_MOLS_PER_CELL")" ) + VIASH_PAR_DBEC_MOLS_PER_CELL=$(ViashDockerAutodetectMount "$VIASH_PAR_DBEC_MOLS_PER_CELL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_DBEC_MOLS_PER_CELL" ) +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED")" ) + VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED=$(ViashDockerAutodetectMount "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ) +fi +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_BAM" ) +fi +if [ ! -z "$VIASH_PAR_BAM_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM_INDEX")" ) + VIASH_PAR_BAM_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM_INDEX") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_BAM_INDEX" ) +fi +if [ ! -z "$VIASH_PAR_BIOPRODUCT_STATS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BIOPRODUCT_STATS")" ) + VIASH_PAR_BIOPRODUCT_STATS=$(ViashDockerAutodetectMount "$VIASH_PAR_BIOPRODUCT_STATS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_BIOPRODUCT_STATS" ) +fi +if [ ! -z "$VIASH_PAR_DIMRED_TSNE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_DIMRED_TSNE")" ) + VIASH_PAR_DIMRED_TSNE=$(ViashDockerAutodetectMount "$VIASH_PAR_DIMRED_TSNE") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_DIMRED_TSNE" ) +fi +if [ ! -z "$VIASH_PAR_DIMRED_UMAP" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_DIMRED_UMAP")" ) + VIASH_PAR_DIMRED_UMAP=$(ViashDockerAutodetectMount "$VIASH_PAR_DIMRED_UMAP") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_DIMRED_UMAP" ) +fi +if [ ! -z "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION")" ) + VIASH_PAR_IMMUNE_CELL_CLASSIFICATION=$(ViashDockerAutodetectMount "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ) +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_METRICS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_TAG_METRICS")" ) + VIASH_PAR_SAMPLE_TAG_METRICS=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_TAG_METRICS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_SAMPLE_TAG_METRICS" ) +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_CALLS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_TAG_CALLS")" ) + VIASH_PAR_SAMPLE_TAG_CALLS=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_TAG_CALLS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_SAMPLE_TAG_CALLS" ) +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS" ]; then + VIASH_TEST_SAMPLE_TAG_COUNTS=() + IFS=';' + for var in $VIASH_PAR_SAMPLE_TAG_COUNTS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_SAMPLE_TAG_COUNTS+=( "$var" ) + VIASH_CHOWN_VARS+=( "$var" ) + done + VIASH_PAR_SAMPLE_TAG_COUNTS=$(IFS=';' ; echo "${VIASH_TEST_SAMPLE_TAG_COUNTS[*]}") +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED")" ) + VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED=$(ViashDockerAutodetectMount "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ) +fi +if [ ! -z "$VIASH_PAR_VDJ_METRICS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_METRICS")" ) + VIASH_PAR_VDJ_METRICS=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_METRICS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_VDJ_METRICS" ) +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_PER_CELL")" ) + VIASH_PAR_VDJ_PER_CELL=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_PER_CELL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_VDJ_PER_CELL" ) +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED")" ) + VIASH_PAR_VDJ_PER_CELL_UNCORRECTED=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ) +fi +if [ ! -z "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_DOMINANT_CONTIGS")" ) + VIASH_PAR_VDJ_DOMINANT_CONTIGS=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_DOMINANT_CONTIGS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ) +fi +if [ ! -z "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS")" ) + VIASH_PAR_VDJ_UNFILTERED_CONTIGS=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_METRICS")" ) + VIASH_PAR_ATAC_METRICS=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_METRICS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_METRICS" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS_JSON" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_METRICS_JSON")" ) + VIASH_PAR_ATAC_METRICS_JSON=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_METRICS_JSON") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_METRICS_JSON" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_FRAGMENTS")" ) + VIASH_PAR_ATAC_FRAGMENTS=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_FRAGMENTS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_FRAGMENTS" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_FRAGMENTS_INDEX")" ) + VIASH_PAR_ATAC_FRAGMENTS_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_FRAGMENTS_INDEX") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_TRANSPOSASE_SITES")" ) + VIASH_PAR_ATAC_TRANSPOSASE_SITES=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_TRANSPOSASE_SITES") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX")" ) + VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_PEAKS")" ) + VIASH_PAR_ATAC_PEAKS=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_PEAKS") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_PEAKS" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_PEAKS_INDEX")" ) + VIASH_PAR_ATAC_PEAKS_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_PEAKS_INDEX") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_PEAKS_INDEX" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_PEAK_ANNOTATION")" ) + VIASH_PAR_ATAC_PEAK_ANNOTATION=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_PEAK_ANNOTATION") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_CELL_BY_PEAK")" ) + VIASH_PAR_ATAC_CELL_BY_PEAK=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_CELL_BY_PEAK") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_CELL_BY_PEAK" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED")" ) + VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_BAM")" ) + VIASH_PAR_ATAC_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_BAM") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_BAM" ) +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ATAC_BAM_INDEX")" ) + VIASH_PAR_ATAC_BAM_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_ATAC_BAM_INDEX") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ATAC_BAM_INDEX" ) +fi +if [ ! -z "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL")" ) + VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL=$(ViashDockerAutodetectMount "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ) +fi +if [ ! -z "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PREDEFINED_ATAC_PEAKS")" ) + VIASH_PAR_PREDEFINED_ATAC_PEAKS=$(ViashDockerAutodetectMount "$VIASH_PAR_PREDEFINED_ATAC_PEAKS") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bd_rhapsody-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil +import glob + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'reads': $( if [ ! -z ${VIASH_PAR_READS+x} ]; then echo "r'${VIASH_PAR_READS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reads_atac': $( if [ ! -z ${VIASH_PAR_READS_ATAC+x} ]; then echo "r'${VIASH_PAR_READS_ATAC//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference_archive': $( if [ ! -z ${VIASH_PAR_REFERENCE_ARCHIVE+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ARCHIVE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'targeted_reference': $( if [ ! -z ${VIASH_PAR_TARGETED_REFERENCE+x} ]; then echo "r'${VIASH_PAR_TARGETED_REFERENCE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'abseq_reference': $( if [ ! -z ${VIASH_PAR_ABSEQ_REFERENCE+x} ]; then echo "r'${VIASH_PAR_ABSEQ_REFERENCE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'supplemental_reference': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTAL_REFERENCE+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTAL_REFERENCE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_dir': $( if [ ! -z ${VIASH_PAR_OUTPUT_DIR+x} ]; then echo "r'${VIASH_PAR_OUTPUT_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_seurat': $( if [ ! -z ${VIASH_PAR_OUTPUT_SEURAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SEURAT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_mudata': $( if [ ! -z ${VIASH_PAR_OUTPUT_MUDATA+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MUDATA//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'metrics_summary': $( if [ ! -z ${VIASH_PAR_METRICS_SUMMARY+x} ]; then echo "r'${VIASH_PAR_METRICS_SUMMARY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pipeline_report': $( if [ ! -z ${VIASH_PAR_PIPELINE_REPORT+x} ]; then echo "r'${VIASH_PAR_PIPELINE_REPORT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rsec_mols_per_cell': $( if [ ! -z ${VIASH_PAR_RSEC_MOLS_PER_CELL+x} ]; then echo "r'${VIASH_PAR_RSEC_MOLS_PER_CELL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'dbec_mols_per_cell': $( if [ ! -z ${VIASH_PAR_DBEC_MOLS_PER_CELL+x} ]; then echo "r'${VIASH_PAR_DBEC_MOLS_PER_CELL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'rsec_mols_per_cell_unfiltered': $( if [ ! -z ${VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED+x} ]; then echo "r'${VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bam': $( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "r'${VIASH_PAR_BAM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bam_index': $( if [ ! -z ${VIASH_PAR_BAM_INDEX+x} ]; then echo "r'${VIASH_PAR_BAM_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bioproduct_stats': $( if [ ! -z ${VIASH_PAR_BIOPRODUCT_STATS+x} ]; then echo "r'${VIASH_PAR_BIOPRODUCT_STATS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'dimred_tsne': $( if [ ! -z ${VIASH_PAR_DIMRED_TSNE+x} ]; then echo "r'${VIASH_PAR_DIMRED_TSNE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'dimred_umap': $( if [ ! -z ${VIASH_PAR_DIMRED_UMAP+x} ]; then echo "r'${VIASH_PAR_DIMRED_UMAP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'immune_cell_classification': $( if [ ! -z ${VIASH_PAR_IMMUNE_CELL_CLASSIFICATION+x} ]; then echo "r'${VIASH_PAR_IMMUNE_CELL_CLASSIFICATION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sample_tag_metrics': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_METRICS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_METRICS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sample_tag_calls': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_CALLS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_CALLS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sample_tag_counts': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_COUNTS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_COUNTS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sample_tag_counts_unassigned': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_metrics': $( if [ ! -z ${VIASH_PAR_VDJ_METRICS+x} ]; then echo "r'${VIASH_PAR_VDJ_METRICS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_per_cell': $( if [ ! -z ${VIASH_PAR_VDJ_PER_CELL+x} ]; then echo "r'${VIASH_PAR_VDJ_PER_CELL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_per_cell_uncorrected': $( if [ ! -z ${VIASH_PAR_VDJ_PER_CELL_UNCORRECTED+x} ]; then echo "r'${VIASH_PAR_VDJ_PER_CELL_UNCORRECTED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_dominant_contigs': $( if [ ! -z ${VIASH_PAR_VDJ_DOMINANT_CONTIGS+x} ]; then echo "r'${VIASH_PAR_VDJ_DOMINANT_CONTIGS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_unfiltered_contigs': $( if [ ! -z ${VIASH_PAR_VDJ_UNFILTERED_CONTIGS+x} ]; then echo "r'${VIASH_PAR_VDJ_UNFILTERED_CONTIGS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_metrics': $( if [ ! -z ${VIASH_PAR_ATAC_METRICS+x} ]; then echo "r'${VIASH_PAR_ATAC_METRICS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_metrics_json': $( if [ ! -z ${VIASH_PAR_ATAC_METRICS_JSON+x} ]; then echo "r'${VIASH_PAR_ATAC_METRICS_JSON//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_fragments': $( if [ ! -z ${VIASH_PAR_ATAC_FRAGMENTS+x} ]; then echo "r'${VIASH_PAR_ATAC_FRAGMENTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_fragments_index': $( if [ ! -z ${VIASH_PAR_ATAC_FRAGMENTS_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_FRAGMENTS_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_transposase_sites': $( if [ ! -z ${VIASH_PAR_ATAC_TRANSPOSASE_SITES+x} ]; then echo "r'${VIASH_PAR_ATAC_TRANSPOSASE_SITES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_transposase_sites_index': $( if [ ! -z ${VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_peaks': $( if [ ! -z ${VIASH_PAR_ATAC_PEAKS+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAKS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_peaks_index': $( if [ ! -z ${VIASH_PAR_ATAC_PEAKS_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAKS_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_peak_annotation': $( if [ ! -z ${VIASH_PAR_ATAC_PEAK_ANNOTATION+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAK_ANNOTATION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_cell_by_peak': $( if [ ! -z ${VIASH_PAR_ATAC_CELL_BY_PEAK+x} ]; then echo "r'${VIASH_PAR_ATAC_CELL_BY_PEAK//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_cell_by_peak_unfiltered': $( if [ ! -z ${VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED+x} ]; then echo "r'${VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_bam': $( if [ ! -z ${VIASH_PAR_ATAC_BAM+x} ]; then echo "r'${VIASH_PAR_ATAC_BAM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'atac_bam_index': $( if [ ! -z ${VIASH_PAR_ATAC_BAM_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_BAM_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'protein_aggregates_experimental': $( if [ ! -z ${VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL+x} ]; then echo "r'${VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cell_calling_data': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_DATA+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_DATA//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cell_calling_bioproduct_algorithm': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cell_calling_atac_algorithm': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'exact_cell_count': $( if [ ! -z ${VIASH_PAR_EXACT_CELL_COUNT+x} ]; then echo "int(r'${VIASH_PAR_EXACT_CELL_COUNT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'expected_cell_count': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELL_COUNT+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELL_COUNT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'exclude_intronic_reads': $( if [ ! -z ${VIASH_PAR_EXCLUDE_INTRONIC_READS+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_INTRONIC_READS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'sample_tags_version': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAGS_VERSION+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAGS_VERSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'tag_names': $( if [ ! -z ${VIASH_PAR_TAG_NAMES+x} ]; then echo "r'${VIASH_PAR_TAG_NAMES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'vdj_version': $( if [ ! -z ${VIASH_PAR_VDJ_VERSION+x} ]; then echo "r'${VIASH_PAR_VDJ_VERSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'predefined_atac_peaks': $( if [ ! -z ${VIASH_PAR_PREDEFINED_ATAC_PEAKS+x} ]; then echo "r'${VIASH_PAR_PREDEFINED_ATAC_PEAKS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'run_name': $( if [ ! -z ${VIASH_PAR_RUN_NAME+x} ]; then echo "r'${VIASH_PAR_RUN_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'generate_bam': $( if [ ! -z ${VIASH_PAR_GENERATE_BAM+x} ]; then echo "r'${VIASH_PAR_GENERATE_BAM//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'long_reads': $( if [ ! -z ${VIASH_PAR_LONG_READS+x} ]; then echo "r'${VIASH_PAR_LONG_READS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'custom_star_params': $( if [ ! -z ${VIASH_PAR_CUSTOM_STAR_PARAMS+x} ]; then echo "r'${VIASH_PAR_CUSTOM_STAR_PARAMS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'custom_bwa_mem2_params': $( if [ ! -z ${VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS+x} ]; then echo "r'${VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'parallel': $( if [ ! -z ${VIASH_PAR_PARALLEL+x} ]; then echo "r'${VIASH_PAR_PARALLEL//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'timestamps': $( if [ ! -z ${VIASH_PAR_TIMESTAMPS+x} ]; then echo "r'${VIASH_PAR_TIMESTAMPS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'abseq_umi': $( if [ ! -z ${VIASH_PAR_ABSEQ_UMI+x} ]; then echo "int(r'${VIASH_PAR_ABSEQ_UMI//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'target_analysis': $( if [ ! -z ${VIASH_PAR_TARGET_ANALYSIS+x} ]; then echo "r'${VIASH_PAR_TARGET_ANALYSIS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'vdj_jgene_evalue': $( if [ ! -z ${VIASH_PAR_VDJ_JGENE_EVALUE+x} ]; then echo "float(r'${VIASH_PAR_VDJ_JGENE_EVALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'vdj_vgene_evalue': $( if [ ! -z ${VIASH_PAR_VDJ_VGENE_EVALUE+x} ]; then echo "float(r'${VIASH_PAR_VDJ_VGENE_EVALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'write_filtered_reads': $( if [ ! -z ${VIASH_PAR_WRITE_FILTERED_READS+x} ]; then echo "r'${VIASH_PAR_WRITE_FILTERED_READS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\\n?)[ \\t]*\\|", "\\\\1", text) + + +def process_params(par: dict[str, Any], config, temp_dir: str) -> str: + # check input parameters + assert par["reads"] or par["reads_atac"], ( + "Pass at least one set of inputs to --reads or --reads_atac." + ) + + # output to temp dir if output_dir was not passed + if not par["output_dir"]: + par["output_dir"] = os.path.join(temp_dir, "output") + + # checking sample prefix + if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): + print( + "--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", + flush=True, + ) + par["run_name"] = re.sub("[^A-Za-z0-9\\\\-]", "-", par["run_name"]) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], config) -> str: + content_list = [ + strip_margin("""\\ + |#!/usr/bin/env cwl-runner + | + |cwl:tool: rhapsody + |""") + ] + + for argument in config["arguments"]: + arg_info = argument.get("info") or {} + config_key = arg_info.get("config_key") + if par[argument["clean_name"]] and config_key: + if argument["type"] == "file": + str = strip_margin(f"""\\ + |{config_key}: + |""") + if isinstance(par[argument["clean_name"]], list): + for file in par[argument["clean_name"]]: + str += strip_margin(f"""\\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\\ + | class: File + | location: "{par[argument["clean_name"]]}" + |""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\\ + |{config_key}: {par[argument["clean_name"]]} + |""") + ) + + ## Write config to file + return "".join(content_list) + + +def generate_config_file( + par: dict[str, Any], config: dict[str, Any], temp_dir: str +) -> str: + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, config) + with open(config_file, "w") as f: + f.write(config_content) + return config_file + + +def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: + # create cwl file (if need be) + orig_cwl_file = os.path.join( + meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl" + ) + + # Inject computational requirements into pipeline + if meta["memory_mb"] or meta["cpus"]: + cwl_file = os.path.join(dir, "pipeline.cwl") + + # Read in the file + with open(orig_cwl_file, "r") as file: + cwl_data = file.read() + + # Inject computational requirements into pipeline + if meta["memory_mb"]: + memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS + cwl_data = re.sub( + '"ramMin": [^\\n]*[^,](,?)\\n', f'"ramMin": {memory}\\\\1\\n', cwl_data + ) + if meta["cpus"]: + cwl_data = re.sub( + '"coresMin": [^\\n]*[^,](,?)\\n', + f'"coresMin": {meta["cpus"]}\\\\1\\n', + cwl_data, + ) + + # Write the file out again + with open(cwl_file, "w") as file: + file.write(cwl_data) + else: + cwl_file = orig_cwl_file + + return cwl_file + + +def copy_outputs(par: dict[str, Any], config: dict[str, Any]): + for arg in config["arguments"]: + par_value = par[arg["clean_name"]] + if par_value and arg["type"] == "file" and arg["direction"] == "output": + # example template: '[sample_name]_(assay)_cell_type_experimental.csv' + template = (arg.get("info") or {}).get("template") + if template: + template_glob = ( + template.replace("sample", par["run_name"]) + .replace("assay", "*") + .replace("number", "*") + ) + files = glob.glob(os.path.join(par["output_dir"], template_glob)) + if len(files) == 0 and arg["required"]: + raise ValueError( + f"Expected output file '{template_glob}' not found." + ) + elif len(files) > 1 and not arg["multiple"]: + raise ValueError( + f"Expected single output file '{template_glob}', but found multiple." + ) + + if not arg["multiple"]: + try: + shutil.copy(files[0], par_value) + print(f"Copied {files[0]} to {par_value}") + except IndexError: + print(f"Unable to copy {template_glob} to {par_value}") + else: + # replace '*' in par_value with index + for i, file in enumerate(files): + shutil.copy(file, par_value.replace("*", str(i))) + + +def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config, temp_dir) + + ## Process parameters + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + par["output_dir"], + ] + + if par["parallel"]: + cmd.append("--parallel") + + if par["timestamps"]: + cmd.append("--timestamps") + + # Create cwl file (if need be) + cwl_file = generate_cwl_file(meta, temp_dir) + cmd.append(cwl_file) + + # Create params file + config_file = generate_config_file(par, config, temp_dir) + cmd.append(config_file) + + # keep environment variables but set TMPDIR to temp_dir + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + # Create output dir if not exists + if not os.path.exists(par["output_dir"]): + os.makedirs(par["output_dir"]) + + # Run command + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + # Copy outputs + copy_outputs(par, config) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"] + ) as temp_dir: + main(par, meta, temp_dir) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_READS" ]; then + unset VIASH_TEST_READS + IFS=';' + for var in $VIASH_PAR_READS; do + unset IFS + if [ -z "$VIASH_TEST_READS" ]; then + VIASH_TEST_READS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_READS="$VIASH_TEST_READS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_READS="$VIASH_TEST_READS" + fi + if [ ! -z "$VIASH_PAR_READS_ATAC" ]; then + unset VIASH_TEST_READS_ATAC + IFS=';' + for var in $VIASH_PAR_READS_ATAC; do + unset IFS + if [ -z "$VIASH_TEST_READS_ATAC" ]; then + VIASH_TEST_READS_ATAC="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_READS_ATAC="$VIASH_TEST_READS_ATAC;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_READS_ATAC="$VIASH_TEST_READS_ATAC" + fi + if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE_ARCHIVE") + fi + if [ ! -z "$VIASH_PAR_TARGETED_REFERENCE" ]; then + unset VIASH_TEST_TARGETED_REFERENCE + IFS=';' + for var in $VIASH_PAR_TARGETED_REFERENCE; do + unset IFS + if [ -z "$VIASH_TEST_TARGETED_REFERENCE" ]; then + VIASH_TEST_TARGETED_REFERENCE="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_TARGETED_REFERENCE="$VIASH_TEST_TARGETED_REFERENCE;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_TARGETED_REFERENCE="$VIASH_TEST_TARGETED_REFERENCE" + fi + if [ ! -z "$VIASH_PAR_ABSEQ_REFERENCE" ]; then + unset VIASH_TEST_ABSEQ_REFERENCE + IFS=';' + for var in $VIASH_PAR_ABSEQ_REFERENCE; do + unset IFS + if [ -z "$VIASH_TEST_ABSEQ_REFERENCE" ]; then + VIASH_TEST_ABSEQ_REFERENCE="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_ABSEQ_REFERENCE="$VIASH_TEST_ABSEQ_REFERENCE;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_ABSEQ_REFERENCE="$VIASH_TEST_ABSEQ_REFERENCE" + fi + if [ ! -z "$VIASH_PAR_SUPPLEMENTAL_REFERENCE" ]; then + unset VIASH_TEST_SUPPLEMENTAL_REFERENCE + IFS=';' + for var in $VIASH_PAR_SUPPLEMENTAL_REFERENCE; do + unset IFS + if [ -z "$VIASH_TEST_SUPPLEMENTAL_REFERENCE" ]; then + VIASH_TEST_SUPPLEMENTAL_REFERENCE="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_SUPPLEMENTAL_REFERENCE="$VIASH_TEST_SUPPLEMENTAL_REFERENCE;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_SUPPLEMENTAL_REFERENCE="$VIASH_TEST_SUPPLEMENTAL_REFERENCE" + fi + if [ ! -z "$VIASH_PAR_OUTPUT_DIR" ]; then + VIASH_PAR_OUTPUT_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_DIR") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_SEURAT" ]; then + VIASH_PAR_OUTPUT_SEURAT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_SEURAT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_MUDATA" ]; then + VIASH_PAR_OUTPUT_MUDATA=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_MUDATA") + fi + if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ]; then + VIASH_PAR_METRICS_SUMMARY=$(ViashDockerStripAutomount "$VIASH_PAR_METRICS_SUMMARY") + fi + if [ ! -z "$VIASH_PAR_PIPELINE_REPORT" ]; then + VIASH_PAR_PIPELINE_REPORT=$(ViashDockerStripAutomount "$VIASH_PAR_PIPELINE_REPORT") + fi + if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL" ]; then + VIASH_PAR_RSEC_MOLS_PER_CELL=$(ViashDockerStripAutomount "$VIASH_PAR_RSEC_MOLS_PER_CELL") + fi + if [ ! -z "$VIASH_PAR_DBEC_MOLS_PER_CELL" ]; then + VIASH_PAR_DBEC_MOLS_PER_CELL=$(ViashDockerStripAutomount "$VIASH_PAR_DBEC_MOLS_PER_CELL") + fi + if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ]; then + VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED=$(ViashDockerStripAutomount "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED") + fi + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_BAM_INDEX" ]; then + VIASH_PAR_BAM_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_BAM_INDEX") + fi + if [ ! -z "$VIASH_PAR_BIOPRODUCT_STATS" ]; then + VIASH_PAR_BIOPRODUCT_STATS=$(ViashDockerStripAutomount "$VIASH_PAR_BIOPRODUCT_STATS") + fi + if [ ! -z "$VIASH_PAR_DIMRED_TSNE" ]; then + VIASH_PAR_DIMRED_TSNE=$(ViashDockerStripAutomount "$VIASH_PAR_DIMRED_TSNE") + fi + if [ ! -z "$VIASH_PAR_DIMRED_UMAP" ]; then + VIASH_PAR_DIMRED_UMAP=$(ViashDockerStripAutomount "$VIASH_PAR_DIMRED_UMAP") + fi + if [ ! -z "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ]; then + VIASH_PAR_IMMUNE_CELL_CLASSIFICATION=$(ViashDockerStripAutomount "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_TAG_METRICS" ]; then + VIASH_PAR_SAMPLE_TAG_METRICS=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_TAG_METRICS") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_TAG_CALLS" ]; then + VIASH_PAR_SAMPLE_TAG_CALLS=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_TAG_CALLS") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS" ]; then + VIASH_PAR_SAMPLE_TAG_COUNTS=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_TAG_COUNTS") + fi + if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ]; then + VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED=$(ViashDockerStripAutomount "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED") + fi + if [ ! -z "$VIASH_PAR_VDJ_METRICS" ]; then + VIASH_PAR_VDJ_METRICS=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_METRICS") + fi + if [ ! -z "$VIASH_PAR_VDJ_PER_CELL" ]; then + VIASH_PAR_VDJ_PER_CELL=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_PER_CELL") + fi + if [ ! -z "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ]; then + VIASH_PAR_VDJ_PER_CELL_UNCORRECTED=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED") + fi + if [ ! -z "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ]; then + VIASH_PAR_VDJ_DOMINANT_CONTIGS=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_DOMINANT_CONTIGS") + fi + if [ ! -z "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ]; then + VIASH_PAR_VDJ_UNFILTERED_CONTIGS=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS") + fi + if [ ! -z "$VIASH_PAR_ATAC_METRICS" ]; then + VIASH_PAR_ATAC_METRICS=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_METRICS") + fi + if [ ! -z "$VIASH_PAR_ATAC_METRICS_JSON" ]; then + VIASH_PAR_ATAC_METRICS_JSON=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_METRICS_JSON") + fi + if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS" ]; then + VIASH_PAR_ATAC_FRAGMENTS=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_FRAGMENTS") + fi + if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ]; then + VIASH_PAR_ATAC_FRAGMENTS_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_FRAGMENTS_INDEX") + fi + if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ]; then + VIASH_PAR_ATAC_TRANSPOSASE_SITES=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_TRANSPOSASE_SITES") + fi + if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ]; then + VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX") + fi + if [ ! -z "$VIASH_PAR_ATAC_PEAKS" ]; then + VIASH_PAR_ATAC_PEAKS=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_PEAKS") + fi + if [ ! -z "$VIASH_PAR_ATAC_PEAKS_INDEX" ]; then + VIASH_PAR_ATAC_PEAKS_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_PEAKS_INDEX") + fi + if [ ! -z "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ]; then + VIASH_PAR_ATAC_PEAK_ANNOTATION=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_PEAK_ANNOTATION") + fi + if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK" ]; then + VIASH_PAR_ATAC_CELL_BY_PEAK=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_CELL_BY_PEAK") + fi + if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ]; then + VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED") + fi + if [ ! -z "$VIASH_PAR_ATAC_BAM" ]; then + VIASH_PAR_ATAC_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_BAM") + fi + if [ ! -z "$VIASH_PAR_ATAC_BAM_INDEX" ]; then + VIASH_PAR_ATAC_BAM_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_ATAC_BAM_INDEX") + fi + if [ ! -z "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ]; then + VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL=$(ViashDockerStripAutomount "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL") + fi + if [ ! -z "$VIASH_PAR_PREDEFINED_ATAC_PEAKS" ]; then + VIASH_PAR_PREDEFINED_ATAC_PEAKS=$(ViashDockerStripAutomount "$VIASH_PAR_PREDEFINED_ATAC_PEAKS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_DIR" ] && [ ! -e "$VIASH_PAR_OUTPUT_DIR" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SEURAT" ] && [ ! -e "$VIASH_PAR_OUTPUT_SEURAT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_SEURAT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_MUDATA" ] && [ ! -e "$VIASH_PAR_OUTPUT_MUDATA" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_MUDATA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ] && [ ! -e "$VIASH_PAR_METRICS_SUMMARY" ]; then + ViashError "Output file '$VIASH_PAR_METRICS_SUMMARY' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_PIPELINE_REPORT" ] && [ ! -e "$VIASH_PAR_PIPELINE_REPORT" ]; then + ViashError "Output file '$VIASH_PAR_PIPELINE_REPORT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL" ] && [ ! -e "$VIASH_PAR_RSEC_MOLS_PER_CELL" ]; then + ViashError "Output file '$VIASH_PAR_RSEC_MOLS_PER_CELL' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_DBEC_MOLS_PER_CELL" ] && [ ! -e "$VIASH_PAR_DBEC_MOLS_PER_CELL" ]; then + ViashError "Output file '$VIASH_PAR_DBEC_MOLS_PER_CELL' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ] && [ ! -e "$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED" ]; then + ViashError "Output file '$VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Output file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM_INDEX" ] && [ ! -e "$VIASH_PAR_BAM_INDEX" ]; then + ViashError "Output file '$VIASH_PAR_BAM_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BIOPRODUCT_STATS" ] && [ ! -e "$VIASH_PAR_BIOPRODUCT_STATS" ]; then + ViashError "Output file '$VIASH_PAR_BIOPRODUCT_STATS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_DIMRED_TSNE" ] && [ ! -e "$VIASH_PAR_DIMRED_TSNE" ]; then + ViashError "Output file '$VIASH_PAR_DIMRED_TSNE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_DIMRED_UMAP" ] && [ ! -e "$VIASH_PAR_DIMRED_UMAP" ]; then + ViashError "Output file '$VIASH_PAR_DIMRED_UMAP' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ] && [ ! -e "$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION" ]; then + ViashError "Output file '$VIASH_PAR_IMMUNE_CELL_CLASSIFICATION' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_METRICS" ] && [ ! -e "$VIASH_PAR_SAMPLE_TAG_METRICS" ]; then + ViashError "Output file '$VIASH_PAR_SAMPLE_TAG_METRICS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_CALLS" ] && [ ! -e "$VIASH_PAR_SAMPLE_TAG_CALLS" ]; then + ViashError "Output file '$VIASH_PAR_SAMPLE_TAG_CALLS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS" ] && ! compgen -G "$VIASH_PAR_SAMPLE_TAG_COUNTS" > /dev/null; then + ViashError "Output file '$VIASH_PAR_SAMPLE_TAG_COUNTS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ] && [ ! -e "$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED" ]; then + ViashError "Output file '$VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_METRICS" ] && [ ! -e "$VIASH_PAR_VDJ_METRICS" ]; then + ViashError "Output file '$VIASH_PAR_VDJ_METRICS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL" ] && [ ! -e "$VIASH_PAR_VDJ_PER_CELL" ]; then + ViashError "Output file '$VIASH_PAR_VDJ_PER_CELL' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ] && [ ! -e "$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED" ]; then + ViashError "Output file '$VIASH_PAR_VDJ_PER_CELL_UNCORRECTED' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ] && [ ! -e "$VIASH_PAR_VDJ_DOMINANT_CONTIGS" ]; then + ViashError "Output file '$VIASH_PAR_VDJ_DOMINANT_CONTIGS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ] && [ ! -e "$VIASH_PAR_VDJ_UNFILTERED_CONTIGS" ]; then + ViashError "Output file '$VIASH_PAR_VDJ_UNFILTERED_CONTIGS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS" ] && [ ! -e "$VIASH_PAR_ATAC_METRICS" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_METRICS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_METRICS_JSON" ] && [ ! -e "$VIASH_PAR_ATAC_METRICS_JSON" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_METRICS_JSON' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS" ] && [ ! -e "$VIASH_PAR_ATAC_FRAGMENTS" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_FRAGMENTS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ] && [ ! -e "$VIASH_PAR_ATAC_FRAGMENTS_INDEX" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_FRAGMENTS_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ] && [ ! -e "$VIASH_PAR_ATAC_TRANSPOSASE_SITES" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_TRANSPOSASE_SITES' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ] && [ ! -e "$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS" ] && [ ! -e "$VIASH_PAR_ATAC_PEAKS" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_PEAKS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAKS_INDEX" ] && [ ! -e "$VIASH_PAR_ATAC_PEAKS_INDEX" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_PEAKS_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ] && [ ! -e "$VIASH_PAR_ATAC_PEAK_ANNOTATION" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_PEAK_ANNOTATION' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK" ] && [ ! -e "$VIASH_PAR_ATAC_CELL_BY_PEAK" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_CELL_BY_PEAK' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ] && [ ! -e "$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM" ] && [ ! -e "$VIASH_PAR_ATAC_BAM" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ATAC_BAM_INDEX" ] && [ ! -e "$VIASH_PAR_ATAC_BAM_INDEX" ]; then + ViashError "Output file '$VIASH_PAR_ATAC_BAM_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ] && [ ! -e "$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL" ]; then + ViashError "Output file '$VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/bd_rhapsody/nextflow_labels.config b/target/executable/mapping/bd_rhapsody/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/bd_rhapsody/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl b/target/executable/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl new file mode 100755 index 00000000..30009f05 --- /dev/null +++ b/target/executable/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl @@ -0,0 +1,6146 @@ +#!/usr/bin/env cwl-runner +{ + "$graph": [ + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "baseCommand": "ATAC_Cell_by_Peak.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Fragments" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#ATAC_Cell_by_Peak.cwl/GTF" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Peaks" + }, + { + "type": "File", + "loadContents": true, + "id": "#ATAC_Cell_by_Peak.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--transposase-sites" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Transposase_Sites" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "id": "#ATAC_Cell_by_Peak.cwl", + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Cell_Calling_Data.csv" + }, + "id": "#ATAC_Cell_by_Peak.cwl/ATAC_Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Initial_Seurat.rds" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Initial_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Total_Fragment_Metrics.json" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#ATAC_Cell_by_Peak.cwl/output" + } + ] + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + } + ], + "baseCommand": "ATAC_Compile_Results.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--biop-putative-data-table" + }, + "id": "#ATAC_Compile_Results.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order-subsampled" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Compile_Results.cwl/Fragments" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--initial-seurat" + }, + "id": "#ATAC_Compile_Results.cwl/Initial_Seurat_RDS" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-metrics-tar" + }, + "id": "#ATAC_Compile_Results.cwl/Input_Metrics_tar" + }, + { + "type": "string", + "inputBinding": { + "prefix": "--genome-size" + }, + "id": "#ATAC_Compile_Results.cwl/Reference_Genome_Size" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#ATAC_Compile_Results.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--total-fragment-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--unified-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Unified_Metrics" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*/*_coordinates.csv" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Seurat.rds" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#ATAC_Compile_Results.cwl/Metrics_tar" + }, + { + "type": "File", + "outputBinding": { + "glob": "mist_atac_compile_results.log" + }, + "id": "#ATAC_Compile_Results.cwl/output" + } + ], + "id": "#ATAC_Compile_Results.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if (inputs.Assay == 'ATAC') { return 2; } else { return 4; } }", + "ramMin": "${ if (inputs.Assay == 'ATAC') { return 4000; } else { return 32000; } }" + } + ], + "baseCommand": [ + "mist_add_to_bam.py" + ], + "inputs": [ + { + "type": "string", + "inputBinding": { + "prefix": "--assay" + }, + "id": "#AddtoBam.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-bam" + }, + "id": "#AddtoBam.cwl/Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--cell-order-json" + }, + "id": "#AddtoBam.cwl/Cell_Order" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--corrected-mols-list", + "itemSeparator": "," + }, + "id": "#AddtoBam.cwl/Corrected_Mols" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#AddtoBam.cwl/Generate_Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata-json" + }, + "id": "#AddtoBam.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#AddtoBam.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--target-gene-mapping" + }, + "id": "#AddtoBam.cwl/Target_Gene_Mapping" + } + ], + "arguments": [ + { + "prefix": "--bamIO-threads", + "valueFrom": "$(runtime.cores)" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*.bam" + }, + "id": "#AddtoBam.cwl/Annotated_Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AddtoBam.cwl/output" + } + ], + "id": "#AddtoBam.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "AlignmentAnalysis stage of the Rain pipeline annotates aligned reads and collects a myriad of metrics on the aligned reads. Additional annotation is performed to the reads\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 24000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "AlignmentAnalysisAndCountCB.sh" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--assay" + }, + "type": [ + "null", + "string" + ], + "id": "#AlignmentAnalysis.cwl/Assay" + }, + { + "inputBinding": { + "prefix": "--exclude-intronic-reads" + }, + "type": [ + "null", + "boolean" + ], + "id": "#AlignmentAnalysis.cwl/Exclude_Intronic_Reads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Extra_Seqs" + }, + { + "inputBinding": { + "prefix": "--gtf" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/GTF" + }, + { + "inputBinding": { + "prefix": "--threads" + }, + "type": [ + "null", + "int" + ], + "id": "#AlignmentAnalysis.cwl/Maximum_Threads" + }, + { + "inputBinding": { + "prefix": "--r2-bam", + "itemSeparator": "," + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#AlignmentAnalysis.cwl/R2_BAM" + }, + { + "inputBinding": { + "prefix": "--quality-metrics" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/ReadQualityMetrics" + }, + { + "inputBinding": { + "prefix": "--run-metadata" + }, + "type": "File", + "id": "#AlignmentAnalysis.cwl/Run_Metadata" + }, + { + "inputBinding": { + "prefix": "--transcript-length" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Transcript_Length" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*.annotated.*.bam" + }, + "id": "#AlignmentAnalysis.cwl/Annotated_Bam_Files" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Logs" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_SeqMetrics.csv" + }, + "id": "#AlignmentAnalysis.cwl/Seq_Metrics" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*Sorted_Valid_Reads.csv.*" + }, + "id": "#AlignmentAnalysis.cwl/Sorted_Valid_Reads_CSV" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 30000; } return parseInt(JSON.parse(self[0].contents).num_bioproducts); }" + }, + "id": "#AlignmentAnalysis.cwl/num_bioproducts" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 10000; } return parseInt(JSON.parse(self[0].contents).num_cell_estimate); }" + }, + "id": "#AlignmentAnalysis.cwl/num_cell_estimate" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).BCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_ig_reads" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).TCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_tcr_reads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_BCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validIgReads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_TCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validTcrReads" + } + ], + "id": "#AlignmentAnalysis.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_annotate_molecules.py" + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--umi-option" + }, + "id": "#AnnotateMolecules.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "string" + ], + "id": "#AnnotateMolecules.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#AnnotateMolecules.cwl/Run_Metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--valid-annot" + }, + "id": "#AnnotateMolecules.cwl/Valids" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*_bioproduct_stats*.json" + }, + "id": "#AnnotateMolecules.cwl/Bioproduct_Stats_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_CellBiopSummary.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Cell_Biop_Summary_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_Annotation_Molecule_corrected.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Corrected_Mols_List" + }, + { + "type": "int", + "outputBinding": { + "glob": "stats.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_molecules)\n" + }, + "id": "#AnnotateMolecules.cwl/Total_Molecules" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AnnotateMolecules.cwl/output" + } + ], + "id": "#AnnotateMolecules.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "boolean", + "id": "#Assay_Settings.cwl/AbSeq_Reference_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_RNA_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reference_Archive_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Targeted_Reference_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_ATAC" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_RNA" + } + ], + "expression": "${\n var assay_rna = null;\n var assay_atac = null;\n\n if (!inputs.Reads_ATAC_Present && !inputs.Reads_RNA_Present)\n {\n throw new Error('Invalid pipeline inputs: Please provide Reads for at least 1 of RNA or ATAC analysis.')\n }\n if (inputs.Targeted_Reference_Present && inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Do not provide both Targeted Reference and Reference Archive.')\n }\n if (!inputs.Targeted_Reference_Present && !inputs.AbSeq_Reference_Present && !inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Please provide either a Reference Archive or a Targeted Reference or an AbSeq Reference.')\n }\n\n if ( inputs.Reads_ATAC_Present )\n {\n assay_atac = \"ATAC\"\n }\n\n if (inputs.Reads_RNA_Present && inputs.Reference_Archive_Present) {\n assay_rna = \"WTA\"\n }\n else if (inputs.Reads_RNA_Present && (inputs.Targeted_Reference_Present || inputs.AbSeq_Reference_Present)) {\n assay_rna = \"Targeted\"\n }\n\n return ({Assay_RNA: assay_rna, Assay_ATAC: assay_atac})\n}", + "id": "#Assay_Settings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/_Generate_Bam" + } + ], + "outputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/Generate_Bam" + } + ], + "expression": "${\n // the create bam flag defaults to false\n var generateBam = false;\n // the user can set this flag to true, to enable creation of the bam file.\n if (inputs._Generate_Bam != null) {\n generateBam = inputs._Generate_Bam;\n }\n return ({\n Generate_Bam: generateBam,\n });\n}", + "id": "#BamSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#BundleLogs.cwl/log_files" + } + ], + "outputs": [ + { + "type": "Directory", + "id": "#BundleLogs.cwl/logs_dir" + } + ], + "expression": "${\n /* shamelly cribbed from https://gist.github.com/jcxplorer/823878 */\n function uuid() {\n var uuid = \"\", i, random;\n for (i = 0; i < 32; i++) {\n random = Math.random() * 16 | 0;\n if (i == 8 || i == 12 || i == 16 || i == 20) {\n uuid += \"-\";\n }\n uuid += (i == 12 ? 4 : (i == 16 ? (random & 3 | 8) : random)).toString(16);\n }\n return uuid;\n }\n var listing = [];\n for (var i = 0; i < inputs.log_files.length; i++) {\n var log_file = inputs.log_files[i];\n /*\n Checking here for null in case a Node was skipped because of conditional execution.\n For e.g. Generate_Bam is used to skip the AddToBam, MergeBam and IndexBam nodes\n */\n if (log_file != null) {\n log_file.basename = uuid() + \"-\" + log_file.basename;\n listing.push(log_file);\n }\n }\n return ({\n logs_dir: {\n class: \"Directory\",\n basename: \"Logs\",\n listing: listing\n }\n });\n}", + "id": "#BundleLogs.cwl" + }, + { + "requirements": [ + { + "listing": [ + "$(inputs.Reference_Archive)" + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_check_references.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--abseq-reference" + }, + "id": "#CheckReference.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "inputBinding": { + "itemSeparator": ",", + "prefix": "--assay" + }, + "id": "#CheckReference.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--putative-cell-call" + }, + "id": "#CheckReference.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "id": "#CheckReference.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "inputBinding": { + "prefix": "--reference-archive" + }, + "id": "#CheckReference.cwl/Reference_Archive" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--sample-tags-version" + }, + "id": "#CheckReference.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--supplemental-reference" + }, + "id": "#CheckReference.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--targeted-reference" + }, + "id": "#CheckReference.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#CheckReference.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks_Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "combined_extra_seq.fasta" + }, + "id": "#CheckReference.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "full-gene-list.json" + }, + "id": "#CheckReference.cwl/Full_Genes" + }, + { + "type": "File", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files/*.gtf" + }, + "id": "#CheckReference.cwl/GTF" + }, + { + "type": "Directory", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files" + }, + "id": "#CheckReference.cwl/Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "target-gene.json" + }, + "id": "#CheckReference.cwl/Target_Gene_Mapping" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "transcript_length.json" + }, + "id": "#CheckReference.cwl/Transcript_Length" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#CheckReference.cwl/output" + } + ], + "id": "#CheckReference.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_generate_H5MU.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-cell-by-peak" + }, + "id": "#GenerateH5MU.cwl/Atac_Datatables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-metrics" + }, + "id": "#GenerateH5MU.cwl/Atac_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateH5MU.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateH5MU.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateH5MU.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateH5MU.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--metrics-summary" + }, + "id": "#GenerateH5MU.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peak_annotation" + }, + "id": "#GenerateH5MU.cwl/Peak_Annotation" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateH5MU.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateH5MU.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GenerateH5MU.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-files" + }, + "id": "#GenerateH5MU.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateH5MU.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateH5MU.cwl/VDJ_Per_Cell" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.h5mu" + }, + "id": "#GenerateH5MU.cwl/H5MU" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateH5MU.cwl/output" + } + ], + "id": "#GenerateH5MU.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [], + "baseCommand": "GenerateSeurat.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-seurat-rds" + }, + "id": "#GenerateSeurat.cwl/ATAC_Seurat" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateSeurat.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateSeurat.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateSeurat.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateSeurat.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateSeurat.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateSeurat.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "loadContents": true, + "id": "#GenerateSeurat.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-csvs" + }, + "id": "#GenerateSeurat.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateSeurat.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateSeurat.cwl/VDJ_Per_Cell" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.rds" + }, + "id": "#GenerateSeurat.cwl/SeuratRDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateSeurat.cwl/output" + } + ], + "id": "#GenerateSeurat.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_get_datatables.py" + ], + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-cell-calling-data-file" + }, + "id": "#GetDataTable.cwl/ATAC_Cell_Calling_Input" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--bioproduct-stats-list" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats_List" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--cell-biop-summary-list" + }, + "id": "#GetDataTable.cwl/Cell_Biop_Summary_List" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--full-gene-list" + }, + "id": "#GetDataTable.cwl/Full_Genes" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GetDataTable.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--seq-metrics" + }, + "id": "#GetDataTable.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "int" + } + ], + "id": "#GetDataTable.cwl/Total_Molecules" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_bioproducts" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_cell_estimate" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_RSEC_MolsPerCell_MEX.zip" + }, + "id": "#GetDataTable.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Bioproduct_Stats.csv" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats" + }, + { + "type": "File", + "outputBinding": { + "glob": "cell_order.json" + }, + "id": "#GetDataTable.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cell_order_subsampled.json" + }, + "id": "#GetDataTable.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#GetDataTable.cwl/Cell_Type_Predictions" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#GetDataTable.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_coordinates.csv" + }, + "id": "#GetDataTable.cwl/Dim_Reduction_Coord" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#GetDataTable.cwl/Metrics_tar" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Protein_Agg/*_Protein_Aggregates_Experimental.csv" + }, + "id": "#GetDataTable.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Cell_Label_Filtering/*_Putative_Cells_Origin.csv" + }, + "id": "#GetDataTable.cwl/Putative_Cells_Origin" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTag/*csv" + }, + "id": "#GetDataTable.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "SampleTag/*_Sample_Tag_Calls.csv" + }, + "id": "#GetDataTable.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTagArchives/*zip" + }, + "id": "#GetDataTable.cwl/SampleTag_perTagZips" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GetDataTable.cwl/output" + } + ], + "id": "#GetDataTable.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "GetMachineResources gets available resources (current only the number of cpus) on the machine that is running the local deployment of the MIST pipeline.\n", + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_machine_resources.py" + ], + "inputs": [], + "outputs": [ + { + "type": "int", + "outputBinding": { + "glob": "resources.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_cpus_avail)" + }, + "id": "#GetMachineResources.cwl/Total_CPUs_Avail" + } + ], + "id": "#GetMachineResources.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/_NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/_Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_VGene_Evalue" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_VGene_Evalue" + } + ], + "expression": "${\n var internalInputs = [\n '_AbSeq_UMI',\n '_MinChunkSize',\n '_NumRecordsPerSplit',\n '_Target_analysis',\n '_Subsample_Sample_Tags',\n '_VDJ_VGene_Evalue',\n '_VDJ_JGene_Evalue',\n ];\n var internalOutputs = {}\n for (var i = 0; i < internalInputs.length; i++) {\n var internalInput = internalInputs[i];\n var internalOutput = internalInput.slice(1); // remove leading underscore\n if (inputs.hasOwnProperty(internalInput)) {\n internalOutputs[internalOutput] = inputs[internalInput]; // if input specified, redirect to output\n } else {\n internalOutputs[internalOutput] = null; // if input not specified, provide a null\n }\n }\n return internalOutputs;\n}", + "id": "#InternalSettings.cwl" + }, + { + "class": "Workflow", + "label": "BD Rhapsody™ Sequence Analysis Pipeline", + "doc": "The BD Rhapsody™ assays are used to create sequencing libraries from single cell transcriptomes.\n\nAfter sequencing, the analysis pipeline takes the FASTQ files and a reference file for gene alignment. The pipeline generates molecular counts per cell, read counts per cell, metrics, and an alignment file.", + "requirements": [ + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "AbSeq Reference", + "id": "#main/AbSeq_Reference" + }, + { + "type": [ + "null", + "int" + ], + "id": "#main/AbSeq_UMI" + }, + { + "label": "Cell Calling ATAC Algorithm", + "doc": "Specify the ATAC algorithm to be used for ATAC putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm", + "symbols": [ + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Basic", + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_ATAC_Algorithm" + }, + { + "label": "Cell Calling Bioproduct Algorithm", + "doc": "Specify the bioproduct algorithm to be used for mRNA/AbSeq putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm", + "symbols": [ + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Basic", + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_Bioproduct_Algorithm" + }, + { + "label": "Cell Calling Data", + "doc": "Specify the data to be used for putative cell calling.\nThe default data for putative cell calling will be determined the following way:\n - If mRNA and ATAC Reads exist, mRNA_and_ATAC is the default.\n - If only ATAC Reads exist, ATAC is the default.\n - Otherwise, mRNA is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Data/Cell_Calling_Data", + "symbols": [ + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA", + "#main/Cell_Calling_Data/Cell_Calling_Data/AbSeq", + "#main/Cell_Calling_Data/Cell_Calling_Data/ATAC", + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA_and_ATAC" + ] + } + ], + "id": "#main/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom STAR Params", + "doc": "Allows you to specify custom STAR aligner mapping parameters. Only the mapping parameters you provide here will be used with STAR, meaning that you must provide the complete list of parameters that you want to take effect. For reference, the parameters used by default in the pipeline are:\n\n 1. Short Reads: --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000\n 2. Long Reads: Same options as short reads + --seedPerReadNmax 10000\n\n\nExample input: --alignIntronMax 500000 --outFilterScoreMinOverLread 0 --limitOutSJcollapsed 2000000\n\nImportant:\n 1. This applies to fastqs provided in the Reads user input\n 2. Please do not specify any non-mapping related params like: --runThreadN, --genomeDir --outSAMtype, etc.\n 3. Please only use params supported by STAR version 2.7.10b\n", + "id": "#main/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom bwa-mem2 Params", + "doc": "Allows you to specify custom bwa-mem2 mapping parameters. Only the mapping parameters you provide here will be used with bwa-mem2, meaning that you must provide the complete list of parameters that you want to take effect. The pipeline uses program default mapping parameters.\n\nExample input: -k 15 -w 200 -r 2\n\nImportant:\n 1. This applies to fastqs provided in the Reads_ATAC user input\n 2. Please do not specify any non-mapping related params like: -C, -t, etc.\n 3. Please only use params supported by bwa-mem2 version 2.2.1\n", + "id": "#main/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "label": "Exact Cell Count", + "doc": "Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count", + "id": "#main/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Exclude Intronic Reads", + "doc": "By default, reads aligned to exons and introns are considered and represented in molecule counts. Including intronic reads may increase sensitivity, resulting in an increase in molecule counts and the number of genes per cell for both cellular and nuclei samples. Intronic reads may indicate unspliced mRNAs and are also useful, for example, in the study of nuclei and RNA velocity. When set to true, intronic reads will be excluded.", + "id": "#main/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Expected Cell Count", + "doc": "Optional. Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected.", + "id": "#main/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Generate Bam Output", + "doc": "Default: false. A Bam read alignment file contains reads from all the input libraries, but creating it can consume a lot of compute and disk resources. By setting this field to true, the Bam file will be created. This option is shared for both Bioproduct and ATAC libraries.\n", + "id": "#main/Generate_Bam" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Long Reads (>=650bp)", + "doc": "By default, we detect if there are any reads longer than 650bp and then flag QualCLAlign to use STARlong instead of STAR. This flag can be explicitly set if it is known in advance that there are reads longer than 650bp.\n", + "id": "#main/Long_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Maximum Number of Threads", + "doc": "The maximum number of threads to use in the pipeline. By default, all available cores are used.", + "id": "#main/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "label": "ATAC Predefined Peak Regions", + "doc": "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. Only applies to ATAC assays.", + "id": "#main/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads", + "doc": "FASTQ files from libraries that may include WTA mRNA, Targeted mRNA, AbSeq, Sample Multiplexing, and related technologies", + "id": "#main/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads-ATAC", + "doc": "FASTQ files from libraries generated using the ATAC assay protocol. Each lane of a library is expected to have 3 FASTQs - R1, R2 and I1/I2, where the index read contains the Cell Barcode and UMI sequence. Only applies to ATAC assays.", + "id": "#main/Reads_ATAC" + }, + { + "type": [ + "null", + "File" + ], + "label": "Reference Files Archive", + "id": "#main/Reference_Archive" + }, + { + "label": "Run Name", + "type": [ + "null", + "string" + ], + "doc": "This is a name for output files, for example Experiment1_Metrics_Summary.csv. Default if left empty is to name run based on a library. Any non-alpha numeric characters will be changed to a hyphen.", + "id": "#main/Run_Name" + }, + { + "label": "Sample Tags Version", + "doc": "The sample multiplexing kit version. This option should only be set for a multiplexed experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Sample_Tags_Version/Sample_Tags_Version", + "symbols": [ + "#main/Sample_Tags_Version/Sample_Tags_Version/human", + "#main/Sample_Tags_Version/Sample_Tags_Version/hs", + "#main/Sample_Tags_Version/Sample_Tags_Version/mouse", + "#main/Sample_Tags_Version/Sample_Tags_Version/mm", + "#main/Sample_Tags_Version/Sample_Tags_Version/flex", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_includes_mrna", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_atac_only", + "#main/Sample_Tags_Version/Sample_Tags_Version/custom" + ] + } + ], + "id": "#main/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Supplemental Reference", + "id": "#main/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "label": "Sample Tag Names", + "doc": "Specify the Sample Tag number followed by - (hyphen) and a sample name to appear in the output files. For example: 4-Ramos. Should be alpha numeric, with + - and _ allowed. Any special characters: &, (), [], {}, <>, ?, | will be corrected to underscores. \n", + "id": "#main/Tag_Names" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Target_analysis" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Targeted Reference", + "id": "#main/Targeted_Reference" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for J gene", + "doc": "The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for V gene", + "doc": "The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_VGene_Evalue" + }, + { + "label": "VDJ Species Version", + "doc": "The VDJ species and chain types. This option should only be set for VDJ experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/VDJ_Version/VDJ_Version", + "symbols": [ + "#main/VDJ_Version/VDJ_Version/human", + "#main/VDJ_Version/VDJ_Version/hs", + "#main/VDJ_Version/VDJ_Version/mouse", + "#main/VDJ_Version/VDJ_Version/mm", + "#main/VDJ_Version/VDJ_Version/humanBCR", + "#main/VDJ_Version/VDJ_Version/humanTCR", + "#main/VDJ_Version/VDJ_Version/mouseBCR", + "#main/VDJ_Version/VDJ_Version/mouseTCR" + ] + } + ], + "id": "#main/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Write_Filtered_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeATAC/ATAC_out", + "id": "#main/ATAC" + }, + { + "label": "BAM files and indices", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Bam" + }, + { + "label": "Bioproduct Statistics", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/Bioproduct_Stats" + }, + { + "label": "Data Tables", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/GetDataTable/Data_Tables", + "id": "#main/Data_Tables" + }, + { + "label": "Dimensionality Reduction Coordinates", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_ATAC" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_RNA/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_RNA" + }, + { + "label": "Scanpy-Muon H5MU File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateH5MU/H5MU", + "id": "#main/H5MU" + }, + { + "label": "Immune Cell Classification (Experimental)", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Immune_Cell_Classification(Experimental)" + }, + { + "label": "Pipeline Logs", + "type": "Directory", + "outputSource": "#main/BundleLogs/logs_dir", + "id": "#main/Logs" + }, + { + "label": "Metrics Summary", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Metrics_Summary", + "id": "#main/Metrics_Summary" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeMultiplex/Multiplex_out", + "id": "#main/Multiplexing" + }, + { + "label": "Pipeline Report HTML", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Pipeline_Report_HTML", + "id": "#main/Pipeline_Report_HTML" + }, + { + "label": "Protein Aggregates (Experimental)", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/Protein_Aggregates_Experimental" + }, + { + "label": "Seurat RDS File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateSeurat/SeuratRDS", + "id": "#main/Seurat" + }, + { + "label": "vdjCellsDatatable", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/vdjCellsDatatable" + }, + { + "label": "vdjCellsDatatableUncorrected", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "id": "#main/vdjCellsDatatableUncorrected" + }, + { + "label": "vdjDbecFilterImages", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDbecFilterImages", + "id": "#main/vdjDbecFilterImages" + }, + { + "label": "vdjDominantContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "id": "#main/vdjDominantContigsAIRR" + }, + { + "label": "vdjMetricsCsv", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjMetricsCsv", + "id": "#main/vdjMetricsCsv" + }, + { + "label": "vdjUnfilteredContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "id": "#main/vdjUnfilteredContigsAIRR" + } + ], + "steps": [ + { + "run": "#ATAC_Cell_by_Peak.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Cell_by_Peak/Assay" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Cell_by_Peak/Fragments" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/ATAC_Cell_by_Peak/GTF" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/ATAC_Cell_by_Peak/Peaks" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Cell_by_Peak/Run_Metadata" + }, + { + "source": "#main/QualCLAlign_ATAC/Transposase_Sites", + "id": "#main/ATAC_Cell_by_Peak/Transposase_Sites" + } + ], + "out": [ + "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "#main/ATAC_Cell_by_Peak/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Cell_by_Peak" + }, + { + "run": "#ATAC_Compile_Results.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Compile_Results/Assay" + }, + { + "source": "#main/GetDataTable/Biop_putative_data_table", + "id": "#main/ATAC_Compile_Results/Biop_putative_data_table" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/ATAC_Compile_Results/Cell_Order" + }, + { + "source": "#main/GetDataTable/Cell_Order_Subsampled", + "id": "#main/ATAC_Compile_Results/Cell_Order_Subsampled" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Compile_Results/Fragments" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "id": "#main/ATAC_Compile_Results/Initial_Seurat_RDS" + }, + { + "source": "#main/GetDataTable/Metrics_tar", + "id": "#main/ATAC_Compile_Results/Input_Metrics_tar" + }, + { + "source": "#main/QualCLAlign_ATAC/Reference_Genome_Size", + "id": "#main/ATAC_Compile_Results/Reference_Genome_Size" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Compile_Results/Run_Metadata" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "id": "#main/ATAC_Compile_Results/Total_Fragment_Metrics" + }, + { + "source": "#main/QualCLAlign_ATAC/UnifiedMetrics", + "id": "#main/ATAC_Compile_Results/Unified_Metrics" + } + ], + "out": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "#main/ATAC_Compile_Results/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Compile_Results" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/AddtoBam_ATAC/Assay" + }, + { + "source": [ + "#main/QualCLAlign_ATAC/BAMFiles" + ], + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_ATAC/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_ATAC/Cell_Order" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_ATAC/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_ATAC/SampleTag_Calls" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/AddtoBam_ATAC/Annotated_Bam", + "#main/AddtoBam_ATAC/output" + ], + "scatter": [ + "#main/AddtoBam_ATAC/Bam" + ], + "id": "#main/AddtoBam_ATAC" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AddtoBam_RNA/Assay" + }, + { + "source": "#main/AlignmentAnalysis/Annotated_Bam_Files", + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_RNA/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_RNA/Cell_Order" + }, + { + "source": "#main/AnnotateMolecules/Corrected_Mols_List", + "id": "#main/AddtoBam_RNA/Corrected_Mols" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_RNA/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_RNA/SampleTag_Calls" + }, + { + "source": "#main/CheckReference/Target_Gene_Mapping", + "id": "#main/AddtoBam_RNA/Target_Gene_Mapping" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/AddtoBam_RNA/Annotated_Bam", + "#main/AddtoBam_RNA/output" + ], + "scatter": [ + "#main/AddtoBam_RNA/Bam" + ], + "id": "#main/AddtoBam_RNA" + }, + { + "run": "#AlignmentAnalysis.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AlignmentAnalysis/Assay" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/AlignmentAnalysis/Exclude_Intronic_Reads" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/AlignmentAnalysis/Extra_Seqs" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/AlignmentAnalysis/GTF" + }, + { + "source": "#main/Maximum_Threads", + "id": "#main/AlignmentAnalysis/Maximum_Threads" + }, + { + "source": "#main/QualCLAlign_RNA/BAMFiles", + "id": "#main/AlignmentAnalysis/R2_BAM" + }, + { + "source": "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "id": "#main/AlignmentAnalysis/ReadQualityMetrics" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AlignmentAnalysis/Run_Metadata" + }, + { + "source": "#main/CheckReference/Transcript_Length", + "id": "#main/AlignmentAnalysis/Transcript_Length" + } + ], + "out": [ + "#main/AlignmentAnalysis/Seq_Metrics", + "#main/AlignmentAnalysis/Annotated_Bam_Files", + "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "#main/AlignmentAnalysis/num_valid_ig_reads", + "#main/AlignmentAnalysis/num_valid_tcr_reads", + "#main/AlignmentAnalysis/validIgReads", + "#main/AlignmentAnalysis/validTcrReads", + "#main/AlignmentAnalysis/num_cell_estimate", + "#main/AlignmentAnalysis/num_bioproducts" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AlignmentAnalysis" + }, + { + "requirements": [ + { + "ramMin": 32000, + "class": "ResourceRequirement" + } + ], + "run": "#AnnotateMolecules.cwl", + "in": [ + { + "source": "#main/Internal_Settings/AbSeq_UMI", + "id": "#main/AnnotateMolecules/AbSeq_UMI" + }, + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AnnotateMolecules/Assay" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AnnotateMolecules/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "default": [ + "does_not_exist" + ], + "id": "#main/AnnotateMolecules/Valids" + } + ], + "out": [ + "#main/AnnotateMolecules/Bioproduct_Stats_List", + "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "#main/AnnotateMolecules/Corrected_Mols_List", + "#main/AnnotateMolecules/Total_Molecules", + "#main/AnnotateMolecules/output" + ], + "scatter": [ + "#main/AnnotateMolecules/Valids" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AnnotateMolecules" + }, + { + "run": "#Assay_Settings.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/AbSeq_Reference_Present" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_RNA_Present" + }, + { + "source": "#main/Reference_Archive", + "valueFrom": "${ if (self){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Reference_Archive_Present" + }, + { + "source": "#main/Targeted_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Targeted_Reference_Present" + } + ], + "out": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "id": "#main/Assay_Settings" + }, + { + "label": "Bam Settings", + "run": "#BamSettings.cwl", + "in": [ + { + "source": "#main/Generate_Bam", + "id": "#main/Bam_Settings/_Generate_Bam" + } + ], + "out": [ + "#main/Bam_Settings/Generate_Bam" + ], + "id": "#main/Bam_Settings" + }, + { + "run": "#BundleLogs.cwl", + "in": [ + { + "source": [ + "#main/CheckReference/output", + "#main/GetDataTable/output", + "#main/ATAC_Cell_by_Peak/output", + "#main/ATAC_Compile_Results/output", + "#main/Metrics/output", + "#main/AddtoBam_RNA/output", + "#main/AddtoBam_ATAC/output", + "#main/AnnotateMolecules/output", + "#main/MergeBAM_RNA/log", + "#main/MergeBAM_ATAC/log", + "#main/GenerateH5MU/output", + "#main/GenerateSeurat/output", + "#main/Peak_Annotation/output" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/BundleLogs/log_files" + } + ], + "out": [ + "#main/BundleLogs/logs_dir" + ], + "id": "#main/BundleLogs" + }, + { + "requirements": [ + { + "ramMin": 10000, + "class": "ResourceRequirement" + } + ], + "run": "#CheckReference.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/CheckReference/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/CheckReference/Assay" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/CheckReference/Cell_Calling_Data" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/CheckReference/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/CheckReference/Reference_Archive" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/CheckReference/Sample_Tags_Version" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/CheckReference/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/CheckReference/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/CheckReference/VDJ_Version" + } + ], + "out": [ + "#main/CheckReference/Index", + "#main/CheckReference/Extra_Seqs", + "#main/CheckReference/Full_Genes", + "#main/CheckReference/output", + "#main/CheckReference/Transcript_Length", + "#main/CheckReference/GTF", + "#main/CheckReference/Target_Gene_Mapping", + "#main/CheckReference/Checked_Predefined_Peaks", + "#main/CheckReference/Checked_Predefined_Peaks_Index" + ], + "id": "#main/CheckReference" + }, + { + "run": "#GenerateH5MU.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "id": "#main/GenerateH5MU/Atac_Datatables" + }, + { + "source": "#main/Metrics/Metrics_ATAC", + "id": "#main/GenerateH5MU/Atac_Metrics" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateH5MU/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateH5MU/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Dim_Reduction_Coord" + }, + { + "source": "#main/Metrics/Metrics_Summary", + "id": "#main/GenerateH5MU/Metrics_Summary" + }, + { + "source": "#main/Peak_Annotation/Peak_Annotation_TSV", + "id": "#main/GenerateH5MU/Peak_Annotation" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateH5MU/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateH5MU/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateH5MU/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateH5MU/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateH5MU/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateH5MU/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateH5MU/H5MU", + "#main/GenerateH5MU/output" + ], + "id": "#main/GenerateH5MU" + }, + { + "run": "#GenerateSeurat.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "id": "#main/GenerateSeurat/ATAC_Seurat" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateSeurat/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateSeurat/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Dim_Reduction_Coord" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateSeurat/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateSeurat/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateSeurat/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateSeurat/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateSeurat/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateSeurat/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateSeurat/SeuratRDS", + "#main/GenerateSeurat/output" + ], + "id": "#main/GenerateSeurat" + }, + { + "run": "#GetDataTable.cwl", + "in": [ + { + "source": "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "id": "#main/GetDataTable/ATAC_Cell_Calling_Input" + }, + { + "source": "#main/AnnotateMolecules/Bioproduct_Stats_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Bioproduct_Stats_List" + }, + { + "source": "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Cell_Biop_Summary_List" + }, + { + "source": "#main/CheckReference/Full_Genes", + "id": "#main/GetDataTable/Full_Genes" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GetDataTable/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/GetDataTable/Seq_Metrics" + }, + { + "source": "#main/AnnotateMolecules/Total_Molecules", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Total_Molecules" + }, + { + "source": "#main/AlignmentAnalysis/num_bioproducts", + "id": "#main/GetDataTable/num_bioproducts" + }, + { + "source": "#main/AlignmentAnalysis/num_cell_estimate", + "id": "#main/GetDataTable/num_cell_estimate" + } + ], + "out": [ + "#main/GetDataTable/Metrics_tar", + "#main/GetDataTable/Bioproduct_Stats", + "#main/GetDataTable/Cell_Order", + "#main/GetDataTable/Cell_Order_Subsampled", + "#main/GetDataTable/Cell_Type_Predictions", + "#main/GetDataTable/Data_Tables", + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/GetDataTable/output", + "#main/GetDataTable/Protein_Aggregates_Experimental", + "#main/GetDataTable/Putative_Cells_Origin", + "#main/GetDataTable/SampleTag_Calls", + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips", + "#main/GetDataTable/Biop_putative_data_table" + ], + "id": "#main/GetDataTable" + }, + { + "label": "Get Machine Resources", + "run": "#GetMachineResources.cwl", + "in": [], + "out": [ + "#main/GetMachineResources/Total_CPUs_Avail" + ], + "id": "#main/GetMachineResources" + }, + { + "label": "Internal Settings", + "run": "#InternalSettings.cwl", + "in": [ + { + "source": "#main/AbSeq_UMI", + "id": "#main/Internal_Settings/_AbSeq_UMI" + }, + { + "source": "#main/Target_analysis", + "id": "#main/Internal_Settings/_Target_analysis" + }, + { + "source": "#main/VDJ_JGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_JGene_Evalue" + }, + { + "source": "#main/VDJ_VGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_VGene_Evalue" + } + ], + "out": [ + "#main/Internal_Settings/AbSeq_UMI", + "#main/Internal_Settings/Target_analysis", + "#main/Internal_Settings/VDJ_VGene_Evalue", + "#main/Internal_Settings/VDJ_JGene_Evalue" + ], + "id": "#main/Internal_Settings" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeATAC/run/ATAC_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeATAC/run/ATAC_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.ATAC_Files.length; i++) {\n var fp = inputs.ATAC_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"ATAC_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Peaks", + "#main/Metrics/Metrics_ATAC", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/Peak_Annotation/Peak_Annotation_TSV" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeATAC/ATAC_Files" + } + ], + "out": [ + "#main/MergeATAC/ATAC_out" + ], + "id": "#main/MergeATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/MergeBAM_ATAC/Assay" + }, + { + "source": "#main/AddtoBam_ATAC/Annotated_Bam", + "id": "#main/MergeBAM_ATAC/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_ATAC/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex", + "#main/MergeBAM_ATAC/log" + ], + "id": "#main/MergeBAM_ATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/MergeBAM_RNA/Assay" + }, + { + "source": "#main/AddtoBam_RNA/Annotated_Bam", + "id": "#main/MergeBAM_RNA/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_RNA/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_RNA/log" + ], + "id": "#main/MergeBAM_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeMultiplex/run/SampleTag_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeMultiplex/run/Multiplex_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.SampleTag_Files.length; i++) {\n var fp = inputs.SampleTag_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"Multiplex_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeMultiplex/SampleTag_Files" + } + ], + "out": [ + "#main/MergeMultiplex/Multiplex_out" + ], + "id": "#main/MergeMultiplex" + }, + { + "run": "#Metadata.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/Metadata_Settings/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Assay" + }, + { + "source": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_ATAC/Bead_Version" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Bead_Version" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/Metadata_Settings/Cell_Calling_Data" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/Metadata_Settings/Custom_STAR_Params" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/Metadata_Settings/Custom_bwa_mem2_Params" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "id": "#main/Metadata_Settings/Exact_Cell_Count" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/Metadata_Settings/Exclude_Intronic_Reads" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count", + "id": "#main/Metadata_Settings/Expected_Cell_Count" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/Metadata_Settings/Generate_Bam" + }, + { + "source": "#main/QualCLAlign_RNA/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries" + }, + { + "source": "#main/QualCLAlign_ATAC/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries_ATAC" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/Metadata_Settings/Long_Reads" + }, + { + "valueFrom": "BD Rhapsody Sequence Analysis Pipeline", + "id": "#main/Metadata_Settings/Pipeline_Name" + }, + { + "source": "#main/Version/version", + "id": "#main/Metadata_Settings/Pipeline_Version" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/Metadata_Settings/Predefined_ATAC_Peaks" + }, + { + "source": "#main/QualCLAlign_RNA/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads" + }, + { + "source": "#main/QualCLAlign_ATAC/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads_ATAC" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/Metadata_Settings/Reference_Archive" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/Metadata_Settings/Run_Base_Name" + }, + { + "source": "#main/Name_Settings/Run_Name", + "id": "#main/Metadata_Settings/Run_Name" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tag_Names", + "id": "#main/Metadata_Settings/Sample_Tag_Names" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/Metadata_Settings/Sample_Tags_Version" + }, + { + "source": "#main/Start_Time/Start_Time", + "id": "#main/Metadata_Settings/Start_Time" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/Metadata_Settings/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/Metadata_Settings/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/Metadata_Settings/VDJ_Version" + } + ], + "out": [ + "#main/Metadata_Settings/Run_Metadata" + ], + "id": "#main/Metadata_Settings" + }, + { + "requirements": [ + { + "ramMin": 4000, + "class": "ResourceRequirement" + } + ], + "run": "#Metrics.cwl", + "in": [ + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metrics/Cell_Type_Predictions" + }, + { + "source": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/GetDataTable/Metrics_tar" + ], + "linkMerge": "merge_flattened", + "pickValue": "first_non_null", + "id": "#main/Metrics/Metrics_tar" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Metrics/Run_Metadata" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/Metrics/vdjCellsDatatable" + }, + { + "source": "#main/VDJ_Compile_Results/vdjMetricsJson", + "id": "#main/Metrics/vdjMetricsJson" + } + ], + "out": [ + "#main/Metrics/Metrics_Summary", + "#main/Metrics/Metrics_Archive", + "#main/Metrics/Metrics_ATAC", + "#main/Metrics/Pipeline_Report_JSON", + "#main/Metrics/Pipeline_Report_HTML", + "#main/Metrics/output" + ], + "id": "#main/Metrics" + }, + { + "label": "Miscellaneous Settings", + "run": "#MiscSettings.cwl", + "in": [ + { + "source": "#main/Custom_STAR_Params", + "id": "#main/Misc_Settings/_Custom_STAR_Params" + }, + { + "source": "#main/Custom_bwa_mem2_Params", + "id": "#main/Misc_Settings/_Custom_bwa_mem2_Params" + }, + { + "source": "#main/Exclude_Intronic_Reads", + "id": "#main/Misc_Settings/_Exclude_Intronic_Reads" + }, + { + "source": "#main/Long_Reads", + "id": "#main/Misc_Settings/_Long_Reads" + } + ], + "out": [ + "#main/Misc_Settings/Exclude_Intronic_Reads", + "#main/Misc_Settings/Long_Reads", + "#main/Misc_Settings/Custom_STAR_Params", + "#main/Misc_Settings/Custom_bwa_mem2_Params" + ], + "id": "#main/Misc_Settings" + }, + { + "label": "Multiplexing Settings", + "run": "#MultiplexingSettings.cwl", + "in": [ + { + "source": "#main/Tag_Names", + "id": "#main/Multiplexing_Settings/_Sample_Tag_Names" + }, + { + "source": "#main/Sample_Tags_Version", + "id": "#main/Multiplexing_Settings/_Sample_Tags_Version" + } + ], + "out": [ + "#main/Multiplexing_Settings/Sample_Tag_Names", + "#main/Multiplexing_Settings/Sample_Tags_Version" + ], + "id": "#main/Multiplexing_Settings" + }, + { + "label": "Name Settings", + "run": "#NameSettings.cwl", + "in": [ + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/ATAC_Fastqs" + }, + { + "source": "#main/Reads", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/Bioproduct_Fastqs" + }, + { + "source": "#main/Run_Name", + "id": "#main/Name_Settings/_Run_Name" + } + ], + "out": [ + "#main/Name_Settings/Run_Name", + "#main/Name_Settings/Run_Base_Name" + ], + "id": "#main/Name_Settings" + }, + { + "run": "#PeakAnnotation.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/Peak_Annotation/Assay" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/Peak_Annotation/Gtf" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/Peak_Annotation/Peaks_bed" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Peak_Annotation/Run_Metadata" + } + ], + "out": [ + "#main/Peak_Annotation/Peak_Annotation_TSV", + "#main/Peak_Annotation/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/Peak_Annotation" + }, + { + "label": "Putative Cell Calling Settings", + "run": "#PutativeCellSettings.cwl", + "in": [ + { + "source": "#main/Cell_Calling_ATAC_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Cell_Calling_Data", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Data" + }, + { + "source": "#main/Exact_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Exact_Cell_Count" + }, + { + "source": "#main/Expected_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Expected_Cell_Count" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_RNA_Present" + } + ], + "out": [ + "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count" + ], + "id": "#main/Putative_Cell_Calling_Settings" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/QualCLAlign_ATAC/Assay" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/QualCLAlign_ATAC/Generate_Bam" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_ATAC/Index" + }, + { + "source": "#main/CheckReference/Checked_Predefined_Peaks", + "id": "#main/QualCLAlign_ATAC/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reads_ATAC", + "id": "#main/QualCLAlign_ATAC/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_ATAC/Run_Base_Name" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_RNA" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_rna] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_rna == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_ATAC/Threads" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_ATAC/Write_Filtered_Reads" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/QualCLAlign_ATAC/bwa_mem2_Params" + } + ], + "out": [ + "#main/QualCLAlign_ATAC/Bead_Version", + "#main/QualCLAlign_ATAC/Libraries", + "#main/QualCLAlign_ATAC/ReadsList", + "#main/QualCLAlign_ATAC/BAMFiles", + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Fragments_Index", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Transposase_Sites_Index", + "#main/QualCLAlign_ATAC/Peaks", + "#main/QualCLAlign_ATAC/Peaks_Index", + "#main/QualCLAlign_ATAC/QualCLAlignMetrics", + "#main/QualCLAlign_ATAC/UnifiedMetrics", + "#main/QualCLAlign_ATAC/Logs", + "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "#main/QualCLAlign_ATAC/Reference_Genome_Size" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/QualCLAlign_ATAC" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/QualCLAlign_RNA/Assay" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/QualCLAlign_RNA/Extra_Seqs" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_RNA/Index" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/QualCLAlign_RNA/Long_Reads" + }, + { + "source": "#main/Reads", + "id": "#main/QualCLAlign_RNA/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_RNA/Run_Base_Name" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/QualCLAlign_RNA/STAR_Params" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_ATAC" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_atac] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_atac == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_RNA/Threads" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/QualCLAlign_RNA/VDJ_Version" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_RNA/Write_Filtered_Reads" + } + ], + "out": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_RNA/Libraries", + "#main/QualCLAlign_RNA/ReadsList", + "#main/QualCLAlign_RNA/BAMFiles", + "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "#main/QualCLAlign_RNA/Logs", + "#main/QualCLAlign_RNA/Failed_Reads_CSVs" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/QualCLAlign_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [], + "outputs": [ + { + "type": "string", + "id": "#main/Start_Time/run/Start_Time" + } + ], + "expression": "${\n var today = new Date();\n var date = today.toDateString();\n var time = today.toLocaleTimeString('en-us', {timeZoneName: 'short'});\n return ({Start_Time: date + ' ' + time});\n} " + }, + "in": [], + "out": [ + "#main/Start_Time/Start_Time" + ], + "id": "#main/Start_Time" + }, + { + "run": "#VDJ_Analyze_Reads_IG.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanTCR\" && inputs.VDJ_Version != \"mouseTCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_IG/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_ig_reads", + "id": "#main/VDJ_Analyze_Reads_IG/Num_Valid_Reads_IG" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_IG/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validIgReads", + "id": "#main/VDJ_Analyze_Reads_IG/Valid_Reads_Fastq_IG" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_IG/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_IG" + }, + { + "run": "#VDJ_Analyze_Reads_TCR.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanBCR\" && inputs.VDJ_Version != \"mouseBCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_TCR/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_tcr_reads", + "id": "#main/VDJ_Analyze_Reads_TCR/Num_Valid_Reads_TCR" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_TCR/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validTcrReads", + "id": "#main/VDJ_Analyze_Reads_TCR/Valid_Reads_Fastq_TCR" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_TCR/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_TCR" + }, + { + "run": "#VDJ_Compile_Results.cwl", + "when": "$(inputs.VDJ_Version != null)", + "in": [ + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/VDJ_Compile_Results/Seq_Metrics" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Compile_Results/VDJ_Version" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/VDJ_Compile_Results/cellTypeMapping" + }, + { + "valueFrom": "$([])", + "id": "#main/VDJ_Compile_Results/chainsToIgnore" + }, + { + "source": "#main/Internal_Settings/VDJ_JGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueJgene" + }, + { + "source": "#main/Internal_Settings/VDJ_VGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueVgene" + }, + { + "source": "#main/VDJ_Analyze_Reads_IG/gatheredCalls", + "id": "#main/VDJ_Compile_Results/igCalls" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/VDJ_Compile_Results/metadata" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/VDJ_Compile_Results/putativeCells" + }, + { + "source": "#main/VDJ_Analyze_Reads_TCR/gatheredCalls", + "id": "#main/VDJ_Compile_Results/tcrCalls" + } + ], + "out": [ + "#main/VDJ_Compile_Results/vdjCellsDatatable", + "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "#main/VDJ_Compile_Results/vdjMetricsJson", + "#main/VDJ_Compile_Results/vdjMetricsCsv", + "#main/VDJ_Compile_Results/vdjDbecFilterImages" + ], + "id": "#main/VDJ_Compile_Results" + }, + { + "label": "VDJ Settings", + "run": "#VDJ_Settings.cwl", + "in": [ + { + "source": "#main/VDJ_Version", + "id": "#main/VDJ_Settings/_VDJ_Version" + } + ], + "out": [ + "#main/VDJ_Settings/VDJ_Version" + ], + "id": "#main/VDJ_Settings" + }, + { + "run": "#Version.cwl", + "in": [], + "out": [ + "#main/Version/version" + ], + "id": "#main/Version" + } + ], + "id": "#main" + }, + { + "requirements": [ + { + "listing": [ + { + "entryname": "bam_files.txt", + "entry": "${\n function getBamsInSortedOrder(inputs) {\n // Create an associative array to hold the mapping from basename to full path\n var fileMap = {};\n\n // Extract basenames and map them to their full paths\n for (var i = 0; i < inputs.BamFiles.length; i++) {\n var file = inputs.BamFiles[i].path;\n var basename = file.split('/').pop();\n fileMap[basename] = file;\n }\n\n // Sort the basenames numerically\n // This works because all bams share the same prefix and have a numerical id :\n // i.e. foobar.0001.tagged.bam, foobar.0002.tagged.bam, foobar.0010.tagged.bam, etc.\n // so will be sorted by numerical ids\n var sortedBasenames = Object.keys(fileMap).sort(function(a, b) {\n return a.localeCompare(b, undefined, { numeric: true });\n });\n\n // Reconstruct the sorted full paths\n var sortedBamFiles = sortedBasenames.map(function(basename) {\n return fileMap[basename];\n });\n\n // Create a file of file names - 1 per line\n return sortedBamFiles.join('\\n');\n }\n\n // For ATAC we cat the bams so need them in a particular order\n if (inputs.Assay == \"ATAC\") {\n return getBamsInSortedOrder(inputs);\n }\n else {\n return inputs.BamFiles.map(function(file) {\n return file.path;\n }).join('\\n');\n }\n}", + "writable": false + } + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 16000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "MergeBam.sh" + ], + "stderr": "merge_bam.log", + "inputs": [ + { + "type": "string", + "inputBinding": { + "position": 1 + }, + "id": "#MergeBAM.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#MergeBAM.cwl/BamFiles" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MergeBAM.cwl/Generate_Bam" + }, + { + "type": "File", + "loadContents": true, + "id": "#MergeBAM.cwl/Run_Metadata" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "${\n var st_version = JSON.parse(inputs.Run_Metadata.contents).Sample_Tags_Version\n if (st_version)\n {\n return st_version\n } else\n {\n return \"None\"\n }\n}" + }, + { + "position": 3, + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + }, + { + "position": 4, + "valueFrom": "$(runtime.cores)" + }, + { + "position": 5, + "valueFrom": "bam_files.txt" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam\" }" + }, + "id": "#MergeBAM.cwl/Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam.bai\" }" + }, + "id": "#MergeBAM.cwl/BamIndex" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#MergeBAM.cwl/log" + } + ], + "id": "#MergeBAM.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "id": "#Metadata.cwl/Assay" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#Metadata.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#Metadata.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "id": "#Metadata.cwl/Bead_Version" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Generate_Bam" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries_ATAC" + }, + { + "type": "string", + "id": "#Metadata.cwl/Long_Reads" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Name" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#Metadata.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads_ATAC" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "id": "#Metadata.cwl/Reference_Archive" + }, + { + "type": "string", + "id": "#Metadata.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Run_Name" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Start_Time" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "run_metadata.json" + }, + "id": "#Metadata.cwl/Run_Metadata" + } + ], + "stdout": "run_metadata.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var metadata = inputs;\n var all_bv = {};\n var customer_bv = \"Original (V1)\";\n var detected_bv = \"V1\";\n for (var i = 0; i < inputs.Bead_Version.length; i++) {\n var BeadVer = inputs.Bead_Version[i];\n var Library = BeadVer[\"Library\"];\n var bead_version = BeadVer[\"bead_version\"];\n all_bv[Library] = bead_version \n var short_bv = bead_version.substring(0, 5);\n if (short_bv == \"Enh\") {\n customer_bv = \"Enhanced\";\n detected_bv = \"Enh\";\n }\n else if (short_bv == \"EnhV2\") {\n customer_bv = \"Enhanced V2/V3\";\n detected_bv = \"EnhV2\";\n }\n }\n metadata[\"Bead_Version\"] = all_bv;\n metadata[\"Bead_Version_Detected\"] = detected_bv;\n\n var pipeline_name = inputs.Pipeline_Name;\n var version = inputs.Pipeline_Version;\n var time = inputs.Start_Time;\n var libraries = inputs.Libraries;\n if(libraries == null){\n libraries = [\"None\"];\n }\n var libraries_atac = inputs.Libraries_ATAC\n if(libraries_atac == null){\n libraries_atac = [\"None\"];\n }\n\n var i = 0;\n var refs_mrna_inputs = [];\n var mrna_name = \"mRNA Reference\";\n if (inputs.Targeted_Reference != null) {\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Targeted_Reference);\n mrna_name = \"Targeted Reference\";\n }\n if(inputs.Reference_Archive != null){\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Reference_Archive);\n mrna_name = \"Reference Archive\";\n }\n var refs_mrna = [];\n if (refs_mrna_inputs.length > 0) {\n for (i = 0; i < refs_mrna_inputs.length; i++) {\n if (refs_mrna_inputs[i] != null) {\n refs_mrna.push(refs_mrna_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_mrna = [\"None\"];\n }\n\n var refs_abseq_inputs = [];\n if (inputs.AbSeq_Reference != null) {\n refs_abseq_inputs = refs_abseq_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_abseq = [];\n if (refs_abseq_inputs.length > 0) {\n for (i = 0; i < refs_abseq_inputs.length; i++) {\n if (refs_abseq_inputs[i] != null) {\n refs_abseq.push(refs_abseq_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_abseq = [\"None\"];\n }\n\n var refs_supp_inputs = [];\n if (inputs.Supplemental_Reference != null) {\n refs_supp_inputs = refs_supp_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_supp = [];\n if (refs_supp_inputs.length > 0) {\n for (i = 0; i < refs_supp_inputs.length; i++) {\n if (refs_supp_inputs[i] != null) {\n refs_supp.push(refs_supp_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_supp = [\"None\"];\n }\n\n if (inputs.Predefined_ATAC_Peaks != null) {\n var predef_atac_peaks = inputs.Predefined_ATAC_Peaks[\"basename\"];\n } else {\n var predef_atac_peaks = \"None\";\n }\n\n var parameters = [];\n if(inputs.Sample_Tags_Version != null){\n var tags = \"Sample Tag Version: \" + inputs.Sample_Tags_Version;\n } else{ \n var tags = \"Sample Tag Version: None\";\n }\n parameters.push(tags);\n\n if(inputs.Sample_Tag_Names != null){\n var tag_names = inputs.Sample_Tag_Names.join(\"; \")\n var tag_list = \"Sample Tag Names: \" + tag_names;\n } else{\n var tag_list = \"Sample Tag Names: None\";\n }\n parameters.push(tag_list);\n \n if(inputs.VDJ_Version != null){\n var vdj = \"VDJ Version: \" + inputs.VDJ_Version;\n } else{ \n var vdj = \"VDJ Version: None\";\n }\n parameters.push(vdj)\n\n if (inputs.Cell_Calling_Data == 0) {\n var call = \"Putative Cell Calling Data: mRNA\";\n } else if (inputs.Cell_Calling_Data == 1) {\n var call = \"Putative Cell Calling Data: AbSeq\";\n } else if (inputs.Cell_Calling_Data == 2) {\n var call = \"Putative Cell Calling Data: mRNA_and_AbSeq\";\n } else if (inputs.Cell_Calling_Data == 3) {\n var call = \"Putative Cell Calling Data: mRNA_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 4) {\n var call = \"Putative Cell Calling Data: AbSeq_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 5) {\n var call = \"Putative Cell Calling Data: ATAC\";\n } else {\n var call = \"Putative Cell Calling Data: None\";\n }\n parameters.push(call)\n\n if (inputs.Cell_Calling_Bioproduct_Algorithm != null) {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: \" + inputs.Cell_Calling_Bioproduct_Algorithm;\n } else {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: None\";\n }\n parameters.push(bioproduct_alg)\n\n if (inputs.Cell_Calling_ATAC_Algorithm != null) {\n var atac_alg = \"ATAC Cell Calling Algorithm: \" + inputs.Cell_Calling_ATAC_Algorithm;\n } else {\n var atac_alg = \"ATAC Cell Calling Algorithm: None\";\n }\n parameters.push(atac_alg)\n\n if(inputs.Exclude_Intronic_Reads){\n var introns = \"Exclude Intronic Reads: On\";\n } else{\n var introns = \"Exclude Intronic Reads: Off\";\n }\n parameters.push(introns)\n\n if(inputs.Generate_Bam){\n var generateBam = \"Generate Bam: On\";\n } else{\n var generateBam = \"Generate Bam: Off\";\n }\n parameters.push(generateBam)\n\n if(inputs.Exact_Cell_Count != null){\n var exactCells = \"Exact Cell Count: \" + inputs.Exact_Cell_Count;\n } else{\n var exactCells = \"Exact Cell Count: None\";\n }\n parameters.push(exactCells)\n\n if(inputs.Expected_Cell_Count != null){\n var expectedCells = \"Expected Cell Count: \" + inputs.Expected_Cell_Count;\n } else{\n var expectedCells = \"Expected Cell Count: None\";\n }\n parameters.push(expectedCells);\n\n var longReads = \"Long Reads: \" + inputs.Long_Reads;\n parameters.push(longReads);\n\n if (inputs.Custom_STAR_Params != null)\n {\n var starParams = \"Custom STAR Params: \" + inputs.Custom_STAR_Params;\n } else {\n var starParams = \"Custom STAR Params: None\"; \n }\n parameters.push(starParams);\n\n if (inputs.Custom_bwa_mem2_Params != null)\n {\n var bwaParams = \"Custom bwa-mem2 Params: \" + inputs.Custom_bwa_mem2_Params;\n } else {\n var bwaParams = \"Custom bwa-mem2 Params: None\";\n }\n parameters.push(bwaParams);\n\n var run_name = inputs.Run_Name;\n var run_base_name = inputs.Run_Base_Name;\n\n var header = [\"####################\"];\n header.push(\"## \" + pipeline_name + \" Version \" + version);\n header.push(\"## Analysis Date - \" + time);\n header.push(\"## Libraries - Bioproduct Libraries: \" + libraries.join('; ') + \" | ATAC Libraries: \" + libraries_atac.join('; ') + \" | Bead version detected: \" + customer_bv);\n header.push(\"## References - \" + mrna_name + \": \" + refs_mrna.join('; ') + \" | AbSeq Reference: \" + refs_abseq.join('; ') + \" | Supplemental Reference: \" + refs_supp.join('; ') + \" | ATAC Predefined Peak Regions: \" + predef_atac_peaks);\n header.push(\"## Parameters - \" + parameters.join(' | '));\n header.push(\"####################\");\n metadata[\"Output_Header\"] = header;\n metadata[\"Run_Name\"] = run_name \n metadata[\"Run_Base_Name\"] = run_base_name;\n\n var metadata_json = JSON.stringify(metadata, null, 2);\n\n return metadata_json;\n}\n" + } + ], + "id": "#Metadata.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_metrics.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-file", + "itemSeparator": "," + }, + "id": "#Metrics.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--metrics-tar" + }, + "id": "#Metrics.cwl/Metrics_tar" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#Metrics.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell-file" + }, + "id": "#Metrics.cwl/vdjCellsDatatable" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-metrics-fp" + }, + "id": "#Metrics.cwl/vdjMetricsJson" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": [ + "*_ATAC_Metrics.json", + "*_ATAC_Metrics.csv" + ] + }, + "id": "#Metrics.cwl/Metrics_ATAC" + }, + { + "type": "File", + "outputBinding": { + "glob": "internal-metrics-archive.tar.gz" + }, + "id": "#Metrics.cwl/Metrics_Archive" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Metrics_Summary.csv" + }, + "id": "#Metrics.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.html" + }, + "id": "#Metrics.cwl/Pipeline_Report_HTML" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.json" + }, + "id": "#Metrics.cwl/Pipeline_Report_JSON" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#Metrics.cwl/output" + } + ], + "id": "#Metrics.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Long_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Long_Reads" + } + ], + "expression": "${\n // the exclude intronic reads flag defaults to false\n var excludeIntronicReads = false;\n // the user can set the flag to exclude intronic reads\n if (inputs._Exclude_Intronic_Reads) {\n excludeIntronicReads = inputs._Exclude_Intronic_Reads;\n }\n\n // Use_Long_Reads default is autodetect, which happens in CheckFastqs\n // User can set this explicitly true or false\n // Convert boolean results to string. null -> \"auto\", true -> \"true\", false -> \"false\"\n var longReads = \"auto\";\n if (inputs._Long_Reads !== null) {\n if (inputs._Long_Reads) {\n longReads = \"true\";\n }\n else {\n longReads = \"false\";\n }\n }\n\n return ({\n Exclude_Intronic_Reads: excludeIntronicReads,\n Long_Reads: longReads,\n Custom_STAR_Params: inputs._Custom_STAR_Params,\n Custom_bwa_mem2_Params: inputs._Custom_bwa_mem2_Params\n });\n\n\n}", + "id": "#MiscSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "string", + "default": "Targeted", + "id": "#MultiplexingSettings.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tag_Names" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tags_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MultiplexingSettings.cwl/Sample_Tags_Version" + } + ], + "expression": "${\n var enumifiedSampleTagsVersion = null;\n if (inputs._Sample_Tags_Version) {\n var _Sample_Tags_Version = inputs._Sample_Tags_Version.toLowerCase();\n\n if (_Sample_Tags_Version.indexOf('human') >= 0 || _Sample_Tags_Version === 'hs')\n {\n enumifiedSampleTagsVersion = 'hs';\n }\n else if (_Sample_Tags_Version.indexOf('mouse') >= 0 || _Sample_Tags_Version === 'mm')\n {\n enumifiedSampleTagsVersion = 'mm';\n }\n else if (_Sample_Tags_Version.indexOf('flex') >= 0)\n {\n enumifiedSampleTagsVersion = 'flex';\n }\n else if (_Sample_Tags_Version.indexOf('nuclei') >= 0)\n {\n if (_Sample_Tags_Version.indexOf('atac') >= 0)\n {\n enumifiedSampleTagsVersion = 'nuclei_atac_only';\n }\n else\n {\n enumifiedSampleTagsVersion = 'nuclei_includes_mrna';\n }\n }\n else if (_Sample_Tags_Version === 'no multiplexing')\n {\n enumifiedSampleTagsVersion = null;\n }\n else\n {\n throw new Error(\"Cannot parse Sample Tag Version: \" + inputs._Sample_Tags_Version);\n }\n }\n var newTagNames = null;\n if (inputs._Sample_Tag_Names) {\n var listTagNames = inputs._Sample_Tag_Names\n var newTagNames = []\n for (var num in listTagNames) {\n var tag = listTagNames[num].replace(/[^A-Za-z0-9-+]/g,\"_\");\n newTagNames.push(tag);\n }\n } \n return ({\n Sample_Tag_Names: newTagNames,\n Sample_Tags_Version: enumifiedSampleTagsVersion\n });\n}", + "id": "#MultiplexingSettings.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "NameSettings sets the Run_Name variable that is used as a common prefix to name output files. If the user has specified a Run_Name, it is cleaned up or one is set based on the Bioproduct/ATAC fastq filenames.\n", + "hints": [], + "baseCommand": [ + "python", + "-c" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/ATAC_Fastqs" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/Bioproduct_Fastqs" + }, + { + "type": [ + "null", + "string" + ], + "default": "", + "id": "#NameSettings.cwl/_Run_Name" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "import sys\nfrom mist.apps import CheckFastqs\nCheckFastqs.write_run_name_json(sys.argv[1], sys.argv[2], sys.argv[3])\n", + "shellQuote": true + }, + { + "position": 3, + "valueFrom": "$(inputs._Run_Name)", + "shellQuote": true + }, + { + "position": 4, + "valueFrom": "$(inputs.Bioproduct_Fastqs.join(\",\"))", + "shellQuote": true + }, + { + "position": 5, + "valueFrom": "$(inputs.ATAC_Fastqs.join(\",\"))", + "shellQuote": true + } + ], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "run_base_name.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents)['Run_Base_Name'])" + }, + "id": "#NameSettings.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "outputBinding": { + "outputEval": "$(inputs._Run_Name)" + }, + "id": "#NameSettings.cwl/Run_Name" + } + ], + "id": "#NameSettings.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 16000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_peak_annotation.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#PeakAnnotation.cwl/Gtf" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks_bed_file" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#PeakAnnotation.cwl/Peaks_bed" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#PeakAnnotation.cwl/Run_Metadata" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.tsv.gz" + }, + "id": "#PeakAnnotation.cwl/Peak_Annotation_TSV" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#PeakAnnotation.cwl/output" + } + ], + "id": "#PeakAnnotation.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Expected_Cell_Count" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_RNA_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_ATAC_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Bioproduct_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": "int", + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Data; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Exact_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Expected_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Expected_Cell_Count" + }, + { + "type": "File", + "outputBinding": { + "glob": "putative_cell_settings.json" + }, + "id": "#PutativeCellSettings.cwl/PutativeCellSettings" + } + ], + "stdout": "putative_cell_settings.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var settings = inputs;\n var errorMessage = \"No error\";\n\n var cellCallingATACAlg = null;\n // the default cell calling algorithm for ATAC is basic\n if (inputs._Reads_ATAC_Present) {\n cellCallingATACAlg = \"Basic\";\n }\n // the user can choose the ATAC cell calling algorithm\n if (inputs._Cell_Calling_ATAC_Algorithm) {\n cellCallingATACAlg = inputs._Cell_Calling_ATAC_Algorithm;\n }\n\n var cellCallingBioproductAlg = null;\n // the default cell calling algorithm for bioproducts is basic\n if (inputs._Reads_RNA_Present) {\n cellCallingBioproductAlg = \"Basic\";\n }\n // the user can choose the bioproducts cell calling algorithm\n if (inputs._Cell_Calling_Bioproduct_Algorithm) {\n cellCallingBioproductAlg = inputs._Cell_Calling_Bioproduct_Algorithm;\n }\n\n // the default cell calling data depends on the data that is provided\n // the overall default is mRNA data\n var cellCallingDataInt = 0;\n // if mRNA and ATAC reads are present, then default to joint cell calling\n if (inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 3;\n }\n // if no WTA data is present, but ATAC data is, then default to ATAC\n else if (!inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 5;\n }\n\n // convert the Cell_Calling_Data from a string to an integer\n if (inputs._Cell_Calling_Data) {\n if (inputs._Cell_Calling_Data === \"mRNA\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA\" was selected but no mRNA Reads were provided.';\n } else {\n cellCallingDataInt = 0;\n }\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq\" was selected but no AbSeq Reads were provided.';\n }\n cellCallingDataInt = 1;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_AbSeq\" was selected but no mRNA/AbSeq Reads were provided.';\n }\n cellCallingDataInt = 2;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no mRNA Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 3;\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no AbSeq Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 4;\n }\n else if (inputs._Cell_Calling_Data === \"ATAC\") {\n if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 5;\n }\n }\n // check the exact cell count\n if (inputs._Exact_Cell_Count) {\n if (inputs._Exact_Cell_Count < 1) {\n errorMessage = \"Exact cell count must be an integer greater than 0, value received: \" + inputs._Exact_Cell_Count;\n }\n }\n // check if there is an error\n if (errorMessage != \"No error\") {\n // If there is an error, force CWL to show it:\n // - \"Cell_Calling_Data\" is a required output\n // - setting it to null will cause a CWL error\n // - the error message will be shown in the json\n cellCallingDataInt = null;\n }\n\n settings[\"Cell_Calling_ATAC_Algorithm\"] = cellCallingATACAlg;\n settings[\"Cell_Calling_Bioproduct_Algorithm\"] = cellCallingBioproductAlg;\n settings[\"Cell_Calling_Data\"] = cellCallingDataInt;\n settings[\"Expected_Cell_Count\"] = inputs._Expected_Cell_Count;\n settings[\"Exact_Cell_Count\"] = inputs._Exact_Cell_Count;\n settings[\"Error\"] = errorMessage;\n\n var settings_json = JSON.stringify(settings, null, 2);\n\n return settings_json;\n }\n" + } + ], + "id": "#PutativeCellSettings.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "CheckFastqs does several quality control routines including: ensuring that read pair file names are formatted correctly and contain a read pair mate; QualCLAlign stage of the Rain pipeline overlaps read pairs and then performs a series of filters and mappings to reduce valid reads into a single FastQ file to be fed into the aligner. The R2 reads are annotated with cell index and UMI information derived from the R1 read.\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if(inputs.Threads){ return inputs.Threads; } else{ return 8; } }", + "ramMin": 48000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "mist_run_qualclalign.py" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--alignment-compression-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Alignment_Compression_threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--assay" + }, + "id": "#QualCLAlign.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--bgzf-threads" + }, + "id": "#QualCLAlign.cwl/BGZF_Threads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--split-atac-bam" + }, + "id": "#QualCLAlign.cwl/Generate_Bam" + }, + { + "inputBinding": { + "prefix": "--index" + }, + "type": "Directory", + "id": "#QualCLAlign.cwl/Index" + }, + { + "inputBinding": { + "prefix": "--use-star-long" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Long_Reads" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#QualCLAlign.cwl/Predefined_ATAC_Peaks" + }, + { + "inputBinding": { + "prefix": "--reader-annotation-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Reader_Annotation_Threads" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--reads", + "itemSeparator": "," + }, + "id": "#QualCLAlign.cwl/Reads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--run-name" + }, + "id": "#QualCLAlign.cwl/Run_Base_Name" + }, + { + "inputBinding": { + "prefix": "--star-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/STAR_Params" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--threads" + }, + "id": "#QualCLAlign.cwl/Threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#QualCLAlign.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--write-filtered-read-pairs" + }, + "id": "#QualCLAlign.cwl/Write_Filtered_Reads" + }, + { + "inputBinding": { + "prefix": "--bwa-mem2-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/bwa_mem2_Params" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*.bam" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/BAMFiles" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#QualCLAlign.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#QualCLAlign.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).BeadVersion)\n" + }, + "id": "#QualCLAlign.cwl/Bead_Version" + }, + { + "outputBinding": { + "glob": "*.failedReads.csv.gz" + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#QualCLAlign.cwl/Failed_Reads_CSVs" + }, + { + "outputBinding": { + "glob": "fastq_read_pairs.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/Fastq_read_pairs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Fragments" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Fragments_Index" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "${\n var obj = JSON.parse(self[0].contents);\n var libraries = [];\n var beadLibs = obj.BeadVersion\n for (var i in beadLibs){\n if (libraries.indexOf(beadLibs[i][\"Library\"]) == -1){ \n libraries.push(beadLibs[i][\"Library\"]);\n }\n }\n libraries.sort();\n return libraries\n}\n" + }, + "id": "#QualCLAlign.cwl/Libraries" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/Logs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Peaks_Index" + }, + { + "outputBinding": { + "glob": "*ReadQualityMetrics.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/QualCLAlignMetrics" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "outputEval": "${ \n var reads = []; \n var files = inputs.Reads\n for (var i in files){\n reads.push(files[i][\"basename\"]);\n }\n reads.sort();\n return(reads)\n}\n" + }, + "id": "#QualCLAlign.cwl/ReadsList" + }, + { + "type": "string", + "outputBinding": { + "glob": "genome_size.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return \"0\"; } return JSON.parse(self[0].contents); }" + }, + "id": "#QualCLAlign.cwl/Reference_Genome_Size" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Transposase_Sites" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Transposase_Sites_Index" + }, + { + "outputBinding": { + "glob": "*_ATAC_UnifiedMetrics.json" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/UnifiedMetrics" + } + ], + "id": "#QualCLAlign.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_valid_reads" + }, + { + "valueFrom": "BCR", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_splits", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG" + } + ], + "id": "#VDJ_Analyze_Reads_IG.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_valid_reads" + }, + { + "valueFrom": "TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_splits", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR" + } + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 1, + "ramMin": 1024 + } + ], + "baseCommand": [ + "AssembleAndAnnotate.sh" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/RSEC_Reads_Fastq" + }, + { + "type": "string", + "inputBinding": { + "position": 2 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/Read_Limit" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "position": 3 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_pruned.csv.gz" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/PyirCall" + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/igCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/tcrCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl" + }, + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "baseCommand": [ + "mist_vdj_compile_results.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--seq-metrics", + "position": 10 + }, + "id": "#VDJ_Compile_Results.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version", + "position": 2 + }, + "id": "#VDJ_Compile_Results.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "position": 0, + "itemSeparator": ",", + "prefix": "--cell-type-mapping-fp" + }, + "id": "#VDJ_Compile_Results.cwl/cellTypeMapping" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "inputBinding": { + "position": 4, + "prefix": "--ignore", + "itemSeparator": "," + }, + "id": "#VDJ_Compile_Results.cwl/chainsToIgnore" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 8, + "prefix": "--e-value-for-j" + }, + "id": "#VDJ_Compile_Results.cwl/evalueJgene" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 7, + "prefix": "--e-value-for-v" + }, + "id": "#VDJ_Compile_Results.cwl/evalueVgene" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 5 + }, + "id": "#VDJ_Compile_Results.cwl/igCalls" + }, + { + "type": "File", + "inputBinding": { + "position": 9, + "prefix": "--metadata-fp" + }, + "id": "#VDJ_Compile_Results.cwl/metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--putative-cells-json-fp", + "position": 3 + }, + "id": "#VDJ_Compile_Results.cwl/putativeCells" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 6 + }, + "id": "#VDJ_Compile_Results.cwl/tcrCalls" + } + ], + "outputs": [ + { + "doc": "VDJ data per cell, with distribution based error correction", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatable" + }, + { + "doc": "VDJ data per cell, including non-putative cells, no error correction applied", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell_uncorrected.csv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatableUncorrected" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_DBEC_images.tar.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDbecFilterImages" + }, + { + "doc": "AIRR compatible output that only reports the Dominant contigs, counts are DBEC corrected", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_Dominant_Contigs_AIRR.tsv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDominantContigsAIRR" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsCsv" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.json" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsJson" + }, + { + "type": [ + "null", + "File" + ], + "doc": "AIRR compatible output that reports all the congits, counts are not DBEC corrected", + "outputBinding": { + "glob": "*_VDJ_Unfiltered_Contigs_AIRR.tsv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjUnfilteredContigsAIRR" + } + ], + "id": "#VDJ_Compile_Results.cwl" + }, + { + "doc": "VDJ_GatherCalls collect the outputs from the multi-processed VDJ step into one file.\n", + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/theCalls" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls", + "id": "#VDJ_GatherCalls.cwl/gatheredCalls" + } + ], + "steps": [ + { + "in": [ + { + "source": "#VDJ_GatherCalls.cwl/theCalls", + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/theCalls" + } + ], + "out": [ + "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls" + ], + "run": { + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR", + "cwlVersion": "v1.2", + "class": "CommandLineTool", + "hints": [], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/theCalls" + } + ], + "arguments": [ + { + "shellQuote": false, + "valueFrom": "${\n if (!inputs.theCalls[0] ) {\n return (\"echo \\\"No outputs from PyIR detected in VDJ_GatherCalls\\\"\")\n }\n var inputFiles = \"\"\n if (!inputs.theCalls[0].path.split(\"_PrunePyIR\")[1]){\n inputFiles = \"zcat\"\n for (var i = 0; i < inputs.theCalls.length; i++) {\n inputFiles += \" \" + inputs.theCalls[i].path\n }\n inputFiles += \" | \"\n } else {\n inputFiles = \"zcat \" + inputs.theCalls[0].path.split(\"VDJ\")[0] + \"*\" + inputs.theCalls[0].path.split(\"_PrunePyIR\")[1].split(\"_Number_\")[0] + \"_Number_*.csv.gz | \"\n }\n var outputFileName = \"\\\"gzip > \" + inputs.theCalls[0].nameroot.split(\"_Number_\")[0] + \"_constant_region_called_pruned.csv.gz\" + \"\\\"\"\n var awkCommand = \"awk \\'NR==1{F=$1;print | \" + outputFileName + \" } $1!=F { print | \" + outputFileName + \" }\\' \"\n var outputCommand = inputFiles + awkCommand\n return (outputCommand)\n}" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_constant_region_called_pruned.csv.gz", + "outputEval": "${\n if (self.size == 0) {\n throw(\"No outputs from PyIR detected in VDJ_GatherCalls!\");\n } else {\n return(self);\n }\n}" + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/gatheredCalls" + } + ] + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls" + } + ], + "id": "#VDJ_GatherCalls.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq" + }, + { + "type": "int", + "id": "#VDJ_Preprocess_Reads.cwl/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/vdj_type" + } + ], + "outputs": [ + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq", + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Preprocess_Reads.cwl/RSEC_Reads_Fastq" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_cores" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_splits" + } + ], + "requirements": [ + { + "envDef": [ + { + "envValue": "8", + "envName": "CORES_ALLOCATED_PER_CWL_PROCESS" + } + ], + "class": "EnvVarRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "steps": [ + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads", + "requirements": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_RSEC_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/Valid_Reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_splits" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_valid_reads" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_Trim_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads_Fastq" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Trim_Report" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/Maximum_Threads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_valid_reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/vdj_type", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/vdj_type" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores" + ], + "run": { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits", + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/Maximum_Threads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/vdj_type" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_cores" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_splits" + } + ], + "expression": "${\n var num_splits = 64;\n var max_threads = parseInt(inputs.Maximum_Threads);\n if (!isNaN(max_threads)) {\n num_splits = parseInt(Math.max(max_threads, 8) * 0.7);\n }\n return ({\"num_splits\": num_splits, \"num_cores\": num_splits});\n}" + } + } + ], + "id": "#VDJ_Preprocess_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "mist_vdj_rsec_reads.py", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_RSEC_Reads.cwl/VDJ_Version" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--vdj-valid-reads", + "itemSeparator": "," + }, + "id": "#VDJ_RSEC_Reads.cwl/Valid_Reads" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--num-splits" + }, + "id": "#VDJ_RSEC_Reads.cwl/num_splits" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_RSEC_Reads.cwl/num_valid_reads" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*RSEC_Reads_Fastq_*.tar.gz" + }, + "id": "#VDJ_RSEC_Reads.cwl/RSEC_Reads_Fastq" + } + ], + "id": "#VDJ_RSEC_Reads.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#VDJ_Settings.cwl/_VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_VGene_Evalue" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Settings.cwl/VDJ_Version" + } + ], + "expression": "${\n var vdjVersion = null;\n if (!inputs._VDJ_Version) {\n vdjVersion = null;}\n else {\n var _VDJ_Version = inputs._VDJ_Version.toLowerCase();\n if (_VDJ_Version === \"human\" || _VDJ_Version === \"hs\" || _VDJ_Version === \"human vdj - bcr and tcr\") {\n vdjVersion = \"human\";\n } else if (_VDJ_Version === \"humanbcr\" || _VDJ_Version === \"human vdj - bcr only\") {\n vdjVersion = \"humanBCR\";\n } else if (_VDJ_Version === \"humantcr\" || _VDJ_Version === \"human vdj - tcr only\") {\n vdjVersion = \"humanTCR\";\n } else if (_VDJ_Version === \"mouse\" || _VDJ_Version === \"mm\" || _VDJ_Version === \"mouse vdj - bcr and tcr\") {\n vdjVersion = \"mouse\";\n } else if (_VDJ_Version === \"mousebcr\" || _VDJ_Version === \"mouse vdj - bcr only\") {\n vdjVersion = \"mouseBCR\";\n } else if (_VDJ_Version === \"mousetcr\" || _VDJ_Version === \"mouse vdj - tcr only\") {\n vdjVersion = \"mouseTCR\";\n } else {\n vdjVersion = inputs._VDJ_Version;\n }\n }\n\n return ({\n VDJ_Version: vdjVersion,\n })\n}", + "id": "#VDJ_Settings.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "VDJ_Trim_Reads.sh", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Trim_Reads.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads_Fastq" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cutadapt.log" + }, + "id": "#VDJ_Trim_Reads.cwl/Trim_Report" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*vdjtxt.gz" + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads" + } + ], + "id": "#VDJ_Trim_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_print_version.py" + ], + "stdout": "output.txt", + "inputs": [], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "output.txt", + "loadContents": true, + "outputEval": "$(self[0].contents)" + }, + "id": "#Version.cwl/version" + } + ], + "id": "#Version.cwl" + } + ], + "cwlVersion": "v1.2" +} diff --git a/target/executable/mapping/cellranger_atac_count/.config.vsh.yaml b/target/executable/mapping/cellranger_atac_count/.config.vsh.yaml new file mode 100644 index 00000000..5819f7c0 --- /dev/null +++ b/target/executable/mapping/cellranger_atac_count/.config.vsh.yaml @@ -0,0 +1,302 @@ +name: "cellranger_atac_count" +namespace: "mapping" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The fastq.gz files to align. Can also be a single directory containing\ + \ fastq.gz files." + info: null + example: + - "sample_S1_L001_R1_001.fastq.gz" + - "sample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The path to Cell Ranger reference tar.gz file. Can also be a directory." + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--description" + description: "Sample description to embed in output files" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cells" + description: "Define the top N barcodes with the most fragments overlapping peaks\ + \ as cells and override the cell calling algorithm. N must be a positive integer\ + \ <= 20,000. Use this option if the number of cells estimated by Cell Ranger\ + \ ATAC is not consistent with the barcode rank plot" + info: null + required: false + max: 20000 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--peaks" + description: "Override peak caller: specify peaks to use in downstream analyses\ + \ from supplied 3-column BED file. The supplied peaks file must be sorted by\ + \ position and not contain overlapping peaks; comment lines beginning with #\ + \ are allowed" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--dim_reduce" + description: "Dimensionality reduction mode for clustering" + info: null + default: + - "lsa" + required: false + choices: + - "lsa" + - "pca" + - "plsa" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--subsample_rate" + description: "Downsample to preserve this fraction of reads" + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--lanes" + description: "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex.\ + \ Use this if you have a sample sheet for an entire flow cell but only want\ + \ to generate a few lanes for further 10x Genomics analysis." + info: null + example: + - "1,3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger ATAC count." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_atac_tiny_bcl" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_atac:2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update \\\n&& apt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_atac_count/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/cellranger_atac_count" + executable: "target/executable/mapping/cellranger_atac_count/cellranger_atac_count" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/cellranger_atac_count/cellranger_atac_count b/target/executable/mapping/cellranger_atac_count/cellranger_atac_count new file mode 100755 index 00000000..df46b90e --- /dev/null +++ b/target/executable/mapping/cellranger_atac_count/cellranger_atac_count @@ -0,0 +1,1408 @@ +#!/usr/bin/env bash + +# cellranger_atac_count main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_atac_count" +VIASH_META_FUNCTIONALITY_NAME="cellranger_atac_count" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger_atac:2.1 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update \ +&& apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component mapping cellranger_atac_count" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_atac_count main" + echo "" + echo "Align fastq files using Cell Ranger ATAC count." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: sample_S1_L001_R1_001.fastq.gz;sample_S1_L001_R2_001.fastq.gz" + echo " The fastq.gz files to align. Can also be a single directory containing" + echo " fastq.gz files." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: reference.tar.gz" + echo " The path to Cell Ranger reference tar.gz file. Can also be a directory." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/output" + echo " The folder to store the alignment results." + echo "" + echo "Arguments:" + echo " --description" + echo " type: string" + echo " default:" + echo " Sample description to embed in output files" + echo "" + echo " --force_cells" + echo " type: integer" + echo " max: 20000" + echo " Define the top N barcodes with the most fragments overlapping peaks as" + echo " cells and override the cell calling algorithm. N must be a positive" + echo " integer <= 20,000. Use this option if the number of cells estimated by" + echo " Cell Ranger ATAC is not consistent with the barcode rank plot" + echo "" + echo " --peaks" + echo " type: file, file must exist" + echo " Override peak caller: specify peaks to use in downstream analyses from" + echo " supplied 3-column BED file. The supplied peaks file must be sorted by" + echo " position and not contain overlapping peaks; comment lines beginning with" + echo " # are allowed" + echo "" + echo " --dim_reduce" + echo " type: string" + echo " default: lsa" + echo " choices: [ lsa, pca, plsa ]" + echo " Dimensionality reduction mode for clustering" + echo "" + echo " --subsample_rate" + echo " type: double" + echo " example: 0.1" + echo " Downsample to preserve this fraction of reads" + echo "" + echo " --lanes" + echo " type: string, multiple values allowed" + echo " example: 1,3" + echo " bcl2fastq option. Semicolon-delimited series of lanes to demultiplex." + echo " Use this if you have a sample sheet for an entire flow cell but only" + echo " want to generate a few lanes for further 10x Genomics analysis." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_atac_count main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --description) + [ -n "$VIASH_PAR_DESCRIPTION" ] && ViashError Bad arguments for option \'--description\': \'$VIASH_PAR_DESCRIPTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DESCRIPTION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --description. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --description=*) + [ -n "$VIASH_PAR_DESCRIPTION" ] && ViashError Bad arguments for option \'--description=*\': \'$VIASH_PAR_DESCRIPTION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DESCRIPTION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_cells) + [ -n "$VIASH_PAR_FORCE_CELLS" ] && ViashError Bad arguments for option \'--force_cells\': \'$VIASH_PAR_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --force_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_cells=*) + [ -n "$VIASH_PAR_FORCE_CELLS" ] && ViashError Bad arguments for option \'--force_cells=*\': \'$VIASH_PAR_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peaks) + [ -n "$VIASH_PAR_PEAKS" ] && ViashError Bad arguments for option \'--peaks\': \'$VIASH_PAR_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEAKS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peaks. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peaks=*) + [ -n "$VIASH_PAR_PEAKS" ] && ViashError Bad arguments for option \'--peaks=*\': \'$VIASH_PAR_PEAKS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEAKS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dim_reduce) + [ -n "$VIASH_PAR_DIM_REDUCE" ] && ViashError Bad arguments for option \'--dim_reduce\': \'$VIASH_PAR_DIM_REDUCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIM_REDUCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dim_reduce. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dim_reduce=*) + [ -n "$VIASH_PAR_DIM_REDUCE" ] && ViashError Bad arguments for option \'--dim_reduce=*\': \'$VIASH_PAR_DIM_REDUCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DIM_REDUCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --subsample_rate) + [ -n "$VIASH_PAR_SUBSAMPLE_RATE" ] && ViashError Bad arguments for option \'--subsample_rate\': \'$VIASH_PAR_SUBSAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSAMPLE_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --subsample_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --subsample_rate=*) + [ -n "$VIASH_PAR_SUBSAMPLE_RATE" ] && ViashError Bad arguments for option \'--subsample_rate=*\': \'$VIASH_PAR_SUBSAMPLE_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSAMPLE_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lanes) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES="$2" + else + VIASH_PAR_LANES="$VIASH_PAR_LANES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lanes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lanes=*) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LANES="$VIASH_PAR_LANES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/cellranger_atac_count:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_DESCRIPTION+x} ]; then + VIASH_PAR_DESCRIPTION="" +fi +if [ -z ${VIASH_PAR_DIM_REDUCE+x} ]; then + VIASH_PAR_DIM_REDUCE="lsa" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_PEAKS" ] && [ ! -e "$VIASH_PAR_PEAKS" ]; then + ViashError "Input file '$VIASH_PAR_PEAKS' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_FORCE_CELLS" ]]; then + if ! [[ "$VIASH_PAR_FORCE_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--force_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_FORCE_CELLS -gt 20000 ]]; then + ViashError '--force_cells' has be less than or equal to 20000. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SUBSAMPLE_RATE" ]]; then + if ! [[ "$VIASH_PAR_SUBSAMPLE_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--subsample_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_DIM_REDUCE" ]; then + VIASH_PAR_DIM_REDUCE_CHOICES=("lsa;pca;plsa") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_DIM_REDUCE_CHOICES[*]};" =~ ";$VIASH_PAR_DIM_REDUCE;" ]]; then + ViashError '--dim_reduce' specified value of \'$VIASH_PAR_DIM_REDUCE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_PEAKS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PEAKS")" ) + VIASH_PAR_PEAKS=$(ViashDockerAutodetectMount "$VIASH_PAR_PEAKS") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_atac_count-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "${VIASH_PAR_REFERENCE}" | sed "s#'#'\"'\"'#g;s#.*#par_reference='&'#" ; else echo "# par_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_DESCRIPTION+x} ]; then echo "${VIASH_PAR_DESCRIPTION}" | sed "s#'#'\"'\"'#g;s#.*#par_description='&'#" ; else echo "# par_description="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_CELLS+x} ]; then echo "${VIASH_PAR_FORCE_CELLS}" | sed "s#'#'\"'\"'#g;s#.*#par_force_cells='&'#" ; else echo "# par_force_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_PEAKS+x} ]; then echo "${VIASH_PAR_PEAKS}" | sed "s#'#'\"'\"'#g;s#.*#par_peaks='&'#" ; else echo "# par_peaks="; fi ) +$( if [ ! -z ${VIASH_PAR_DIM_REDUCE+x} ]; then echo "${VIASH_PAR_DIM_REDUCE}" | sed "s#'#'\"'\"'#g;s#.*#par_dim_reduce='&'#" ; else echo "# par_dim_reduce="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSAMPLE_RATE+x} ]; then echo "${VIASH_PAR_SUBSAMPLE_RATE}" | sed "s#'#'\"'\"'#g;s#.*#par_subsample_rate='&'#" ; else echo "# par_subsample_rate="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\"'\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# just to make sure paths are absolute +par_reference=\`realpath \$par_reference\` +par_output=\`realpath \$par_output\` + +echo "Creating temporary directory" +tmpdir=\$(mktemp -d "\$meta_temp_dir/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +echo "Locating fastqs" +fastq_dir="\$tmpdir/fastqs" +mkdir -p "\$fastq_dir" +IFS=";" +for var in \$par_input; do + unset IFS + abs_path=\`realpath \$var\` + if [ -d "\$abs_path" ]; then + find "\$abs_path" -name *.fastq.gz -exec ln -s {} "\$fastq_dir" \\; + else + ln -s "\$abs_path" "\$fastq_dir" + fi +done + +echo "fastq_dir content: \$(ls \$fastq_dir)" + +echo "Processing reference" +# process reference +if file \$par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="\$tmpdir/fastqs" + mkdir -p "\$reference_dir" + tar -xvf "\$par_reference" -C "\$reference_dir" --strip-components=1 + par_reference="\$reference_dir" +fi + +# cd into tempdir +cd "\$tmpdir" + +if [ ! -z "\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\`python -c "print(int('\$meta_memory_gb') - 2)"\` +fi + +echo "Running cellranger-atac count" + +id=myoutput +cellranger-atac count \\ + --id "\$id" \\ + --fastqs "\$fastq_dir" \\ + --reference "\$par_reference" \\ + --dim-reduce "\$par_dim_reduce" \\ + --description "\$par_description" \\ + \${par_lanes:+--lanes=\${par_lanes[*]}} \\ + \${par_force_cells:+--force-cells=\$par_force_cells} \\ + \${par_subsample_rate:+--subsample-rate=\$par_subsample_rate} \\ + \${memory_gb:+--localmem=\$memory_gb} \\ + \${meta_cpus:+--localcores=\$meta_cpus} \\ + \${par_lanes:+--lanes=\${par_lanes[*]}} + +echo "Copying output" +if [ -d "\$id/outs/" ]; then + if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" + fi + mv "\$id/outs/"* "\$par_output" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_PEAKS" ]; then + VIASH_PAR_PEAKS=$(ViashDockerStripAutomount "$VIASH_PAR_PEAKS") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/cellranger_atac_count/nextflow_labels.config b/target/executable/mapping/cellranger_atac_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/cellranger_atac_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/cellranger_count/.config.vsh.yaml b/target/executable/mapping/cellranger_count/.config.vsh.yaml new file mode 100644 index 00000000..b17cfad2 --- /dev/null +++ b/target/executable/mapping/cellranger_count/.config.vsh.yaml @@ -0,0 +1,446 @@ +name: "cellranger_count" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The fastq.gz files to align. Can also be a single directory containing\ + \ fastq.gz files." + info: null + example: + - "sample_S1_L001_R1_001.fastq.gz" + - "sample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The path to Cell Ranger reference tar.gz file. Can also be a directory." + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--feature_reference" + description: "Feature reference CSV file, declaring Feature Barcode constructs\ + \ and associated barcodes\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm." + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cells" + description: "Force pipeline to use this number of cells, bypassing cell calling\ + \ algorithm." + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chemistry" + description: "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single\ + \ Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1 \n NOTE:\ + \ this mode cannot be auto-detected. It must be set explicitly with this option.\n\ + - SC3Pv2: Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv4: Single Cell\ + \ 3' v4\n- SC3Pv3LT: Single Cell 3' v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n\ + - SC5P-R2-v3: Single Cell 5', paired-end/R2-only\n- SC5P-PE-v3: Single Cell\ + \ 5' paired-end v3 (GEM-X)\n- SC5P-PE: Single Cell 5' paired-end\n- SC5P-R2:\ + \ Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2 or 5'\n-\ + \ ARC-v1: for analyzing the Gene Expression portion of Multiome data. \n NOTE:\ + \ when the pipeline auto-detects ARC-v1 chemistry, an error is triggered.\n\ + See https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv4" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC5P-PE-v3" + - "SC5P-PE" + - "SC5P-R2" + - "SC5P-R2-v3" + - "SC-FB" + - "ARC-v1" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--tenx_cloud_token_path" + description: "The 10x Cloud Analysis user token used to enable cell annotation." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_annotation_model" + description: "\"Cell annotation model to use. If auto, uses the default model\ + \ for the species.\nIf not given, does not run cell annotation.\"\n" + info: null + required: false + choices: + - "auto" + - "human_pca_v1_beta" + - "mouse_pca_v1_beta" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--include_introns" + description: "Include intronic reads in count." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--r1_length" + description: "Hard trim the input Read 1 to this length before analysis" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--r2_length" + description: "Hard trim the input Read 2 to this length before analysis" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--lanes" + description: "Only use FASTQs from selected lanes." + info: null + example: + - 1 + - 2 + - 3 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--library_compatibility_check" + description: "Whether to check for barcode compatibility between libraries.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_crispr_umi" + description: "Set the minimum number of CRISPR guide RNA UMIs required for protospacer\ + \ detection.\nIf a lower or higher sensitivity is desired for detection, this\ + \ value can be customized\naccording to specific experimental needs. Applicable\ + \ only to datasets that include a\nCRISPR Guide Capture library.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger count." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +- type: "file" + path: "setup_logger.py" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_count/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/cellranger_count" + executable: "target/executable/mapping/cellranger_count/cellranger_count" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/cellranger_count/cellranger_count b/target/executable/mapping/cellranger_count/cellranger_count new file mode 100755 index 00000000..8b755a71 --- /dev/null +++ b/target/executable/mapping/cellranger_count/cellranger_count @@ -0,0 +1,1661 @@ +#!/usr/bin/env bash + +# cellranger_count main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Samuel D'Souza (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_count" +VIASH_META_FUNCTIONALITY_NAME="cellranger_count" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger:9.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Samuel D'Souza, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component mapping cellranger_count" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_count main" + echo "" + echo "Align fastq files using Cell Ranger count." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: sample_S1_L001_R1_001.fastq.gz;sample_S1_L001_R2_001.fastq.gz" + echo " The fastq.gz files to align. Can also be a single directory containing" + echo " fastq.gz files." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: reference.tar.gz" + echo " The path to Cell Ranger reference tar.gz file. Can also be a directory." + echo "" + echo " --feature_reference" + echo " type: file, file must exist" + echo " Feature reference CSV file, declaring Feature Barcode constructs and" + echo " associated barcodes" + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/output" + echo " The folder to store the alignment results." + echo "" + echo "Arguments:" + echo " --expect_cells" + echo " type: integer" + echo " example: 3000" + echo " Expected number of recovered cells, used as input to cell calling" + echo " algorithm." + echo "" + echo " --force_cells" + echo " type: integer" + echo " example: 3000" + echo " Force pipeline to use this number of cells, bypassing cell calling" + echo " algorithm." + echo "" + echo " --chemistry" + echo " type: string" + echo " default: auto" + echo " choices: [ auto, threeprime, fiveprime, SC3Pv1, SC3Pv2, SC3Pv3, SC3Pv4," + echo "SC3Pv3LT, SC3Pv3HT, SC5P-PE-v3, SC5P-PE, SC5P-R2, SC5P-R2-v3, SC-FB, ARC-v1 ]" + echo " Assay configuration." + echo " - auto: autodetect mode" + echo " - threeprime: Single Cell 3'" + echo " - fiveprime: Single Cell 5'" + echo " - SC3Pv1: Single Cell 3' v1" + echo " NOTE: this mode cannot be auto-detected. It must be set explicitly" + echo " with this option." + echo " - SC3Pv2: Single Cell 3' v2" + echo " - SC3Pv3: Single Cell 3' v3" + echo " - SC3Pv4: Single Cell 3' v4" + echo " - SC3Pv3LT: Single Cell 3' v3 LT" + echo " - SC3Pv3HT: Single Cell 3' v3 HT" + echo " - SC5P-R2-v3: Single Cell 5', paired-end/R2-only" + echo " - SC5P-PE-v3: Single Cell 5' paired-end v3 (GEM-X)" + echo " - SC5P-PE: Single Cell 5' paired-end" + echo " - SC5P-R2: Single Cell 5' R2-only" + echo " - SC-FB: Single Cell Antibody-only 3' v2 or 5'" + echo " - ARC-v1: for analyzing the Gene Expression portion of Multiome data." + echo " NOTE: when the pipeline auto-detects ARC-v1 chemistry, an error is" + echo " triggered." + echo " See" + echo " " + echo "https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-" + echo " for more information." + echo "" + echo " --tenx_cloud_token_path" + echo " type: file, file must exist" + echo " The 10x Cloud Analysis user token used to enable cell annotation." + echo "" + echo " --cell_annotation_model" + echo " type: string" + echo " choices: [ auto, human_pca_v1_beta, mouse_pca_v1_beta ]" + echo " \"Cell annotation model to use. If auto, uses the default model for the" + echo " species." + echo " If not given, does not run cell annotation.\"" + echo "" + echo " --secondary_analysis" + echo " type: boolean" + echo " default: false" + echo " Whether or not to run the secondary analysis e.g. clustering." + echo "" + echo " --generate_bam" + echo " type: boolean" + echo " default: true" + echo " Whether to generate a BAM file." + echo "" + echo " --include_introns" + echo " type: boolean" + echo " default: true" + echo " Include intronic reads in count." + echo "" + echo " --r1_length" + echo " type: integer" + echo " Hard trim the input Read 1 to this length before analysis" + echo "" + echo " --r2_length" + echo " type: integer" + echo " Hard trim the input Read 2 to this length before analysis" + echo "" + echo " --lanes" + echo " type: integer, multiple values allowed" + echo " example: 1;2;3" + echo " Only use FASTQs from selected lanes." + echo "" + echo " --library_compatibility_check" + echo " type: boolean" + echo " default: true" + echo " Whether to check for barcode compatibility between libraries." + echo "" + echo " --min_crispr_umi" + echo " type: integer" + echo " min: 1" + echo " Set the minimum number of CRISPR guide RNA UMIs required for protospacer" + echo " detection." + echo " If a lower or higher sensitivity is desired for detection, this value" + echo " can be customized" + echo " according to specific experimental needs. Applicable only to datasets" + echo " that include a" + echo " CRISPR Guide Capture library." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_count main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --feature_reference) + [ -n "$VIASH_PAR_FEATURE_REFERENCE" ] && ViashError Bad arguments for option \'--feature_reference\': \'$VIASH_PAR_FEATURE_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_reference=*) + [ -n "$VIASH_PAR_FEATURE_REFERENCE" ] && ViashError Bad arguments for option \'--feature_reference=*\': \'$VIASH_PAR_FEATURE_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --expect_cells) + [ -n "$VIASH_PAR_EXPECT_CELLS" ] && ViashError Bad arguments for option \'--expect_cells\': \'$VIASH_PAR_EXPECT_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECT_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --expect_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --expect_cells=*) + [ -n "$VIASH_PAR_EXPECT_CELLS" ] && ViashError Bad arguments for option \'--expect_cells=*\': \'$VIASH_PAR_EXPECT_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXPECT_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --force_cells) + [ -n "$VIASH_PAR_FORCE_CELLS" ] && ViashError Bad arguments for option \'--force_cells\': \'$VIASH_PAR_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --force_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --force_cells=*) + [ -n "$VIASH_PAR_FORCE_CELLS" ] && ViashError Bad arguments for option \'--force_cells=*\': \'$VIASH_PAR_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FORCE_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chemistry) + [ -n "$VIASH_PAR_CHEMISTRY" ] && ViashError Bad arguments for option \'--chemistry\': \'$VIASH_PAR_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHEMISTRY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chemistry. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chemistry=*) + [ -n "$VIASH_PAR_CHEMISTRY" ] && ViashError Bad arguments for option \'--chemistry=*\': \'$VIASH_PAR_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHEMISTRY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tenx_cloud_token_path) + [ -n "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && ViashError Bad arguments for option \'--tenx_cloud_token_path\': \'$VIASH_PAR_TENX_CLOUD_TOKEN_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TENX_CLOUD_TOKEN_PATH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tenx_cloud_token_path. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tenx_cloud_token_path=*) + [ -n "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && ViashError Bad arguments for option \'--tenx_cloud_token_path=*\': \'$VIASH_PAR_TENX_CLOUD_TOKEN_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_annotation_model) + [ -n "$VIASH_PAR_CELL_ANNOTATION_MODEL" ] && ViashError Bad arguments for option \'--cell_annotation_model\': \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_ANNOTATION_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_annotation_model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_annotation_model=*) + [ -n "$VIASH_PAR_CELL_ANNOTATION_MODEL" ] && ViashError Bad arguments for option \'--cell_annotation_model=*\': \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_ANNOTATION_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --secondary_analysis) + [ -n "$VIASH_PAR_SECONDARY_ANALYSIS" ] && ViashError Bad arguments for option \'--secondary_analysis\': \'$VIASH_PAR_SECONDARY_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ANALYSIS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --secondary_analysis. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --secondary_analysis=*) + [ -n "$VIASH_PAR_SECONDARY_ANALYSIS" ] && ViashError Bad arguments for option \'--secondary_analysis=*\': \'$VIASH_PAR_SECONDARY_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ANALYSIS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --generate_bam) + [ -n "$VIASH_PAR_GENERATE_BAM" ] && ViashError Bad arguments for option \'--generate_bam\': \'$VIASH_PAR_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENERATE_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --generate_bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --generate_bam=*) + [ -n "$VIASH_PAR_GENERATE_BAM" ] && ViashError Bad arguments for option \'--generate_bam=*\': \'$VIASH_PAR_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENERATE_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --include_introns) + [ -n "$VIASH_PAR_INCLUDE_INTRONS" ] && ViashError Bad arguments for option \'--include_introns\': \'$VIASH_PAR_INCLUDE_INTRONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INCLUDE_INTRONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --include_introns. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --include_introns=*) + [ -n "$VIASH_PAR_INCLUDE_INTRONS" ] && ViashError Bad arguments for option \'--include_introns=*\': \'$VIASH_PAR_INCLUDE_INTRONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INCLUDE_INTRONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --r1_length) + [ -n "$VIASH_PAR_R1_LENGTH" ] && ViashError Bad arguments for option \'--r1_length\': \'$VIASH_PAR_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R1_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --r1_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --r1_length=*) + [ -n "$VIASH_PAR_R1_LENGTH" ] && ViashError Bad arguments for option \'--r1_length=*\': \'$VIASH_PAR_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R1_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --r2_length) + [ -n "$VIASH_PAR_R2_LENGTH" ] && ViashError Bad arguments for option \'--r2_length\': \'$VIASH_PAR_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R2_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --r2_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --r2_length=*) + [ -n "$VIASH_PAR_R2_LENGTH" ] && ViashError Bad arguments for option \'--r2_length=*\': \'$VIASH_PAR_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_R2_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --lanes) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES="$2" + else + VIASH_PAR_LANES="$VIASH_PAR_LANES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --lanes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --lanes=*) + if [ -z "$VIASH_PAR_LANES" ]; then + VIASH_PAR_LANES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LANES="$VIASH_PAR_LANES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_compatibility_check) + [ -n "$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK" ] && ViashError Bad arguments for option \'--library_compatibility_check\': \'$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_compatibility_check. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_compatibility_check=*) + [ -n "$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK" ] && ViashError Bad arguments for option \'--library_compatibility_check=*\': \'$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_crispr_umi) + [ -n "$VIASH_PAR_MIN_CRISPR_UMI" ] && ViashError Bad arguments for option \'--min_crispr_umi\': \'$VIASH_PAR_MIN_CRISPR_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CRISPR_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_crispr_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_crispr_umi=*) + [ -n "$VIASH_PAR_MIN_CRISPR_UMI" ] && ViashError Bad arguments for option \'--min_crispr_umi=*\': \'$VIASH_PAR_MIN_CRISPR_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CRISPR_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/cellranger_count:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_CHEMISTRY+x} ]; then + VIASH_PAR_CHEMISTRY="auto" +fi +if [ -z ${VIASH_PAR_SECONDARY_ANALYSIS+x} ]; then + VIASH_PAR_SECONDARY_ANALYSIS="false" +fi +if [ -z ${VIASH_PAR_GENERATE_BAM+x} ]; then + VIASH_PAR_GENERATE_BAM="true" +fi +if [ -z ${VIASH_PAR_INCLUDE_INTRONS+x} ]; then + VIASH_PAR_INCLUDE_INTRONS="true" +fi +if [ -z ${VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK+x} ]; then + VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK="true" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ] && [ ! -e "$VIASH_PAR_FEATURE_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_FEATURE_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && [ ! -e "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + ViashError "Input file '$VIASH_PAR_TENX_CLOUD_TOKEN_PATH' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_EXPECT_CELLS" ]]; then + if ! [[ "$VIASH_PAR_EXPECT_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--expect_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FORCE_CELLS" ]]; then + if ! [[ "$VIASH_PAR_FORCE_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--force_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SECONDARY_ANALYSIS" ]]; then + if ! [[ "$VIASH_PAR_SECONDARY_ANALYSIS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--secondary_analysis' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENERATE_BAM" ]]; then + if ! [[ "$VIASH_PAR_GENERATE_BAM" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--generate_bam' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_INCLUDE_INTRONS" ]]; then + if ! [[ "$VIASH_PAR_INCLUDE_INTRONS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--include_introns' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_R1_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_R1_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--r1_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_R2_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_R2_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--r2_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_LANES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_LANES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--lanes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK" ]]; then + if ! [[ "$VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--library_compatibility_check' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CRISPR_UMI" ]]; then + if ! [[ "$VIASH_PAR_MIN_CRISPR_UMI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_crispr_umi' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MIN_CRISPR_UMI -lt 1 ]]; then + ViashError '--min_crispr_umi' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_CHEMISTRY" ]; then + VIASH_PAR_CHEMISTRY_CHOICES=("auto;threeprime;fiveprime;SC3Pv1;SC3Pv2;SC3Pv3;SC3Pv4;SC3Pv3LT;SC3Pv3HT;SC5P-PE-v3;SC5P-PE;SC5P-R2;SC5P-R2-v3;SC-FB;ARC-v1") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CHEMISTRY_CHOICES[*]};" =~ ";$VIASH_PAR_CHEMISTRY;" ]]; then + ViashError '--chemistry' specified value of \'$VIASH_PAR_CHEMISTRY\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CELL_ANNOTATION_MODEL" ]; then + VIASH_PAR_CELL_ANNOTATION_MODEL_CHOICES=("auto;human_pca_v1_beta;mouse_pca_v1_beta") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CELL_ANNOTATION_MODEL_CHOICES[*]};" =~ ";$VIASH_PAR_CELL_ANNOTATION_MODEL;" ]]; then + ViashError '--cell_annotation_model' specified value of \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FEATURE_REFERENCE")" ) + VIASH_PAR_FEATURE_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_FEATURE_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH")" ) + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashDockerAutodetectMount "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_count-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "${VIASH_PAR_REFERENCE}" | sed "s#'#'\"'\"'#g;s#.*#par_reference='&'#" ; else echo "# par_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_FEATURE_REFERENCE+x} ]; then echo "${VIASH_PAR_FEATURE_REFERENCE}" | sed "s#'#'\"'\"'#g;s#.*#par_feature_reference='&'#" ; else echo "# par_feature_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_EXPECT_CELLS+x} ]; then echo "${VIASH_PAR_EXPECT_CELLS}" | sed "s#'#'\"'\"'#g;s#.*#par_expect_cells='&'#" ; else echo "# par_expect_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_CELLS+x} ]; then echo "${VIASH_PAR_FORCE_CELLS}" | sed "s#'#'\"'\"'#g;s#.*#par_force_cells='&'#" ; else echo "# par_force_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_CHEMISTRY+x} ]; then echo "${VIASH_PAR_CHEMISTRY}" | sed "s#'#'\"'\"'#g;s#.*#par_chemistry='&'#" ; else echo "# par_chemistry="; fi ) +$( if [ ! -z ${VIASH_PAR_TENX_CLOUD_TOKEN_PATH+x} ]; then echo "${VIASH_PAR_TENX_CLOUD_TOKEN_PATH}" | sed "s#'#'\"'\"'#g;s#.*#par_tenx_cloud_token_path='&'#" ; else echo "# par_tenx_cloud_token_path="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_ANNOTATION_MODEL+x} ]; then echo "${VIASH_PAR_CELL_ANNOTATION_MODEL}" | sed "s#'#'\"'\"'#g;s#.*#par_cell_annotation_model='&'#" ; else echo "# par_cell_annotation_model="; fi ) +$( if [ ! -z ${VIASH_PAR_SECONDARY_ANALYSIS+x} ]; then echo "${VIASH_PAR_SECONDARY_ANALYSIS}" | sed "s#'#'\"'\"'#g;s#.*#par_secondary_analysis='&'#" ; else echo "# par_secondary_analysis="; fi ) +$( if [ ! -z ${VIASH_PAR_GENERATE_BAM+x} ]; then echo "${VIASH_PAR_GENERATE_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_generate_bam='&'#" ; else echo "# par_generate_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_INCLUDE_INTRONS+x} ]; then echo "${VIASH_PAR_INCLUDE_INTRONS}" | sed "s#'#'\"'\"'#g;s#.*#par_include_introns='&'#" ; else echo "# par_include_introns="; fi ) +$( if [ ! -z ${VIASH_PAR_R1_LENGTH+x} ]; then echo "${VIASH_PAR_R1_LENGTH}" | sed "s#'#'\"'\"'#g;s#.*#par_r1_length='&'#" ; else echo "# par_r1_length="; fi ) +$( if [ ! -z ${VIASH_PAR_R2_LENGTH+x} ]; then echo "${VIASH_PAR_R2_LENGTH}" | sed "s#'#'\"'\"'#g;s#.*#par_r2_length='&'#" ; else echo "# par_r2_length="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\"'\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK+x} ]; then echo "${VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK}" | sed "s#'#'\"'\"'#g;s#.*#par_library_compatibility_check='&'#" ; else echo "# par_library_compatibility_check="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_CRISPR_UMI+x} ]; then echo "${VIASH_PAR_MIN_CRISPR_UMI}" | sed "s#'#'\"'\"'#g;s#.*#par_min_crispr_umi='&'#" ; else echo "# par_min_crispr_umi="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# just to make sure paths are absolute +par_reference=\`realpath \$par_reference\` +par_output=\`realpath \$par_output\` + +# create temporary directory +tmpdir=\$(mktemp -d "\$meta_temp_dir/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +fastq_dir="\$tmpdir/fastqs" +mkdir -p "\$fastq_dir" +IFS=";" +for var in \$par_input; do + unset IFS + abs_path=\`realpath \$var\` + if [ -d "\$abs_path" ]; then + find "\$abs_path" -name *.fastq.gz -exec ln -s {} "\$fastq_dir" \\; + else + ln -s "\$abs_path" "\$fastq_dir" + fi +done + +# process reference +if file \$par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="\$tmpdir/fastqs" + mkdir -p "\$reference_dir" + tar -xvf "\$par_reference" -C "\$reference_dir" --strip-components=1 + par_reference="\$reference_dir" +fi + +# cd into tempdir +cd "\$tmpdir" + +no_secondary_analysis="" +if [ "\$par_secondary_analysis" == "false" ]; then + no_secondary_analysis="true" +fi + +IFS="," +id=myoutput +cellranger count \\ + --id="\$id" \\ + --fastqs="\$fastq_dir" \\ + --transcriptome="\$par_reference" \\ + --include-introns="\$par_include_introns" \\ + \${par_feature_reference:+--feature-ref=\$par_feature_reference} \\ + \${par_cell_annotation_model:+--cell-annotation-model=\$par_cell_annotation_model} \\ + \${par_tenx_cloud_token_path:+--tenx-cloud-token-path=\$par_tenx_cloud_token_path} \\ + \${meta_cpus:+--localcores=\$meta_cpus} \\ + \${meta_memory_gb:+--localmem=\$((meta_memory_gb-2))} \\ + \${par_expect_cells:+--expect-cells=\$par_expect_cells} \\ + \${par_force_cells:+--force-cells=\$par_force_cells} \\ + \${par_chemistry:+--chemistry="\$par_chemistry"} \\ + \${par_generate_bam:+--create-bam=\$par_generate_bam} \\ + \${no_secondary_analysis:+--nosecondary} \\ + \${par_r1_length:+--r1-length=\$par_r1_length} \\ + \${par_r2_length:+--r2-length=\$par_r2_length} \\ + \${par_lanes:+--lanes=\${par_lanes[*]}} \\ + \${par_library_compatibility_check:+--check-library-compatibility=\$par_library_compatibility_check}\\ + --disable-ui +unset IFS + +echo "Copying output" +if [ -d "\$id/outs/" ]; then + if [ ! -d "\$par_output" ]; then + mkdir -p "\$par_output" + fi + mv "\$id/outs/"* "\$par_output" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ]; then + VIASH_PAR_FEATURE_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_FEATURE_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashDockerStripAutomount "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/cellranger_count/nextflow_labels.config b/target/executable/mapping/cellranger_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/cellranger_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/cellranger_count_split/.config.vsh.yaml b/target/executable/mapping/cellranger_count_split/.config.vsh.yaml new file mode 100644 index 00000000..022b75ce --- /dev/null +++ b/target/executable/mapping/cellranger_count_split/.config.vsh.yaml @@ -0,0 +1,290 @@ +name: "cellranger_count_split" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Output directory from a Cell Ranger count run." + info: null + example: + - "input_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--filtered_h5" + description: "Output path for the h5 file storing the filtered counts.\n" + info: null + example: + - "filtered_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--metrics_summary" + description: "Where to store the 'metrics_summary' CSV file.\n" + info: null + example: + - "metrics_summary.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--molecule_info" + description: "Where to store Cell Ranger's 'molecule_info.h5' file.\n" + info: null + example: + - "molecule_info.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "Location of output BAM files.\n" + info: null + example: + - "possorted_genome_bam.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bai" + description: "Where to store the BAM index files.\n" + info: null + example: + - "possorted_genome_bam.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--raw_h5" + description: "Output path for the h5 file storing the raw counts.\n" + info: null + example: + - "raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split 10x Cell Ranger output directory into separate output fields." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:jammy" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "apt update && apt upgrade -y" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_count_split/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/cellranger_count_split" + executable: "target/executable/mapping/cellranger_count_split/cellranger_count_split" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/cellranger_count_split/cellranger_count_split b/target/executable/mapping/cellranger_count_split/cellranger_count_split new file mode 100755 index 00000000..7d0dd442 --- /dev/null +++ b/target/executable/mapping/cellranger_count_split/cellranger_count_split @@ -0,0 +1,1299 @@ +#!/usr/bin/env bash + +# cellranger_count_split main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Samuel D'Souza (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_count_split" +VIASH_META_FUNCTIONALITY_NAME="cellranger_count_split" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:jammy +ENTRYPOINT [] +RUN apt update && apt upgrade -y +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Samuel D'Souza, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component mapping cellranger_count_split" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_count_split main" + echo "" + echo "Split 10x Cell Ranger output directory into separate output fields." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input_dir" + echo " Output directory from a Cell Ranger count run." + echo "" + echo " --filtered_h5" + echo " type: file, output, file must exist" + echo " example: filtered_feature_bc_matrix.h5" + echo " Output path for the h5 file storing the filtered counts." + echo "" + echo " --metrics_summary" + echo " type: file, output, file must exist" + echo " example: metrics_summary.csv" + echo " Where to store the 'metrics_summary' CSV file." + echo "" + echo " --molecule_info" + echo " type: file, output, file must exist" + echo " example: molecule_info.h5" + echo " Where to store Cell Ranger's 'molecule_info.h5' file." + echo "" + echo " --bam" + echo " type: file, output, file must exist" + echo " example: possorted_genome_bam.bam" + echo " Location of output BAM files." + echo "" + echo " --bai" + echo " type: file, output, file must exist" + echo " example: possorted_genome_bam.bam.bai" + echo " Where to store the BAM index files." + echo "" + echo " --raw_h5" + echo " type: file, output, file must exist" + echo " example: raw_feature_bc_matrix.h5" + echo " Output path for the h5 file storing the raw counts." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_count_split main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --filtered_h5) + [ -n "$VIASH_PAR_FILTERED_H5" ] && ViashError Bad arguments for option \'--filtered_h5\': \'$VIASH_PAR_FILTERED_H5\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTERED_H5="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --filtered_h5. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --filtered_h5=*) + [ -n "$VIASH_PAR_FILTERED_H5" ] && ViashError Bad arguments for option \'--filtered_h5=*\': \'$VIASH_PAR_FILTERED_H5\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTERED_H5=$(ViashRemoveFlags "$1") + shift 1 + ;; + --metrics_summary) + [ -n "$VIASH_PAR_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--metrics_summary\': \'$VIASH_PAR_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRICS_SUMMARY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --metrics_summary. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --metrics_summary=*) + [ -n "$VIASH_PAR_METRICS_SUMMARY" ] && ViashError Bad arguments for option \'--metrics_summary=*\': \'$VIASH_PAR_METRICS_SUMMARY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRICS_SUMMARY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --molecule_info) + [ -n "$VIASH_PAR_MOLECULE_INFO" ] && ViashError Bad arguments for option \'--molecule_info\': \'$VIASH_PAR_MOLECULE_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOLECULE_INFO="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --molecule_info. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --molecule_info=*) + [ -n "$VIASH_PAR_MOLECULE_INFO" ] && ViashError Bad arguments for option \'--molecule_info=*\': \'$VIASH_PAR_MOLECULE_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOLECULE_INFO=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bam) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bam=*) + [ -n "$VIASH_PAR_BAM" ] && ViashError Bad arguments for option \'--bam=*\': \'$VIASH_PAR_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bai) + [ -n "$VIASH_PAR_BAI" ] && ViashError Bad arguments for option \'--bai\': \'$VIASH_PAR_BAI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bai. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bai=*) + [ -n "$VIASH_PAR_BAI" ] && ViashError Bad arguments for option \'--bai=*\': \'$VIASH_PAR_BAI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --raw_h5) + [ -n "$VIASH_PAR_RAW_H5" ] && ViashError Bad arguments for option \'--raw_h5\': \'$VIASH_PAR_RAW_H5\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RAW_H5="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --raw_h5. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --raw_h5=*) + [ -n "$VIASH_PAR_RAW_H5" ] && ViashError Bad arguments for option \'--raw_h5=*\': \'$VIASH_PAR_RAW_H5\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RAW_H5=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/cellranger_count_split:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_FILTERED_H5" ] && [ ! -d "$(dirname "$VIASH_PAR_FILTERED_H5")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_FILTERED_H5")" +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ] && [ ! -d "$(dirname "$VIASH_PAR_METRICS_SUMMARY")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_METRICS_SUMMARY")" +fi +if [ ! -z "$VIASH_PAR_MOLECULE_INFO" ] && [ ! -d "$(dirname "$VIASH_PAR_MOLECULE_INFO")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_MOLECULE_INFO")" +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -d "$(dirname "$VIASH_PAR_BAM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_BAM")" +fi +if [ ! -z "$VIASH_PAR_BAI" ] && [ ! -d "$(dirname "$VIASH_PAR_BAI")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_BAI")" +fi +if [ ! -z "$VIASH_PAR_RAW_H5" ] && [ ! -d "$(dirname "$VIASH_PAR_RAW_H5")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_RAW_H5")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_FILTERED_H5" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FILTERED_H5")" ) + VIASH_PAR_FILTERED_H5=$(ViashDockerAutodetectMount "$VIASH_PAR_FILTERED_H5") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_FILTERED_H5" ) +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_METRICS_SUMMARY")" ) + VIASH_PAR_METRICS_SUMMARY=$(ViashDockerAutodetectMount "$VIASH_PAR_METRICS_SUMMARY") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_METRICS_SUMMARY" ) +fi +if [ ! -z "$VIASH_PAR_MOLECULE_INFO" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MOLECULE_INFO")" ) + VIASH_PAR_MOLECULE_INFO=$(ViashDockerAutodetectMount "$VIASH_PAR_MOLECULE_INFO") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_MOLECULE_INFO" ) +fi +if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAM")" ) + VIASH_PAR_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_BAM") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_BAM" ) +fi +if [ ! -z "$VIASH_PAR_BAI" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BAI")" ) + VIASH_PAR_BAI=$(ViashDockerAutodetectMount "$VIASH_PAR_BAI") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_BAI" ) +fi +if [ ! -z "$VIASH_PAR_RAW_H5" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_RAW_H5")" ) + VIASH_PAR_RAW_H5=$(ViashDockerAutodetectMount "$VIASH_PAR_RAW_H5") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_RAW_H5" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_count_split-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTERED_H5+x} ]; then echo "${VIASH_PAR_FILTERED_H5}" | sed "s#'#'\"'\"'#g;s#.*#par_filtered_h5='&'#" ; else echo "# par_filtered_h5="; fi ) +$( if [ ! -z ${VIASH_PAR_METRICS_SUMMARY+x} ]; then echo "${VIASH_PAR_METRICS_SUMMARY}" | sed "s#'#'\"'\"'#g;s#.*#par_metrics_summary='&'#" ; else echo "# par_metrics_summary="; fi ) +$( if [ ! -z ${VIASH_PAR_MOLECULE_INFO+x} ]; then echo "${VIASH_PAR_MOLECULE_INFO}" | sed "s#'#'\"'\"'#g;s#.*#par_molecule_info='&'#" ; else echo "# par_molecule_info="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\"'\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAI+x} ]; then echo "${VIASH_PAR_BAI}" | sed "s#'#'\"'\"'#g;s#.*#par_bai='&'#" ; else echo "# par_bai="; fi ) +$( if [ ! -z ${VIASH_PAR_RAW_H5+x} ]; then echo "${VIASH_PAR_RAW_H5}" | sed "s#'#'\"'\"'#g;s#.*#par_raw_h5='&'#" ; else echo "# par_raw_h5="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +filtered_h5="\$par_input/filtered_feature_bc_matrix.h5" +if [ -f "\$filtered_h5" ] && [ ! -z "\$par_filtered_h5" ]; then + echo "+ cp \$filtered_h5 \$par_filtered_h5" + cp "\$filtered_h5" "\$par_filtered_h5" +fi + +metrics_summary="\$par_input/metrics_summary.csv" +if [ -f "\$metrics_summary" ] && [ ! -z "\$par_metrics_summary" ]; then + echo "+ cp \$metrics_summary \$par_metrics_summary" + cp "\$metrics_summary" "\$par_metrics_summary" +fi + +molecule_info="\$par_input/molecule_info.h5" +if [ -f "\$molecule_info" ] && [ ! -z "\$par_molecule_info" ]; then + echo "+ cp \$molecule_info \$par_molecule_info" + cp "\$molecule_info" "\$par_molecule_info" +fi + +bam="\$par_input/possorted_genome_bam.bam" +if [ -f "\$bam" ] && [ ! -z "\$par_bam" ]; then + echo "cp \$bam \$par_bam" + cp "\$bam" "\$par_bam" +fi + +raw_h5="\$par_input/raw_feature_bc_matrix.h5" +if [ -f "\$raw_h5" ] && [ ! -z "\$par_raw_h5" ]; then + echo "+ cp \$raw_h5 \$par_raw_h5" + cp "\$raw_h5" "\$par_raw_h5" +fi + +bai="\$par_input/possorted_genome_bam.bam.bai" +if [ -f "\$bai" ] && [ ! -z "\$par_bai" ]; then + echo "+ cp \$bai \$par_bai" + cp "\$bai" "\$par_bai" +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_FILTERED_H5" ]; then + VIASH_PAR_FILTERED_H5=$(ViashDockerStripAutomount "$VIASH_PAR_FILTERED_H5") + fi + if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ]; then + VIASH_PAR_METRICS_SUMMARY=$(ViashDockerStripAutomount "$VIASH_PAR_METRICS_SUMMARY") + fi + if [ ! -z "$VIASH_PAR_MOLECULE_INFO" ]; then + VIASH_PAR_MOLECULE_INFO=$(ViashDockerStripAutomount "$VIASH_PAR_MOLECULE_INFO") + fi + if [ ! -z "$VIASH_PAR_BAM" ]; then + VIASH_PAR_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_BAM") + fi + if [ ! -z "$VIASH_PAR_BAI" ]; then + VIASH_PAR_BAI=$(ViashDockerStripAutomount "$VIASH_PAR_BAI") + fi + if [ ! -z "$VIASH_PAR_RAW_H5" ]; then + VIASH_PAR_RAW_H5=$(ViashDockerStripAutomount "$VIASH_PAR_RAW_H5") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_FILTERED_H5" ] && [ ! -e "$VIASH_PAR_FILTERED_H5" ]; then + ViashError "Output file '$VIASH_PAR_FILTERED_H5' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_METRICS_SUMMARY" ] && [ ! -e "$VIASH_PAR_METRICS_SUMMARY" ]; then + ViashError "Output file '$VIASH_PAR_METRICS_SUMMARY' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MOLECULE_INFO" ] && [ ! -e "$VIASH_PAR_MOLECULE_INFO" ]; then + ViashError "Output file '$VIASH_PAR_MOLECULE_INFO' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAM" ] && [ ! -e "$VIASH_PAR_BAM" ]; then + ViashError "Output file '$VIASH_PAR_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BAI" ] && [ ! -e "$VIASH_PAR_BAI" ]; then + ViashError "Output file '$VIASH_PAR_BAI' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_RAW_H5" ] && [ ! -e "$VIASH_PAR_RAW_H5" ]; then + ViashError "Output file '$VIASH_PAR_RAW_H5' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/cellranger_count_split/nextflow_labels.config b/target/executable/mapping/cellranger_count_split/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/cellranger_count_split/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/cellranger_multi/.config.vsh.yaml b/target/executable/mapping/cellranger_multi/.config.vsh.yaml new file mode 100644 index 00000000..0739ff45 --- /dev/null +++ b/target/executable/mapping/cellranger_multi/.config.vsh.yaml @@ -0,0 +1,1028 @@ +name: "cellranger_multi" +namespace: "mapping" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input files" + arguments: + - type: "file" + name: "--input" + description: "The FASTQ files to be analyzed. FASTQ files should conform to the\ + \ naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane\ + \ Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature type-specific input files" + description: "Helper functionality to allow feature type-specific input files, without\ + \ the need to specify\nlibrary_type or library_id. The library_id will be inferred\ + \ from the input paths.\n" + arguments: + - type: "file" + name: "--gex_input" + description: "The FASTQ files to be analyzed for Gene Expression. FASTQ files\ + \ should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abc_input" + description: "The FASTQ files to be analyzed for Antibody Capture. FASTQ files\ + \ should conform to \nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--cgc_input" + description: "The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--mux_input" + description: "The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_input" + description: "The FASTQ files to be analyzed for VDJ. FASTQ files should conform\ + \ to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_input" + description: "The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform\ + \ to the naming\nconventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_gd_input" + description: "The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should\ + \ conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_b_input" + description: "The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform\ + \ to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--agc_input" + description: "The FASTQ files to be analyzed for Antigen Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Library arguments" + arguments: + - type: "string" + name: "--library_id" + description: "The Illumina sample name to analyze. This must exactly match the\ + \ 'Sample Name'part\nof the FASTQ files specified in the `--input` argument.\n" + info: null + example: + - "mysample1" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_type" + description: "The underlying feature type of the library.\n" + info: null + example: + - "Gene Expression" + required: false + choices: + - "Gene Expression" + - "VDJ" + - "VDJ-T" + - "VDJ-B" + - "VDJ-T-GD" + - "Antibody Capture" + - "CRISPR Guide Capture" + - "Multiplexing Capture" + - "Antigen Capture" + - "Custom" + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_subsample" + description: "The rate at which reads from the provided FASTQ files are sampled.\n\ + Must be strictly greater than 0 and less than or equal to 1.\n" + info: null + example: + - "0.5" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_lanes" + description: "Lanes associated with this sample. Defaults to using all lanes." + info: null + example: + - "1-4" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_chemistry" + description: "Only applicable to FRP. Library-specific assay configuration. By\ + \ default,\nthe assay configuration is detected automatically. Typically, users\ + \ will\nnot need to specify a chemistry.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Sample parameters" + arguments: + - type: "string" + name: "--sample_ids" + alternatives: + - "--cell_multiplex_sample_id" + description: "A name to identify a multiplexed sample. Must be alphanumeric with\ + \ hyphens and/or underscores,\nand less than 64 characters. Required for Cell\ + \ Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sample_description" + alternatives: + - "--cell_multiplex_description" + description: "A description for the sample." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature Barcode library specific arguments" + arguments: + - type: "file" + name: "--feature_reference" + description: "Path to the Feature reference CSV file, declaring Feature Barcode\ + \ constructs and associated barcodes.\nRequired only for Antibody Capture or\ + \ CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref\ + \ for more information.\"\n" + info: null + example: + - "feature_reference.csv" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_crispr_umi" + description: "Set the minimum number of CRISPR guide RNA UMIs required for protospacer\ + \ detection.\nIf a lower or higher sensitivity is desired for detection, this\ + \ value can be customized\naccording to specific experimental needs. Applicable\ + \ only to datasets that include a\nCRISPR Guide Capture library.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Gene expression arguments" + description: "Arguments relevant to the analysis of gene expression data." + arguments: + - type: "file" + name: "--gex_reference" + description: "Genome refence index built by Cell Ranger mkref." + info: null + example: + - "reference_genome.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--tenx_cloud_token_path" + description: "The 10x Cloud Analysis user token used to enable cell annotation." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_annotation_model" + description: "\"Cell annotation model to use. If auto, uses the default model\ + \ for the species.\nIf not given, does not run cell annotation.\"\n" + info: null + required: false + choices: + - "auto" + - "human_pca_v1_beta" + - "mouse_pca_v1_beta" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_include_introns" + description: "Whether or not to include intronic reads in counts.\nThis option\ + \ does not apply to Fixed RNA Profiling analysis.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gex_chemistry" + description: "Assay configuration. Either specify a single value which will be\ + \ applied to all libraries,\nor a number of values that is equal to the number\ + \ of libararies. The latter is only applicable\nto only applicable to Fixed\ + \ RNA Profiling.\n - auto: Chemistry autodetection (default)\n - threeprime:\ + \ Single Cell 3'\n - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single\ + \ Cell 3' v1, v2, v3, or v4\n - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT\n\ + \ - SC-FB: Single Cell Antibody-only 3' v2 or 5'\n - fiveprime: Single Cell\ + \ 5'\n - SC5P-PE: Paired-end Single Cell 5'\n - SC5P-PE-v3: Paired-end Single\ + \ Cell 5' v3\n - SC5P-R2: R2-only Single Cell 5'\n - SC5P-R2-v3: R2-only Single\ + \ Cell 5' v3\n - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n - SC5PHT\ + \ : Single Cell 5' v2 HT\n - SFRP: Fixed RNA Profiling (Singleplex)\n - MFRP:\ + \ Fixed RNA Profiling (Multiplex, Probe Barcode on R2)\n - MFRP-R1: Fixed RNA\ + \ Profiling (Multiplex, Probe Barcode on R1)\n - MFRP-RNA: Fixed RNA Profiling\ + \ (Multiplex, RNA, Probe Barcode on R2)\n - MFRP-Ab: Fixed RNA Profiling (Multiplex,\ + \ Antibody, Probe Barcode at R2:69)\n - MFRP-Ab-R2pos50: Fixed RNA Profiling\ + \ (Multiplex, Antibody, Probe Barcode at R2:50)\n - MFRP-RNA-R1: Fixed RNA\ + \ Profiling (Multiplex, RNA, Probe Barcode on R1)\n - MFRP-Ab-R1: Fixed RNA\ + \ Profiling (Multiplex, Antibody, Probe Barcode on R1)\n - ARC-v1 for analyzing\ + \ the Gene Expression portion of Multiome data. If Cell Ranger auto-detects\ + \ ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv3-polyA" + - "SC3Pv4" + - "SC3Pv4-polyA" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC3Pv3HT-polyA" + - "SC5P-PE" + - "SC5P-PE-v3" + - "SC5P-R2" + - "SC-FB" + - "SC5P-R2-v3" + - "SCP5-PE-v3" + - "SC5PHT" + - "MFRP" + - "MFRP-R1" + - "MFRP-RNA" + - "MFRP-Ab" + - "SFRP" + - "MFRP-Ab-R2pos50" + - "MFRP-RNA-R1" + - "MFRP-Ab-R1" + - "ARC-v1" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "VDJ related parameters" + arguments: + - type: "file" + name: "--vdj_reference" + description: "VDJ refence index built by Cell Ranger mkref." + info: null + example: + - "reference_vdj.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_inner_enrichment_primers" + description: "V(D)J Immune Profiling libraries: if inner enrichment primers other\ + \ than those provided \nin the 10x Genomics kits are used, they need to be specified\ + \ here as a\ntext file with one primer per line.\n" + info: null + example: + - "enrichment_primers.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases, where N is the user-supplied value.\nNote that the length\ + \ includes the Barcode and UMI sequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases, where N is a user-supplied value. \nTrimming occurs\ + \ before sequencing metrics are computed and therefore, limiting the length\ + \ of Read 2 may affect Q30 scores\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "3' Cell multiplexing parameters (CellPlex Multiplexing)" + arguments: + - type: "string" + name: "--cell_multiplex_oligo_ids" + alternatives: + - "--cmo_ids" + description: "The Cell Multiplexing oligo IDs used to multiplex this sample. If\ + \ multiple CMOs were used for a sample,\nseparate IDs with a pipe (e.g., CMO301|CMO302).\ + \ Required for Cell Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--min_assignment_confidence" + description: "The minimum estimated likelihood to call a sample as tagged with\ + \ a Cell Multiplexing Oligo (CMO) instead of \"Unassigned\".\nUsers may wish\ + \ to tolerate a higher rate of mis-assignment in order to obtain more singlets\ + \ to include in their analysis,\nor a lower rate of mis-assignment at the cost\ + \ of obtaining fewer singlets.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cmo_set" + description: "Path to a custom CMO set CSV file, declaring CMO constructs and\ + \ associated barcodes. If the default CMO reference IDs that are built into\n\ + the Cell Ranger software are required, this option does not need to be used.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode_sample_assignment" + description: "Path to a barcode-sample assignment CSV file that specifies the\ + \ barcodes that belong to each sample.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Hashtag multiplexing parameters" + arguments: + - type: "string" + name: "--hashtag_ids" + description: "The hashtag IDs used to multiplex this sample. If multiple antibody\ + \ hashtags were used for the same sample,\nyou can separate IDs with a pipe.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "On-chip multiplexing parameters" + arguments: + - type: "string" + name: "--ocm_barcode_ids" + description: "The OCM barcode IDs used to multiplex this sample. Must be one of\ + \ OB1, OB2, OB3, OB4.\nIf multiple OCM Barcodes were used for the same sample,\ + \ you can separate IDs\nwith a pipe (e.g., OB1|OB2).\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Flex multiplexing paramaters" + arguments: + - type: "file" + name: "--probe_set" + description: "A probe set reference CSV file. It specifies the sequences used\ + \ as a reference for probe alignment and the gene ID associated with each probe.\n\ + It must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region\ + \ and an optional 5th column (probe file format 1.0.1).\n- gene_id: The Ensembl\ + \ gene identifier targeted by the probe.\n- probe_seq: The nucleotide sequence\ + \ of the probe, which is complementary to the transcript sequence.\n- probe_id:\ + \ The probe identifier, whose format is described in Probe identifiers.\n- included:\ + \ A TRUE or FALSE flag specifying whether the probe is included in the filtered\ + \ counts matrix output or excluded by the probe filter. \n See filter-probes\ + \ option of cellranger multi. All probes of a gene must be marked TRUE in the\ + \ included column for that gene to be included.\n- region: Present only in v1.0.1\ + \ probe set reference CSV. The gene boundary targeted by the probe. Accepted\ + \ values are spliced or unspliced.\n\nThe file also contains a number of required\ + \ metadata fields in the header in the format #key=value:\n- panel_name: The\ + \ name of the probe set.\n- panel_type: Always predesigned for predesigned probe\ + \ sets.\n- reference_genome: The reference genome build used for probe design.\n\ + - reference_version: The version of the Cell Ranger reference transcriptome\ + \ used for probe design.\n- probe_set_file_format: The version of the probe\ + \ set file format specification that this file conforms to.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--filter_probes" + description: "If 'false', include all non-deprecated probes listed in the probe\ + \ set reference CSV file.\nIf 'true' or not set, probes that are predicted to\ + \ have off-target activity to homologous genes are excluded from analysis.\n\ + Not filtering will result in UMI counts from all non-deprecated probes,\nincluding\ + \ those with predicted off-target activity, to be used in the analysis.\nProbes\ + \ whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--probe_barcode_ids" + description: "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex\ + \ GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing\ + \ Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001)\n\ + when an Antibody Capture library is present. The barcode pair order is BC+AB\ + \ and they\nare separated with a \"+\" (no spaces). Alternatively, you can specify\ + \ the Probe Barcode ID alone and\nCell Ranger's barcode pairing auto-detection\ + \ algorithm will automatically match to the corresponding Antibody\nMultiplexing\ + \ Barcode.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--emptydrops_minimum_umis" + description: "For singleplex Flex experiments, use this option to adjust the UMI\ + \ cutoff during the second step of cell calling.\nCell Ranger will still perform\ + \ the full cell calling process but will only evaluate barcodes with UMIs above\n\ + the threshold you specify.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Antigen Capture (BEAM) libary arguments" + description: "These arguments are recommended if an Antigen Capture (BEAM) library\ + \ is present. \nIt is needed to calculate the antigen specificity score.\n" + arguments: + - type: "string" + name: "--control_id" + description: "A user-defined ID for any negative controls used in the T/BCR Antigen\ + \ Capture assay. Must match id specified in the feature reference CSV.\nMay\ + \ only include ASCII characters and must not use whitespace, slash, quote, or\ + \ comma characters. \nEach ID must be unique and must not collide with a gene\ + \ identifier from the transcriptome.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--mhc_allele" + description: "The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele\ + \ name specified in the Feature Reference CSV.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "General arguments" + description: "These arguments are applicable to all library types.\n" + arguments: + - type: "boolean" + name: "--check_library_compatibility" + description: "Optional. This option allows users to disable the check that evaluates\ + \ 10x Barcode overlap between\nibraries when multiple libraries are specified\ + \ (e.g., Gene Expression + Antibody Capture). Setting\nthis option to false\ + \ will disable the check across all library combinations. We recommend running\n\ + this check (default), however if the pipeline errors out, users can bypass the\ + \ check to generate\noutputs for troubleshooting.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Executor arguments" + arguments: + - type: "boolean_true" + name: "--dryrun" + description: "If true, the output directory will only contain the CWL input files,\ + \ but the pipeline itself will not be executed." + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger multi." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "10x_5k_anticmv/raw/" + dest: "10x_5k_anticmv/raw/" +- type: "file" + path: "10x_5k_lung_crispr/raw/" + dest: "10x_5k_lung_crispr/raw/" +- type: "file" + path: "10x_5k_beam/raw/" + dest: "10x_5k_beam/raw/" +- type: "file" + path: "10x_5k_fixed/raw" + dest: "10x_5k_fixed/raw" +- type: "file" + path: "raw" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "veryhighmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + - type: "python" + user: false + packages: + - "pandas" + - "pyyaml" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_multi/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/cellranger_multi" + executable: "target/executable/mapping/cellranger_multi/cellranger_multi" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/cellranger_multi/cellranger_multi b/target/executable/mapping/cellranger_multi/cellranger_multi new file mode 100755 index 00000000..60b409f3 --- /dev/null +++ b/target/executable/mapping/cellranger_multi/cellranger_multi @@ -0,0 +1,3489 @@ +#!/usr/bin/env bash + +# cellranger_multi main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) +# * Dries De Maeyer (author) +# * Weiwei Schultz (contributor) +# * Dorien Roosen (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_multi" +VIASH_META_FUNCTIONALITY_NAME="cellranger_multi" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger:9.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pandas" "pyyaml" + +LABEL org.opencontainers.image.authors="Dries Schaumont, Angela Oliveira Pisco, Robrecht Cannoodt, Dries De Maeyer, Weiwei Schultz, Dorien Roosen" +LABEL org.opencontainers.image.description="Companion container for running component mapping cellranger_multi" +LABEL org.opencontainers.image.created="2025-08-22T07:23:23Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_multi main" + echo "" + echo "Align fastq files using Cell Ranger multi." + echo "" + echo "Input files:" + echo " --input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed. FASTQ files should conform to the naming" + echo " conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo "Feature type-specific input files:" + echo " Helper functionality to allow feature type-specific input files, without the" + echo " need to specify" + echo " library_type or library_id. The library_id will be inferred from the input" + echo " paths." + echo "" + echo " --gex_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for Gene Expression. FASTQ files should" + echo " conform to the" + echo " naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --abc_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for Antibody Capture. FASTQ files should" + echo " conform to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --cgc_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files" + echo " should conform to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --mux_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files" + echo " should conform to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --vdj_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for VDJ. FASTQ files should conform to" + echo " the" + echo " naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --vdj_t_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform to" + echo " the naming" + echo " conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --vdj_t_gd_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should conform" + echo " to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --vdj_b_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo " --agc_input" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed for Antigen Capture. FASTQ files should" + echo " conform to" + echo " the naming conventions of bcl2fastq and mkfastq:" + echo " \`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read" + echo " Type]_001.fastq.gz\`" + echo "" + echo "Library arguments:" + echo " --library_id" + echo " type: string, multiple values allowed" + echo " example: mysample1" + echo " The Illumina sample name to analyze. This must exactly match the 'Sample" + echo " Name'part" + echo " of the FASTQ files specified in the \`--input\` argument." + echo "" + echo " --library_type" + echo " type: string, multiple values allowed" + echo " example: Gene Expression" + echo " choices: [ Gene Expression, VDJ, VDJ-T, VDJ-B, VDJ-T-GD, Antibody" + echo "Capture, CRISPR Guide Capture, Multiplexing Capture, Antigen Capture, Custom ]" + echo " The underlying feature type of the library." + echo "" + echo " --library_subsample" + echo " type: string, multiple values allowed" + echo " example: 0.5" + echo " The rate at which reads from the provided FASTQ files are sampled." + echo " Must be strictly greater than 0 and less than or equal to 1." + echo "" + echo " --library_lanes" + echo " type: string, multiple values allowed" + echo " example: 1-4" + echo " Lanes associated with this sample. Defaults to using all lanes." + echo "" + echo " --library_chemistry" + echo " type: string" + echo " Only applicable to FRP. Library-specific assay configuration. By" + echo " default," + echo " the assay configuration is detected automatically. Typically, users will" + echo " not need to specify a chemistry." + echo "" + echo "Sample parameters:" + echo " --cell_multiplex_sample_id, --sample_ids" + echo " type: string, multiple values allowed" + echo " A name to identify a multiplexed sample. Must be alphanumeric with" + echo " hyphens and/or underscores," + echo " and less than 64 characters. Required for Cell Multiplexing libraries." + echo "" + echo " --cell_multiplex_description, --sample_description" + echo " type: string, multiple values allowed" + echo " A description for the sample." + echo "" + echo " --sample_expect_cells" + echo " type: integer, multiple values allowed" + echo " example: 3000" + echo " Expected number of recovered cells, used as input to cell calling" + echo " algorithm." + echo "" + echo " --sample_force_cells" + echo " type: integer, multiple values allowed" + echo " example: 3000" + echo " Force pipeline to use this number of cells, bypassing cell detection." + echo "" + echo "Feature Barcode library specific arguments:" + echo " --feature_reference" + echo " type: file, file must exist" + echo " example: feature_reference.csv" + echo " Path to the Feature reference CSV file, declaring Feature Barcode" + echo " constructs and associated barcodes." + echo " Required only for Antibody Capture or CRISPR Guide Capture libraries." + echo " See" + echo " " + echo "https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref" + echo " for more information.\"" + echo "" + echo " --feature_r1_length" + echo " type: integer" + echo " Limit the length of the input Read 1 sequence of V(D)J libraries to the" + echo " first N bases," + echo " where N is the user-supplied value. Note that the length includes the" + echo " Barcode and UMI" + echo " sequences so do not set this below 26." + echo "" + echo " --feature_r2_length" + echo " type: integer" + echo " Limit the length of the input Read 2 sequence of V(D)J libraries to the" + echo " first N bases," + echo " where N is a user-supplied value. Trimming occurs before sequencing" + echo " metrics are computed" + echo " and therefore, limiting the length of Read 2 may affect Q30 scores." + echo "" + echo " --min_crispr_umi" + echo " type: integer" + echo " min: 1" + echo " Set the minimum number of CRISPR guide RNA UMIs required for protospacer" + echo " detection." + echo " If a lower or higher sensitivity is desired for detection, this value" + echo " can be customized" + echo " according to specific experimental needs. Applicable only to datasets" + echo " that include a" + echo " CRISPR Guide Capture library." + echo "" + echo "Gene expression arguments:" + echo " Arguments relevant to the analysis of gene expression data." + echo "" + echo " --gex_reference" + echo " type: file, required parameter, file must exist" + echo " example: reference_genome.tar.gz" + echo " Genome refence index built by Cell Ranger mkref." + echo "" + echo " --gex_secondary_analysis" + echo " type: boolean" + echo " default: false" + echo " Whether or not to run the secondary analysis e.g. clustering." + echo "" + echo " --gex_generate_bam" + echo " type: boolean" + echo " default: false" + echo " Whether to generate a BAM file." + echo "" + echo " --tenx_cloud_token_path" + echo " type: file, file must exist" + echo " The 10x Cloud Analysis user token used to enable cell annotation." + echo "" + echo " --cell_annotation_model" + echo " type: string" + echo " choices: [ auto, human_pca_v1_beta, mouse_pca_v1_beta ]" + echo " \"Cell annotation model to use. If auto, uses the default model for the" + echo " species." + echo " If not given, does not run cell annotation.\"" + echo "" + echo " --gex_expect_cells" + echo " type: integer" + echo " example: 3000" + echo " Expected number of recovered cells, used as input to cell calling" + echo " algorithm." + echo "" + echo " --gex_force_cells" + echo " type: integer" + echo " example: 3000" + echo " Force pipeline to use this number of cells, bypassing cell detection." + echo "" + echo " --gex_include_introns" + echo " type: boolean" + echo " default: true" + echo " Whether or not to include intronic reads in counts." + echo " This option does not apply to Fixed RNA Profiling analysis." + echo "" + echo " --gex_r1_length" + echo " type: integer" + echo " Limit the length of the input Read 1 sequence of V(D)J libraries to the" + echo " first N bases," + echo " where N is the user-supplied value. Note that the length includes the" + echo " Barcode and UMI" + echo " sequences so do not set this below 26." + echo "" + echo " --gex_r2_length" + echo " type: integer" + echo " Limit the length of the input Read 2 sequence of V(D)J libraries to the" + echo " first N bases," + echo " where N is a user-supplied value. Trimming occurs before sequencing" + echo " metrics are computed" + echo " and therefore, limiting the length of Read 2 may affect Q30 scores." + echo "" + echo " --gex_chemistry" + echo " type: string" + echo " default: auto" + echo " choices: [ auto, threeprime, fiveprime, SC3Pv1, SC3Pv2, SC3Pv3," + echo "SC3Pv3-polyA, SC3Pv4, SC3Pv4-polyA, SC3Pv3LT, SC3Pv3HT, SC3Pv3HT-polyA, SC5P-PE," + echo "SC5P-PE-v3, SC5P-R2, SC-FB, SC5P-R2-v3, SCP5-PE-v3, SC5PHT, MFRP, MFRP-R1," + echo "MFRP-RNA, MFRP-Ab, SFRP, MFRP-Ab-R2pos50, MFRP-RNA-R1, MFRP-Ab-R1, ARC-v1 ]" + echo " Assay configuration. Either specify a single value which will be applied" + echo " to all libraries," + echo " or a number of values that is equal to the number of libararies. The" + echo " latter is only applicable" + echo " to only applicable to Fixed RNA Profiling." + echo " - auto: Chemistry autodetection (default)" + echo " - threeprime: Single Cell 3'" + echo " - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single Cell 3' v1," + echo " v2, v3, or v4" + echo " - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT" + echo " - SC-FB: Single Cell Antibody-only 3' v2 or 5'" + echo " - fiveprime: Single Cell 5'" + echo " - SC5P-PE: Paired-end Single Cell 5'" + echo " - SC5P-PE-v3: Paired-end Single Cell 5' v3" + echo " - SC5P-R2: R2-only Single Cell 5'" + echo " - SC5P-R2-v3: R2-only Single Cell 5' v3" + echo " - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)" + echo " - SC5PHT : Single Cell 5' v2 HT" + echo " - SFRP: Fixed RNA Profiling (Singleplex)" + echo " - MFRP: Fixed RNA Profiling (Multiplex, Probe Barcode on R2)" + echo " - MFRP-R1: Fixed RNA Profiling (Multiplex, Probe Barcode on R1)" + echo " - MFRP-RNA: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R2)" + echo " - MFRP-Ab: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at" + echo " R2:69)" + echo " - MFRP-Ab-R2pos50: Fixed RNA Profiling (Multiplex, Antibody, Probe" + echo " Barcode at R2:50)" + echo " - MFRP-RNA-R1: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on" + echo " R1)" + echo " - MFRP-Ab-R1: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode" + echo " on R1)" + echo " - ARC-v1 for analyzing the Gene Expression portion of Multiome data." + echo " If Cell Ranger auto-detects ARC-v1 chemistry, an error is triggered." + echo " See" + echo " " + echo "https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-" + echo " for more information." + echo "" + echo "VDJ related parameters:" + echo " --vdj_reference" + echo " type: file, file must exist" + echo " example: reference_vdj.tar.gz" + echo " VDJ refence index built by Cell Ranger mkref." + echo "" + echo " --vdj_inner_enrichment_primers" + echo " type: file, file must exist" + echo " example: enrichment_primers.txt" + echo " V(D)J Immune Profiling libraries: if inner enrichment primers other than" + echo " those provided" + echo " in the 10x Genomics kits are used, they need to be specified here as a" + echo " text file with one primer per line." + echo "" + echo " --vdj_r1_length" + echo " type: integer" + echo " Limit the length of the input Read 1 sequence of V(D)J libraries to the" + echo " first N bases, where N is the user-supplied value." + echo " Note that the length includes the Barcode and UMI sequences so do not" + echo " set this below 26." + echo "" + echo " --vdj_r2_length" + echo " type: integer" + echo " Limit the length of the input Read 2 sequence of V(D)J libraries to the" + echo " first N bases, where N is a user-supplied value." + echo " Trimming occurs before sequencing metrics are computed and therefore," + echo " limiting the length of Read 2 may affect Q30 scores" + echo "" + echo "3' Cell multiplexing parameters (CellPlex Multiplexing):" + echo " --cmo_ids, --cell_multiplex_oligo_ids" + echo " type: string, multiple values allowed" + echo " The Cell Multiplexing oligo IDs used to multiplex this sample. If" + echo " multiple CMOs were used for a sample," + echo " separate IDs with a pipe (e.g., CMO301|CMO302). Required for Cell" + echo " Multiplexing libraries." + echo "" + echo " --min_assignment_confidence" + echo " type: double" + echo " The minimum estimated likelihood to call a sample as tagged with a Cell" + echo " Multiplexing Oligo (CMO) instead of \"Unassigned\"." + echo " Users may wish to tolerate a higher rate of mis-assignment in order to" + echo " obtain more singlets to include in their analysis," + echo " or a lower rate of mis-assignment at the cost of obtaining fewer" + echo " singlets." + echo "" + echo " --cmo_set" + echo " type: file, file must exist" + echo " Path to a custom CMO set CSV file, declaring CMO constructs and" + echo " associated barcodes. If the default CMO reference IDs that are built" + echo " into" + echo " the Cell Ranger software are required, this option does not need to be" + echo " used." + echo "" + echo " --barcode_sample_assignment" + echo " type: file, file must exist" + echo " Path to a barcode-sample assignment CSV file that specifies the barcodes" + echo " that belong to each sample." + echo "" + echo "Hashtag multiplexing parameters:" + echo " --hashtag_ids" + echo " type: string, multiple values allowed" + echo " The hashtag IDs used to multiplex this sample. If multiple antibody" + echo " hashtags were used for the same sample," + echo " you can separate IDs with a pipe." + echo "" + echo "On-chip multiplexing parameters:" + echo " --ocm_barcode_ids" + echo " type: string, multiple values allowed" + echo " The OCM barcode IDs used to multiplex this sample. Must be one of OB1," + echo " OB2, OB3, OB4." + echo " If multiple OCM Barcodes were used for the same sample, you can separate" + echo " IDs" + echo " with a pipe (e.g., OB1|OB2)." + echo "" + echo "Flex multiplexing paramaters:" + echo " --probe_set" + echo " type: file, file must exist" + echo " A probe set reference CSV file. It specifies the sequences used as a" + echo " reference for probe alignment and the gene ID associated with each" + echo " probe." + echo " It must include 4 columns (probe file format 1.0.0):" + echo " gene_id,probe_seq,probe_id,included,region and an optional 5th column" + echo " (probe file format 1.0.1)." + echo " - gene_id: The Ensembl gene identifier targeted by the probe." + echo " - probe_seq: The nucleotide sequence of the probe, which is" + echo " complementary to the transcript sequence." + echo " - probe_id: The probe identifier, whose format is described in Probe" + echo " identifiers." + echo " - included: A TRUE or FALSE flag specifying whether the probe is" + echo " included in the filtered counts matrix output or excluded by the probe" + echo " filter." + echo " See filter-probes option of cellranger multi. All probes of" + echo " a gene must be marked TRUE in the included column for that gene to be" + echo " included." + echo " - region: Present only in v1.0.1 probe set reference CSV. The gene" + echo " boundary targeted by the probe. Accepted values are spliced or" + echo " unspliced." + echo " The file also contains a number of required metadata fields in the" + echo " header in the format #key=value:" + echo " - panel_name: The name of the probe set." + echo " - panel_type: Always predesigned for predesigned probe sets." + echo " - reference_genome: The reference genome build used for probe design." + echo " - reference_version: The version of the Cell Ranger reference" + echo " transcriptome used for probe design." + echo " - probe_set_file_format: The version of the probe set file format" + echo " specification that this file conforms to." + echo "" + echo " --filter_probes" + echo " type: boolean" + echo " If 'false', include all non-deprecated probes listed in the probe set" + echo " reference CSV file." + echo " If 'true' or not set, probes that are predicted to have off-target" + echo " activity to homologous genes are excluded from analysis." + echo " Not filtering will result in UMI counts from all non-deprecated probes," + echo " including those with predicted off-target activity, to be used in the" + echo " analysis." + echo " Probes whose ID is prefixed with DEPRECATED are always excluded from the" + echo " analysis." + echo "" + echo " --probe_barcode_ids" + echo " type: string, multiple values allowed" + echo " The Fixed RNA Probe Barcode ID used for this sample, and for multiplex" + echo " GEX + Antibody Capture libraries," + echo " the corresponding Antibody Multiplexing Barcode IDs. 10x recommends" + echo " specifying both barcodes (e.g., BC001+AB001)" + echo " when an Antibody Capture library is present. The barcode pair order is" + echo " BC+AB and they" + echo " are separated with a \"+\" (no spaces). Alternatively, you can specify the" + echo " Probe Barcode ID alone and" + echo " Cell Ranger's barcode pairing auto-detection algorithm will" + echo " automatically match to the corresponding Antibody" + echo " Multiplexing Barcode." + echo "" + echo " --emptydrops_minimum_umis" + echo " type: integer" + echo " min: 1" + echo " For singleplex Flex experiments, use this option to adjust the UMI" + echo " cutoff during the second step of cell calling." + echo " Cell Ranger will still perform the full cell calling process but will" + echo " only evaluate barcodes with UMIs above" + echo " the threshold you specify." + echo "" + echo "Antigen Capture (BEAM) libary arguments:" + echo " These arguments are recommended if an Antigen Capture (BEAM) library is" + echo " present." + echo " It is needed to calculate the antigen specificity score." + echo "" + echo " --control_id" + echo " type: string, multiple values allowed" + echo " A user-defined ID for any negative controls used in the T/BCR Antigen" + echo " Capture assay. Must match id specified in the feature reference CSV." + echo " May only include ASCII characters and must not use whitespace, slash," + echo " quote, or comma characters." + echo " Each ID must be unique and must not collide with a gene identifier from" + echo " the transcriptome." + echo "" + echo " --mhc_allele" + echo " type: string, multiple values allowed" + echo " The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele" + echo " name specified in the Feature Reference CSV." + echo "" + echo "General arguments:" + echo " These arguments are applicable to all library types." + echo "" + echo " --check_library_compatibility" + echo " type: boolean" + echo " default: true" + echo " Optional. This option allows users to disable the check that evaluates" + echo " 10x Barcode overlap between" + echo " ibraries when multiple libraries are specified (e.g., Gene Expression +" + echo " Antibody Capture). Setting" + echo " this option to false will disable the check across all library" + echo " combinations. We recommend running" + echo " this check (default), however if the pipeline errors out, users can" + echo " bypass the check to generate" + echo " outputs for troubleshooting." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/output" + echo " The folder to store the alignment results." + echo "" + echo "Executor arguments:" + echo " --dryrun" + echo " type: boolean_true" + echo " If true, the output directory will only contain the CWL input files, but" + echo " the pipeline itself will not be executed." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_multi main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gex_input) + if [ -z "$VIASH_PAR_GEX_INPUT" ]; then + VIASH_PAR_GEX_INPUT="$2" + else + VIASH_PAR_GEX_INPUT="$VIASH_PAR_GEX_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_input=*) + if [ -z "$VIASH_PAR_GEX_INPUT" ]; then + VIASH_PAR_GEX_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GEX_INPUT="$VIASH_PAR_GEX_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --abc_input) + if [ -z "$VIASH_PAR_ABC_INPUT" ]; then + VIASH_PAR_ABC_INPUT="$2" + else + VIASH_PAR_ABC_INPUT="$VIASH_PAR_ABC_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --abc_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --abc_input=*) + if [ -z "$VIASH_PAR_ABC_INPUT" ]; then + VIASH_PAR_ABC_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ABC_INPUT="$VIASH_PAR_ABC_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --cgc_input) + if [ -z "$VIASH_PAR_CGC_INPUT" ]; then + VIASH_PAR_CGC_INPUT="$2" + else + VIASH_PAR_CGC_INPUT="$VIASH_PAR_CGC_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cgc_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cgc_input=*) + if [ -z "$VIASH_PAR_CGC_INPUT" ]; then + VIASH_PAR_CGC_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CGC_INPUT="$VIASH_PAR_CGC_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --mux_input) + if [ -z "$VIASH_PAR_MUX_INPUT" ]; then + VIASH_PAR_MUX_INPUT="$2" + else + VIASH_PAR_MUX_INPUT="$VIASH_PAR_MUX_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mux_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mux_input=*) + if [ -z "$VIASH_PAR_MUX_INPUT" ]; then + VIASH_PAR_MUX_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MUX_INPUT="$VIASH_PAR_MUX_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --vdj_input) + if [ -z "$VIASH_PAR_VDJ_INPUT" ]; then + VIASH_PAR_VDJ_INPUT="$2" + else + VIASH_PAR_VDJ_INPUT="$VIASH_PAR_VDJ_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_input=*) + if [ -z "$VIASH_PAR_VDJ_INPUT" ]; then + VIASH_PAR_VDJ_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VDJ_INPUT="$VIASH_PAR_VDJ_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --vdj_t_input) + if [ -z "$VIASH_PAR_VDJ_T_INPUT" ]; then + VIASH_PAR_VDJ_T_INPUT="$2" + else + VIASH_PAR_VDJ_T_INPUT="$VIASH_PAR_VDJ_T_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_t_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_t_input=*) + if [ -z "$VIASH_PAR_VDJ_T_INPUT" ]; then + VIASH_PAR_VDJ_T_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VDJ_T_INPUT="$VIASH_PAR_VDJ_T_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --vdj_t_gd_input) + if [ -z "$VIASH_PAR_VDJ_T_GD_INPUT" ]; then + VIASH_PAR_VDJ_T_GD_INPUT="$2" + else + VIASH_PAR_VDJ_T_GD_INPUT="$VIASH_PAR_VDJ_T_GD_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_t_gd_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_t_gd_input=*) + if [ -z "$VIASH_PAR_VDJ_T_GD_INPUT" ]; then + VIASH_PAR_VDJ_T_GD_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VDJ_T_GD_INPUT="$VIASH_PAR_VDJ_T_GD_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --vdj_b_input) + if [ -z "$VIASH_PAR_VDJ_B_INPUT" ]; then + VIASH_PAR_VDJ_B_INPUT="$2" + else + VIASH_PAR_VDJ_B_INPUT="$VIASH_PAR_VDJ_B_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_b_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_b_input=*) + if [ -z "$VIASH_PAR_VDJ_B_INPUT" ]; then + VIASH_PAR_VDJ_B_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VDJ_B_INPUT="$VIASH_PAR_VDJ_B_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --agc_input) + if [ -z "$VIASH_PAR_AGC_INPUT" ]; then + VIASH_PAR_AGC_INPUT="$2" + else + VIASH_PAR_AGC_INPUT="$VIASH_PAR_AGC_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --agc_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --agc_input=*) + if [ -z "$VIASH_PAR_AGC_INPUT" ]; then + VIASH_PAR_AGC_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_AGC_INPUT="$VIASH_PAR_AGC_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_id) + if [ -z "$VIASH_PAR_LIBRARY_ID" ]; then + VIASH_PAR_LIBRARY_ID="$2" + else + VIASH_PAR_LIBRARY_ID="$VIASH_PAR_LIBRARY_ID;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_id=*) + if [ -z "$VIASH_PAR_LIBRARY_ID" ]; then + VIASH_PAR_LIBRARY_ID=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIBRARY_ID="$VIASH_PAR_LIBRARY_ID;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_type) + if [ -z "$VIASH_PAR_LIBRARY_TYPE" ]; then + VIASH_PAR_LIBRARY_TYPE="$2" + else + VIASH_PAR_LIBRARY_TYPE="$VIASH_PAR_LIBRARY_TYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_type. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_type=*) + if [ -z "$VIASH_PAR_LIBRARY_TYPE" ]; then + VIASH_PAR_LIBRARY_TYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIBRARY_TYPE="$VIASH_PAR_LIBRARY_TYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_subsample) + if [ -z "$VIASH_PAR_LIBRARY_SUBSAMPLE" ]; then + VIASH_PAR_LIBRARY_SUBSAMPLE="$2" + else + VIASH_PAR_LIBRARY_SUBSAMPLE="$VIASH_PAR_LIBRARY_SUBSAMPLE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_subsample. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_subsample=*) + if [ -z "$VIASH_PAR_LIBRARY_SUBSAMPLE" ]; then + VIASH_PAR_LIBRARY_SUBSAMPLE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIBRARY_SUBSAMPLE="$VIASH_PAR_LIBRARY_SUBSAMPLE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_lanes) + if [ -z "$VIASH_PAR_LIBRARY_LANES" ]; then + VIASH_PAR_LIBRARY_LANES="$2" + else + VIASH_PAR_LIBRARY_LANES="$VIASH_PAR_LIBRARY_LANES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_lanes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_lanes=*) + if [ -z "$VIASH_PAR_LIBRARY_LANES" ]; then + VIASH_PAR_LIBRARY_LANES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIBRARY_LANES="$VIASH_PAR_LIBRARY_LANES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --library_chemistry) + [ -n "$VIASH_PAR_LIBRARY_CHEMISTRY" ] && ViashError Bad arguments for option \'--library_chemistry\': \'$VIASH_PAR_LIBRARY_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIBRARY_CHEMISTRY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --library_chemistry. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --library_chemistry=*) + [ -n "$VIASH_PAR_LIBRARY_CHEMISTRY" ] && ViashError Bad arguments for option \'--library_chemistry=*\': \'$VIASH_PAR_LIBRARY_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIBRARY_CHEMISTRY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sample_ids) + if [ -z "$VIASH_PAR_SAMPLE_IDS" ]; then + VIASH_PAR_SAMPLE_IDS="$2" + else + VIASH_PAR_SAMPLE_IDS="$VIASH_PAR_SAMPLE_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_ids=*) + if [ -z "$VIASH_PAR_SAMPLE_IDS" ]; then + VIASH_PAR_SAMPLE_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SAMPLE_IDS="$VIASH_PAR_SAMPLE_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --cell_multiplex_sample_id) + if [ -z "$VIASH_PAR_SAMPLE_IDS" ]; then + VIASH_PAR_SAMPLE_IDS="$2" + else + VIASH_PAR_SAMPLE_IDS="$VIASH_PAR_SAMPLE_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_multiplex_sample_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_description) + if [ -z "$VIASH_PAR_SAMPLE_DESCRIPTION" ]; then + VIASH_PAR_SAMPLE_DESCRIPTION="$2" + else + VIASH_PAR_SAMPLE_DESCRIPTION="$VIASH_PAR_SAMPLE_DESCRIPTION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_description. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_description=*) + if [ -z "$VIASH_PAR_SAMPLE_DESCRIPTION" ]; then + VIASH_PAR_SAMPLE_DESCRIPTION=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SAMPLE_DESCRIPTION="$VIASH_PAR_SAMPLE_DESCRIPTION;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --cell_multiplex_description) + if [ -z "$VIASH_PAR_SAMPLE_DESCRIPTION" ]; then + VIASH_PAR_SAMPLE_DESCRIPTION="$2" + else + VIASH_PAR_SAMPLE_DESCRIPTION="$VIASH_PAR_SAMPLE_DESCRIPTION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_multiplex_description. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_expect_cells) + if [ -z "$VIASH_PAR_SAMPLE_EXPECT_CELLS" ]; then + VIASH_PAR_SAMPLE_EXPECT_CELLS="$2" + else + VIASH_PAR_SAMPLE_EXPECT_CELLS="$VIASH_PAR_SAMPLE_EXPECT_CELLS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_expect_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_expect_cells=*) + if [ -z "$VIASH_PAR_SAMPLE_EXPECT_CELLS" ]; then + VIASH_PAR_SAMPLE_EXPECT_CELLS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SAMPLE_EXPECT_CELLS="$VIASH_PAR_SAMPLE_EXPECT_CELLS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sample_force_cells) + if [ -z "$VIASH_PAR_SAMPLE_FORCE_CELLS" ]; then + VIASH_PAR_SAMPLE_FORCE_CELLS="$2" + else + VIASH_PAR_SAMPLE_FORCE_CELLS="$VIASH_PAR_SAMPLE_FORCE_CELLS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sample_force_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sample_force_cells=*) + if [ -z "$VIASH_PAR_SAMPLE_FORCE_CELLS" ]; then + VIASH_PAR_SAMPLE_FORCE_CELLS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SAMPLE_FORCE_CELLS="$VIASH_PAR_SAMPLE_FORCE_CELLS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --feature_reference) + [ -n "$VIASH_PAR_FEATURE_REFERENCE" ] && ViashError Bad arguments for option \'--feature_reference\': \'$VIASH_PAR_FEATURE_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_reference=*) + [ -n "$VIASH_PAR_FEATURE_REFERENCE" ] && ViashError Bad arguments for option \'--feature_reference=*\': \'$VIASH_PAR_FEATURE_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --feature_r1_length) + [ -n "$VIASH_PAR_FEATURE_R1_LENGTH" ] && ViashError Bad arguments for option \'--feature_r1_length\': \'$VIASH_PAR_FEATURE_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_R1_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_r1_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_r1_length=*) + [ -n "$VIASH_PAR_FEATURE_R1_LENGTH" ] && ViashError Bad arguments for option \'--feature_r1_length=*\': \'$VIASH_PAR_FEATURE_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_R1_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --feature_r2_length) + [ -n "$VIASH_PAR_FEATURE_R2_LENGTH" ] && ViashError Bad arguments for option \'--feature_r2_length\': \'$VIASH_PAR_FEATURE_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_R2_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --feature_r2_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --feature_r2_length=*) + [ -n "$VIASH_PAR_FEATURE_R2_LENGTH" ] && ViashError Bad arguments for option \'--feature_r2_length=*\': \'$VIASH_PAR_FEATURE_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FEATURE_R2_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_crispr_umi) + [ -n "$VIASH_PAR_MIN_CRISPR_UMI" ] && ViashError Bad arguments for option \'--min_crispr_umi\': \'$VIASH_PAR_MIN_CRISPR_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CRISPR_UMI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_crispr_umi. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_crispr_umi=*) + [ -n "$VIASH_PAR_MIN_CRISPR_UMI" ] && ViashError Bad arguments for option \'--min_crispr_umi=*\': \'$VIASH_PAR_MIN_CRISPR_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CRISPR_UMI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_reference) + [ -n "$VIASH_PAR_GEX_REFERENCE" ] && ViashError Bad arguments for option \'--gex_reference\': \'$VIASH_PAR_GEX_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_reference=*) + [ -n "$VIASH_PAR_GEX_REFERENCE" ] && ViashError Bad arguments for option \'--gex_reference=*\': \'$VIASH_PAR_GEX_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_secondary_analysis) + [ -n "$VIASH_PAR_GEX_SECONDARY_ANALYSIS" ] && ViashError Bad arguments for option \'--gex_secondary_analysis\': \'$VIASH_PAR_GEX_SECONDARY_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_SECONDARY_ANALYSIS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_secondary_analysis. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_secondary_analysis=*) + [ -n "$VIASH_PAR_GEX_SECONDARY_ANALYSIS" ] && ViashError Bad arguments for option \'--gex_secondary_analysis=*\': \'$VIASH_PAR_GEX_SECONDARY_ANALYSIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_SECONDARY_ANALYSIS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_generate_bam) + [ -n "$VIASH_PAR_GEX_GENERATE_BAM" ] && ViashError Bad arguments for option \'--gex_generate_bam\': \'$VIASH_PAR_GEX_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_GENERATE_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_generate_bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_generate_bam=*) + [ -n "$VIASH_PAR_GEX_GENERATE_BAM" ] && ViashError Bad arguments for option \'--gex_generate_bam=*\': \'$VIASH_PAR_GEX_GENERATE_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_GENERATE_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tenx_cloud_token_path) + [ -n "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && ViashError Bad arguments for option \'--tenx_cloud_token_path\': \'$VIASH_PAR_TENX_CLOUD_TOKEN_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TENX_CLOUD_TOKEN_PATH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tenx_cloud_token_path. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tenx_cloud_token_path=*) + [ -n "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && ViashError Bad arguments for option \'--tenx_cloud_token_path=*\': \'$VIASH_PAR_TENX_CLOUD_TOKEN_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_annotation_model) + [ -n "$VIASH_PAR_CELL_ANNOTATION_MODEL" ] && ViashError Bad arguments for option \'--cell_annotation_model\': \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_ANNOTATION_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_annotation_model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_annotation_model=*) + [ -n "$VIASH_PAR_CELL_ANNOTATION_MODEL" ] && ViashError Bad arguments for option \'--cell_annotation_model=*\': \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_ANNOTATION_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_expect_cells) + [ -n "$VIASH_PAR_GEX_EXPECT_CELLS" ] && ViashError Bad arguments for option \'--gex_expect_cells\': \'$VIASH_PAR_GEX_EXPECT_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_EXPECT_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_expect_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_expect_cells=*) + [ -n "$VIASH_PAR_GEX_EXPECT_CELLS" ] && ViashError Bad arguments for option \'--gex_expect_cells=*\': \'$VIASH_PAR_GEX_EXPECT_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_EXPECT_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_force_cells) + [ -n "$VIASH_PAR_GEX_FORCE_CELLS" ] && ViashError Bad arguments for option \'--gex_force_cells\': \'$VIASH_PAR_GEX_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_FORCE_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_force_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_force_cells=*) + [ -n "$VIASH_PAR_GEX_FORCE_CELLS" ] && ViashError Bad arguments for option \'--gex_force_cells=*\': \'$VIASH_PAR_GEX_FORCE_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_FORCE_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_include_introns) + [ -n "$VIASH_PAR_GEX_INCLUDE_INTRONS" ] && ViashError Bad arguments for option \'--gex_include_introns\': \'$VIASH_PAR_GEX_INCLUDE_INTRONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_INCLUDE_INTRONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_include_introns. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_include_introns=*) + [ -n "$VIASH_PAR_GEX_INCLUDE_INTRONS" ] && ViashError Bad arguments for option \'--gex_include_introns=*\': \'$VIASH_PAR_GEX_INCLUDE_INTRONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_INCLUDE_INTRONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_r1_length) + [ -n "$VIASH_PAR_GEX_R1_LENGTH" ] && ViashError Bad arguments for option \'--gex_r1_length\': \'$VIASH_PAR_GEX_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_R1_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_r1_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_r1_length=*) + [ -n "$VIASH_PAR_GEX_R1_LENGTH" ] && ViashError Bad arguments for option \'--gex_r1_length=*\': \'$VIASH_PAR_GEX_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_R1_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_r2_length) + [ -n "$VIASH_PAR_GEX_R2_LENGTH" ] && ViashError Bad arguments for option \'--gex_r2_length\': \'$VIASH_PAR_GEX_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_R2_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_r2_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_r2_length=*) + [ -n "$VIASH_PAR_GEX_R2_LENGTH" ] && ViashError Bad arguments for option \'--gex_r2_length=*\': \'$VIASH_PAR_GEX_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_R2_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gex_chemistry) + [ -n "$VIASH_PAR_GEX_CHEMISTRY" ] && ViashError Bad arguments for option \'--gex_chemistry\': \'$VIASH_PAR_GEX_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_CHEMISTRY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gex_chemistry. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gex_chemistry=*) + [ -n "$VIASH_PAR_GEX_CHEMISTRY" ] && ViashError Bad arguments for option \'--gex_chemistry=*\': \'$VIASH_PAR_GEX_CHEMISTRY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GEX_CHEMISTRY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_reference) + [ -n "$VIASH_PAR_VDJ_REFERENCE" ] && ViashError Bad arguments for option \'--vdj_reference\': \'$VIASH_PAR_VDJ_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_reference=*) + [ -n "$VIASH_PAR_VDJ_REFERENCE" ] && ViashError Bad arguments for option \'--vdj_reference=*\': \'$VIASH_PAR_VDJ_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_inner_enrichment_primers) + [ -n "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ] && ViashError Bad arguments for option \'--vdj_inner_enrichment_primers\': \'$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_inner_enrichment_primers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_inner_enrichment_primers=*) + [ -n "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ] && ViashError Bad arguments for option \'--vdj_inner_enrichment_primers=*\': \'$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_r1_length) + [ -n "$VIASH_PAR_VDJ_R1_LENGTH" ] && ViashError Bad arguments for option \'--vdj_r1_length\': \'$VIASH_PAR_VDJ_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_R1_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_r1_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_r1_length=*) + [ -n "$VIASH_PAR_VDJ_R1_LENGTH" ] && ViashError Bad arguments for option \'--vdj_r1_length=*\': \'$VIASH_PAR_VDJ_R1_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_R1_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vdj_r2_length) + [ -n "$VIASH_PAR_VDJ_R2_LENGTH" ] && ViashError Bad arguments for option \'--vdj_r2_length\': \'$VIASH_PAR_VDJ_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_R2_LENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vdj_r2_length. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vdj_r2_length=*) + [ -n "$VIASH_PAR_VDJ_R2_LENGTH" ] && ViashError Bad arguments for option \'--vdj_r2_length=*\': \'$VIASH_PAR_VDJ_R2_LENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VDJ_R2_LENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_multiplex_oligo_ids) + if [ -z "$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS" ]; then + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS="$2" + else + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS="$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_multiplex_oligo_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_multiplex_oligo_ids=*) + if [ -z "$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS" ]; then + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS="$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --cmo_ids) + if [ -z "$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS" ]; then + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS="$2" + else + VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS="$VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cmo_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_assignment_confidence) + [ -n "$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE" ] && ViashError Bad arguments for option \'--min_assignment_confidence\': \'$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_assignment_confidence. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_assignment_confidence=*) + [ -n "$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE" ] && ViashError Bad arguments for option \'--min_assignment_confidence=*\': \'$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cmo_set) + [ -n "$VIASH_PAR_CMO_SET" ] && ViashError Bad arguments for option \'--cmo_set\': \'$VIASH_PAR_CMO_SET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CMO_SET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cmo_set. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cmo_set=*) + [ -n "$VIASH_PAR_CMO_SET" ] && ViashError Bad arguments for option \'--cmo_set=*\': \'$VIASH_PAR_CMO_SET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CMO_SET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --barcode_sample_assignment) + [ -n "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ] && ViashError Bad arguments for option \'--barcode_sample_assignment\': \'$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcode_sample_assignment. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode_sample_assignment=*) + [ -n "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ] && ViashError Bad arguments for option \'--barcode_sample_assignment=*\': \'$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --hashtag_ids) + if [ -z "$VIASH_PAR_HASHTAG_IDS" ]; then + VIASH_PAR_HASHTAG_IDS="$2" + else + VIASH_PAR_HASHTAG_IDS="$VIASH_PAR_HASHTAG_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --hashtag_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --hashtag_ids=*) + if [ -z "$VIASH_PAR_HASHTAG_IDS" ]; then + VIASH_PAR_HASHTAG_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_HASHTAG_IDS="$VIASH_PAR_HASHTAG_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --ocm_barcode_ids) + if [ -z "$VIASH_PAR_OCM_BARCODE_IDS" ]; then + VIASH_PAR_OCM_BARCODE_IDS="$2" + else + VIASH_PAR_OCM_BARCODE_IDS="$VIASH_PAR_OCM_BARCODE_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ocm_barcode_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ocm_barcode_ids=*) + if [ -z "$VIASH_PAR_OCM_BARCODE_IDS" ]; then + VIASH_PAR_OCM_BARCODE_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OCM_BARCODE_IDS="$VIASH_PAR_OCM_BARCODE_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --probe_set) + [ -n "$VIASH_PAR_PROBE_SET" ] && ViashError Bad arguments for option \'--probe_set\': \'$VIASH_PAR_PROBE_SET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROBE_SET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --probe_set. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --probe_set=*) + [ -n "$VIASH_PAR_PROBE_SET" ] && ViashError Bad arguments for option \'--probe_set=*\': \'$VIASH_PAR_PROBE_SET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROBE_SET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --filter_probes) + [ -n "$VIASH_PAR_FILTER_PROBES" ] && ViashError Bad arguments for option \'--filter_probes\': \'$VIASH_PAR_FILTER_PROBES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTER_PROBES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --filter_probes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --filter_probes=*) + [ -n "$VIASH_PAR_FILTER_PROBES" ] && ViashError Bad arguments for option \'--filter_probes=*\': \'$VIASH_PAR_FILTER_PROBES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTER_PROBES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --probe_barcode_ids) + if [ -z "$VIASH_PAR_PROBE_BARCODE_IDS" ]; then + VIASH_PAR_PROBE_BARCODE_IDS="$2" + else + VIASH_PAR_PROBE_BARCODE_IDS="$VIASH_PAR_PROBE_BARCODE_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --probe_barcode_ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --probe_barcode_ids=*) + if [ -z "$VIASH_PAR_PROBE_BARCODE_IDS" ]; then + VIASH_PAR_PROBE_BARCODE_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_PROBE_BARCODE_IDS="$VIASH_PAR_PROBE_BARCODE_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --emptydrops_minimum_umis) + [ -n "$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS" ] && ViashError Bad arguments for option \'--emptydrops_minimum_umis\': \'$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --emptydrops_minimum_umis. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --emptydrops_minimum_umis=*) + [ -n "$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS" ] && ViashError Bad arguments for option \'--emptydrops_minimum_umis=*\': \'$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --control_id) + if [ -z "$VIASH_PAR_CONTROL_ID" ]; then + VIASH_PAR_CONTROL_ID="$2" + else + VIASH_PAR_CONTROL_ID="$VIASH_PAR_CONTROL_ID;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --control_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --control_id=*) + if [ -z "$VIASH_PAR_CONTROL_ID" ]; then + VIASH_PAR_CONTROL_ID=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CONTROL_ID="$VIASH_PAR_CONTROL_ID;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --mhc_allele) + if [ -z "$VIASH_PAR_MHC_ALLELE" ]; then + VIASH_PAR_MHC_ALLELE="$2" + else + VIASH_PAR_MHC_ALLELE="$VIASH_PAR_MHC_ALLELE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mhc_allele. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mhc_allele=*) + if [ -z "$VIASH_PAR_MHC_ALLELE" ]; then + VIASH_PAR_MHC_ALLELE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MHC_ALLELE="$VIASH_PAR_MHC_ALLELE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --check_library_compatibility) + [ -n "$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY" ] && ViashError Bad arguments for option \'--check_library_compatibility\': \'$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --check_library_compatibility. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --check_library_compatibility=*) + [ -n "$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY" ] && ViashError Bad arguments for option \'--check_library_compatibility=*\': \'$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dryrun) + [ -n "$VIASH_PAR_DRYRUN" ] && ViashError Bad arguments for option \'--dryrun\': \'$VIASH_PAR_DRYRUN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DRYRUN=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/cellranger_multi:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GEX_REFERENCE+x} ]; then + ViashError '--gex_reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_GEX_SECONDARY_ANALYSIS+x} ]; then + VIASH_PAR_GEX_SECONDARY_ANALYSIS="false" +fi +if [ -z ${VIASH_PAR_GEX_GENERATE_BAM+x} ]; then + VIASH_PAR_GEX_GENERATE_BAM="false" +fi +if [ -z ${VIASH_PAR_GEX_INCLUDE_INTRONS+x} ]; then + VIASH_PAR_GEX_INCLUDE_INTRONS="true" +fi +if [ -z ${VIASH_PAR_GEX_CHEMISTRY+x} ]; then + VIASH_PAR_GEX_CHEMISTRY="auto" +fi +if [ -z ${VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY+x} ]; then + VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY="true" +fi +if [ -z ${VIASH_PAR_DRYRUN+x} ]; then + VIASH_PAR_DRYRUN="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_GEX_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GEX_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_ABC_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_ABC_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_CGC_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_CGC_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_MUX_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_MUX_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_VDJ_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_VDJ_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_VDJ_T_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_VDJ_T_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_VDJ_T_GD_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_VDJ_T_GD_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_VDJ_B_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_VDJ_B_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_AGC_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_AGC_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ] && [ ! -e "$VIASH_PAR_FEATURE_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_FEATURE_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GEX_REFERENCE" ] && [ ! -e "$VIASH_PAR_GEX_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_GEX_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ] && [ ! -e "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + ViashError "Input file '$VIASH_PAR_TENX_CLOUD_TOKEN_PATH' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_REFERENCE" ] && [ ! -e "$VIASH_PAR_VDJ_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_VDJ_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ] && [ ! -e "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ]; then + ViashError "Input file '$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_CMO_SET" ] && [ ! -e "$VIASH_PAR_CMO_SET" ]; then + ViashError "Input file '$VIASH_PAR_CMO_SET' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ] && [ ! -e "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ]; then + ViashError "Input file '$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_PROBE_SET" ] && [ ! -e "$VIASH_PAR_PROBE_SET" ]; then + ViashError "Input file '$VIASH_PAR_PROBE_SET' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [ -n "$VIASH_PAR_SAMPLE_EXPECT_CELLS" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_SAMPLE_EXPECT_CELLS; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sample_expect_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_SAMPLE_FORCE_CELLS" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_SAMPLE_FORCE_CELLS; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sample_force_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_FEATURE_R1_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_FEATURE_R1_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--feature_r1_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FEATURE_R2_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_FEATURE_R2_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--feature_r2_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CRISPR_UMI" ]]; then + if ! [[ "$VIASH_PAR_MIN_CRISPR_UMI" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_crispr_umi' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_MIN_CRISPR_UMI -lt 1 ]]; then + ViashError '--min_crispr_umi' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_SECONDARY_ANALYSIS" ]]; then + if ! [[ "$VIASH_PAR_GEX_SECONDARY_ANALYSIS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--gex_secondary_analysis' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_GENERATE_BAM" ]]; then + if ! [[ "$VIASH_PAR_GEX_GENERATE_BAM" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--gex_generate_bam' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_EXPECT_CELLS" ]]; then + if ! [[ "$VIASH_PAR_GEX_EXPECT_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gex_expect_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_FORCE_CELLS" ]]; then + if ! [[ "$VIASH_PAR_GEX_FORCE_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gex_force_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_INCLUDE_INTRONS" ]]; then + if ! [[ "$VIASH_PAR_GEX_INCLUDE_INTRONS" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--gex_include_introns' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_R1_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_GEX_R1_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gex_r1_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GEX_R2_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_GEX_R2_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gex_r2_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VDJ_R1_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_VDJ_R1_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--vdj_r1_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VDJ_R2_LENGTH" ]]; then + if ! [[ "$VIASH_PAR_VDJ_R2_LENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--vdj_r2_length' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE" ]]; then + if ! [[ "$VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_assignment_confidence' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_FILTER_PROBES" ]]; then + if ! [[ "$VIASH_PAR_FILTER_PROBES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--filter_probes' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS" ]]; then + if ! [[ "$VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--emptydrops_minimum_umis' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS -lt 1 ]]; then + ViashError '--emptydrops_minimum_umis' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY" ]]; then + if ! [[ "$VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--check_library_compatibility' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DRYRUN" ]]; then + if ! [[ "$VIASH_PAR_DRYRUN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dryrun' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_LIBRARY_TYPE" ]; then + VIASH_PAR_LIBRARY_TYPE_CHOICES=("Gene Expression;VDJ;VDJ-T;VDJ-B;VDJ-T-GD;Antibody Capture;CRISPR Guide Capture;Multiplexing Capture;Antigen Capture;Custom") + IFS=';' + set -f + for val in $VIASH_PAR_LIBRARY_TYPE; do + if ! [[ ";${VIASH_PAR_LIBRARY_TYPE_CHOICES[*]};" =~ ";${val};" ]]; then + ViashError '--library_type' specified value of \'${val}\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_CELL_ANNOTATION_MODEL" ]; then + VIASH_PAR_CELL_ANNOTATION_MODEL_CHOICES=("auto;human_pca_v1_beta;mouse_pca_v1_beta") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_CELL_ANNOTATION_MODEL_CHOICES[*]};" =~ ";$VIASH_PAR_CELL_ANNOTATION_MODEL;" ]]; then + ViashError '--cell_annotation_model' specified value of \'$VIASH_PAR_CELL_ANNOTATION_MODEL\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_GEX_CHEMISTRY" ]; then + VIASH_PAR_GEX_CHEMISTRY_CHOICES=("auto;threeprime;fiveprime;SC3Pv1;SC3Pv2;SC3Pv3;SC3Pv3-polyA;SC3Pv4;SC3Pv4-polyA;SC3Pv3LT;SC3Pv3HT;SC3Pv3HT-polyA;SC5P-PE;SC5P-PE-v3;SC5P-R2;SC-FB;SC5P-R2-v3;SCP5-PE-v3;SC5PHT;MFRP;MFRP-R1;MFRP-RNA;MFRP-Ab;SFRP;MFRP-Ab-R2pos50;MFRP-RNA-R1;MFRP-Ab-R1;ARC-v1") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_GEX_CHEMISTRY_CHOICES[*]};" =~ ";$VIASH_PAR_GEX_CHEMISTRY;" ]]; then + ViashError '--gex_chemistry' specified value of \'$VIASH_PAR_GEX_CHEMISTRY\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_GEX_INPUT" ]; then + VIASH_TEST_GEX_INPUT=() + IFS=';' + for var in $VIASH_PAR_GEX_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GEX_INPUT+=( "$var" ) + done + VIASH_PAR_GEX_INPUT=$(IFS=';' ; echo "${VIASH_TEST_GEX_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_ABC_INPUT" ]; then + VIASH_TEST_ABC_INPUT=() + IFS=';' + for var in $VIASH_PAR_ABC_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_ABC_INPUT+=( "$var" ) + done + VIASH_PAR_ABC_INPUT=$(IFS=';' ; echo "${VIASH_TEST_ABC_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_CGC_INPUT" ]; then + VIASH_TEST_CGC_INPUT=() + IFS=';' + for var in $VIASH_PAR_CGC_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_CGC_INPUT+=( "$var" ) + done + VIASH_PAR_CGC_INPUT=$(IFS=';' ; echo "${VIASH_TEST_CGC_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_MUX_INPUT" ]; then + VIASH_TEST_MUX_INPUT=() + IFS=';' + for var in $VIASH_PAR_MUX_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_MUX_INPUT+=( "$var" ) + done + VIASH_PAR_MUX_INPUT=$(IFS=';' ; echo "${VIASH_TEST_MUX_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_VDJ_INPUT" ]; then + VIASH_TEST_VDJ_INPUT=() + IFS=';' + for var in $VIASH_PAR_VDJ_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_VDJ_INPUT+=( "$var" ) + done + VIASH_PAR_VDJ_INPUT=$(IFS=';' ; echo "${VIASH_TEST_VDJ_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_VDJ_T_INPUT" ]; then + VIASH_TEST_VDJ_T_INPUT=() + IFS=';' + for var in $VIASH_PAR_VDJ_T_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_VDJ_T_INPUT+=( "$var" ) + done + VIASH_PAR_VDJ_T_INPUT=$(IFS=';' ; echo "${VIASH_TEST_VDJ_T_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_VDJ_T_GD_INPUT" ]; then + VIASH_TEST_VDJ_T_GD_INPUT=() + IFS=';' + for var in $VIASH_PAR_VDJ_T_GD_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_VDJ_T_GD_INPUT+=( "$var" ) + done + VIASH_PAR_VDJ_T_GD_INPUT=$(IFS=';' ; echo "${VIASH_TEST_VDJ_T_GD_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_VDJ_B_INPUT" ]; then + VIASH_TEST_VDJ_B_INPUT=() + IFS=';' + for var in $VIASH_PAR_VDJ_B_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_VDJ_B_INPUT+=( "$var" ) + done + VIASH_PAR_VDJ_B_INPUT=$(IFS=';' ; echo "${VIASH_TEST_VDJ_B_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_AGC_INPUT" ]; then + VIASH_TEST_AGC_INPUT=() + IFS=';' + for var in $VIASH_PAR_AGC_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_AGC_INPUT+=( "$var" ) + done + VIASH_PAR_AGC_INPUT=$(IFS=';' ; echo "${VIASH_TEST_AGC_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FEATURE_REFERENCE")" ) + VIASH_PAR_FEATURE_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_FEATURE_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_GEX_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GEX_REFERENCE")" ) + VIASH_PAR_GEX_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_GEX_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH")" ) + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashDockerAutodetectMount "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH") +fi +if [ ! -z "$VIASH_PAR_VDJ_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_REFERENCE")" ) + VIASH_PAR_VDJ_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS")" ) + VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS=$(ViashDockerAutodetectMount "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS") +fi +if [ ! -z "$VIASH_PAR_CMO_SET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_CMO_SET")" ) + VIASH_PAR_CMO_SET=$(ViashDockerAutodetectMount "$VIASH_PAR_CMO_SET") +fi +if [ ! -z "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT")" ) + VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT") +fi +if [ ! -z "$VIASH_PAR_PROBE_SET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PROBE_SET")" ) + VIASH_PAR_PROBE_SET=$(ViashDockerAutodetectMount "$VIASH_PAR_PROBE_SET") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_multi-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from __future__ import annotations + +import sys +import re +import subprocess +from multiprocessing import cpu_count +import tempfile +import pandas as pd +import yaml +from typing import Optional, Any, Union +import tarfile +from pathlib import Path +import shutil +from itertools import chain + +CELL_RANGER_EXEC = shutil.which("cellranger") + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gex_input': $( if [ ! -z ${VIASH_PAR_GEX_INPUT+x} ]; then echo "r'${VIASH_PAR_GEX_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'abc_input': $( if [ ! -z ${VIASH_PAR_ABC_INPUT+x} ]; then echo "r'${VIASH_PAR_ABC_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'cgc_input': $( if [ ! -z ${VIASH_PAR_CGC_INPUT+x} ]; then echo "r'${VIASH_PAR_CGC_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'mux_input': $( if [ ! -z ${VIASH_PAR_MUX_INPUT+x} ]; then echo "r'${VIASH_PAR_MUX_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'vdj_input': $( if [ ! -z ${VIASH_PAR_VDJ_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'vdj_t_input': $( if [ ! -z ${VIASH_PAR_VDJ_T_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_T_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'vdj_t_gd_input': $( if [ ! -z ${VIASH_PAR_VDJ_T_GD_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_T_GD_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'vdj_b_input': $( if [ ! -z ${VIASH_PAR_VDJ_B_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_B_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'agc_input': $( if [ ! -z ${VIASH_PAR_AGC_INPUT+x} ]; then echo "r'${VIASH_PAR_AGC_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'library_id': $( if [ ! -z ${VIASH_PAR_LIBRARY_ID+x} ]; then echo "r'${VIASH_PAR_LIBRARY_ID//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'library_type': $( if [ ! -z ${VIASH_PAR_LIBRARY_TYPE+x} ]; then echo "r'${VIASH_PAR_LIBRARY_TYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'library_subsample': $( if [ ! -z ${VIASH_PAR_LIBRARY_SUBSAMPLE+x} ]; then echo "r'${VIASH_PAR_LIBRARY_SUBSAMPLE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'library_lanes': $( if [ ! -z ${VIASH_PAR_LIBRARY_LANES+x} ]; then echo "r'${VIASH_PAR_LIBRARY_LANES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'library_chemistry': $( if [ ! -z ${VIASH_PAR_LIBRARY_CHEMISTRY+x} ]; then echo "r'${VIASH_PAR_LIBRARY_CHEMISTRY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sample_ids': $( if [ ! -z ${VIASH_PAR_SAMPLE_IDS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sample_description': $( if [ ! -z ${VIASH_PAR_SAMPLE_DESCRIPTION+x} ]; then echo "r'${VIASH_PAR_SAMPLE_DESCRIPTION//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sample_expect_cells': $( if [ ! -z ${VIASH_PAR_SAMPLE_EXPECT_CELLS+x} ]; then echo "list(map(int, r'${VIASH_PAR_SAMPLE_EXPECT_CELLS//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'sample_force_cells': $( if [ ! -z ${VIASH_PAR_SAMPLE_FORCE_CELLS+x} ]; then echo "list(map(int, r'${VIASH_PAR_SAMPLE_FORCE_CELLS//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'feature_reference': $( if [ ! -z ${VIASH_PAR_FEATURE_REFERENCE+x} ]; then echo "r'${VIASH_PAR_FEATURE_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'feature_r1_length': $( if [ ! -z ${VIASH_PAR_FEATURE_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_FEATURE_R1_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'feature_r2_length': $( if [ ! -z ${VIASH_PAR_FEATURE_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_FEATURE_R2_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_crispr_umi': $( if [ ! -z ${VIASH_PAR_MIN_CRISPR_UMI+x} ]; then echo "int(r'${VIASH_PAR_MIN_CRISPR_UMI//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gex_reference': $( if [ ! -z ${VIASH_PAR_GEX_REFERENCE+x} ]; then echo "r'${VIASH_PAR_GEX_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gex_secondary_analysis': $( if [ ! -z ${VIASH_PAR_GEX_SECONDARY_ANALYSIS+x} ]; then echo "r'${VIASH_PAR_GEX_SECONDARY_ANALYSIS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'gex_generate_bam': $( if [ ! -z ${VIASH_PAR_GEX_GENERATE_BAM+x} ]; then echo "r'${VIASH_PAR_GEX_GENERATE_BAM//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'tenx_cloud_token_path': $( if [ ! -z ${VIASH_PAR_TENX_CLOUD_TOKEN_PATH+x} ]; then echo "r'${VIASH_PAR_TENX_CLOUD_TOKEN_PATH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cell_annotation_model': $( if [ ! -z ${VIASH_PAR_CELL_ANNOTATION_MODEL+x} ]; then echo "r'${VIASH_PAR_CELL_ANNOTATION_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'gex_expect_cells': $( if [ ! -z ${VIASH_PAR_GEX_EXPECT_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GEX_EXPECT_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gex_force_cells': $( if [ ! -z ${VIASH_PAR_GEX_FORCE_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GEX_FORCE_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gex_include_introns': $( if [ ! -z ${VIASH_PAR_GEX_INCLUDE_INTRONS+x} ]; then echo "r'${VIASH_PAR_GEX_INCLUDE_INTRONS//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'gex_r1_length': $( if [ ! -z ${VIASH_PAR_GEX_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_GEX_R1_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gex_r2_length': $( if [ ! -z ${VIASH_PAR_GEX_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_GEX_R2_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gex_chemistry': $( if [ ! -z ${VIASH_PAR_GEX_CHEMISTRY+x} ]; then echo "r'${VIASH_PAR_GEX_CHEMISTRY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_reference': $( if [ ! -z ${VIASH_PAR_VDJ_REFERENCE+x} ]; then echo "r'${VIASH_PAR_VDJ_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_inner_enrichment_primers': $( if [ ! -z ${VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS+x} ]; then echo "r'${VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vdj_r1_length': $( if [ ! -z ${VIASH_PAR_VDJ_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_VDJ_R1_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'vdj_r2_length': $( if [ ! -z ${VIASH_PAR_VDJ_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_VDJ_R2_LENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cell_multiplex_oligo_ids': $( if [ ! -z ${VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS+x} ]; then echo "r'${VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'min_assignment_confidence': $( if [ ! -z ${VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE+x} ]; then echo "float(r'${VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cmo_set': $( if [ ! -z ${VIASH_PAR_CMO_SET+x} ]; then echo "r'${VIASH_PAR_CMO_SET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'barcode_sample_assignment': $( if [ ! -z ${VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT+x} ]; then echo "r'${VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'hashtag_ids': $( if [ ! -z ${VIASH_PAR_HASHTAG_IDS+x} ]; then echo "r'${VIASH_PAR_HASHTAG_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'ocm_barcode_ids': $( if [ ! -z ${VIASH_PAR_OCM_BARCODE_IDS+x} ]; then echo "r'${VIASH_PAR_OCM_BARCODE_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'probe_set': $( if [ ! -z ${VIASH_PAR_PROBE_SET+x} ]; then echo "r'${VIASH_PAR_PROBE_SET//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'filter_probes': $( if [ ! -z ${VIASH_PAR_FILTER_PROBES+x} ]; then echo "r'${VIASH_PAR_FILTER_PROBES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'probe_barcode_ids': $( if [ ! -z ${VIASH_PAR_PROBE_BARCODE_IDS+x} ]; then echo "r'${VIASH_PAR_PROBE_BARCODE_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'emptydrops_minimum_umis': $( if [ ! -z ${VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS+x} ]; then echo "int(r'${VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'control_id': $( if [ ! -z ${VIASH_PAR_CONTROL_ID+x} ]; then echo "r'${VIASH_PAR_CONTROL_ID//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'mhc_allele': $( if [ ! -z ${VIASH_PAR_MHC_ALLELE+x} ]; then echo "r'${VIASH_PAR_MHC_ALLELE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'check_library_compatibility': $( if [ ! -z ${VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY+x} ]; then echo "r'${VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'dryrun': $( if [ ! -z ${VIASH_PAR_DRYRUN+x} ]; then echo "r'${VIASH_PAR_DRYRUN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +if not CELL_RANGER_EXEC: + raise RuntimeError( + "Could not find Cell Ranger executable. Please make sure it is installed and available in your PATH." + ) + +# Tested with cellranger 7.0: +# - omitting the lane number is allowed (e.g. \`_L001\`) +# - lane number should be omitted across all files if omitted in one +# - replacing \`.fastq.\` for \`.fq.\` is NOT allowed +# - omitting \`.gz\` is allowed + +fastq_regex = r"^([A-Za-z0-9\\-_\\.]+)_S(\\d+)_(L(\\d+)_)?[RI](\\d+)_(\\d+)\\.fastq(\\.gz)?\$" +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_L001_R1_001.fastq.gz") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq.gz.txt") is None + +# Invert some parameters. Keep the original ones in the config for compatibility +inverted_params = { + "gex_no_secondary_analysis": "gex_secondary_analysis", +} +for inverted_param, param in inverted_params.items(): + par[inverted_param] = not par[param] if par[param] is not None else None + del par[param] + +GEX_CONFIG_KEYS = { + "gex_reference": "reference", + "gex_expect_cells": "expect-cells", + "gex_force_cells": "force-cells", + "gex_chemistry": "chemistry", + "gex_no_secondary_analysis": "no-secondary", + "gex_generate_bam": "create-bam", + "gex_include_introns": "include-introns", + "min_assignment_confidence": "min-assignment-confidence", + "check_library_compatibility": "check-library-compatibility", + "barcode_sample_assignment": "barcode-sample-assignment", + "cmo_set": "cmo-set", + "probe_set": "probe-set", + "filter_probes": "filter-probes", + "gex_r1_length": "r1-length", + "gex_r2_length": "r2-length", + "tenx_cloud_token_path": "tenx-cloud-token-path", + "cell_annotation_model": "cell-annotation-model", + "emptydrops_minimum_umis": "emptydrops_minimum_umis", +} + +FEATURE_CONFIG_KEYS = { + "feature_reference": "reference", + "feature_r1_length": "r1-length", + "feature_r2_length": "r2-length", + "min_crispr_umi": "min-crispr-umi", +} + +VDJ_CONFIG_KEYS = { + "vdj_reference": "reference", + "vdj_inner_enrichment_primers": "inner-enrichment-primers", + "vdj_r1_length": "r1-length", + "vdj_r2_length": "r2-length", +} + + +ANTIGEN_SPECIFICITY_CONFIG_KEYS = { + "control_id": "control_id", + "mhc_allele": "mhc_allele", +} + + +REFERENCE_SECTIONS = { + "gene-expression": (GEX_CONFIG_KEYS, "index"), + "feature": (FEATURE_CONFIG_KEYS, "index"), + "vdj": (VDJ_CONFIG_KEYS, "index"), + "antigen-specificity": (ANTIGEN_SPECIFICITY_CONFIG_KEYS, "columns"), +} + +LIBRARY_CONFIG_KEYS = { + "library_id": "fastq_id", + "library_type": "feature_types", + "library_subsample": "subsample_rate", + "library_lanes": "lanes", + "library_chemistry": "chemistry", +} + + +SAMPLE_PARAMS_CONFIG_KEYS = { + "sample_ids": "sample_id", + "cell_multiplex_oligo_ids": "cmo_ids", + "sample_description": "description", + "probe_barcode_ids": "probe_barcode_ids", + "sample_expect_cells": "expect_cells", + "sample_force_cells": "force_cells", + "hashtag_ids": "hashtag_ids", + "ocm_barcode_ids": "ocm_barcode_ids", +} + + +# These are derived from the dictionaries above +REFERENCES = tuple( + reference_param + for reference_param, cellranger_param in chain( + GEX_CONFIG_KEYS.items(), FEATURE_CONFIG_KEYS.items(), VDJ_CONFIG_KEYS.items() + ) + if cellranger_param == "reference" +) +LIBRARY_PARAMS = tuple(LIBRARY_CONFIG_KEYS.keys()) +SAMPLE_PARAMS = tuple(SAMPLE_PARAMS_CONFIG_KEYS.keys()) +HELPER_INPUT = { + "gex_input": "Gene Expression", + "abc_input": "Antibody Capture", + "cgc_input": "CRISPR Guide Capture", + "mux_input": "Multiplexing Capture", + "vdj_input": "VDJ", + "vdj_t_input": "VDJ-T", + "vdj_t_gd_input": "VDJ-T-GD", + "vdj_b_input": "VDJ-B", + "agc_input": "Antigen Capture", +} + + +def infer_library_id_from_path(input_path: str) -> str: + match = re.match(fastq_regex, input_path) + assert match is not None, ( + f"File name of '{input_path}' should match regex {fastq_regex}." + ) + return match.group(1) + + +def transform_helper_inputs(par: dict[str, Any]) -> dict[str, Any]: + helper_input = {"input": [], "library_id": [], "library_type": []} + for input_type, library_type in HELPER_INPUT.items(): + if par[input_type]: + par[input_type] = resolve_input_directories_to_fastq_paths(par[input_type]) + + library_ids = [ + infer_library_id_from_path(path.name) for path in par[input_type] + ] + + library_id_dict = {} + for fastq, library_id in zip(par[input_type], library_ids): + library_id_dict.setdefault(library_id, []).append(fastq) + + for library_id, input in library_id_dict.items(): + helper_input["input"] += input + helper_input["library_id"].append(library_id) + helper_input["library_type"].append(library_type) + + assert len(helper_input["library_id"]) == len(set(helper_input["library_id"])), ( + "File names passed to feature type-specific inputs must be unique" + ) + + return helper_input + + +def lengths_gt1(dic: dict[str, Optional[list[Any]]]) -> dict[str, int]: + return { + key: len(li) + for key, li in dic.items() + if li is not None and isinstance(li, (list, tuple, set)) + } + + +def strip_margin(text: str) -> str: + return re.sub("(\\n?)[ \\t]*\\|", "\\\\1", text) + + +def subset_dict( + dictionary: dict[str, str], keys: Union[dict[str, str], list[str]] +) -> dict[str, str]: + if isinstance(keys, (list, tuple)): + keys = {key: key for key in keys} + return { + dest_key: dictionary[orig_key] + for orig_key, dest_key in keys.items() + if dictionary[orig_key] is not None + } + + +def check_subset_dict_equal_length( + group_name: str, dictionary: dict[str, list[str]] +) -> None: + lens = lengths_gt1(dictionary) + assert len(set(lens.values())) <= 1, ( + f"The number of values passed to {group_name} " + f"arguments must be 0, 1 or all the same. Offenders: {lens}" + ) + + +def resolve_input_directories_to_fastq_paths(input_paths: list[str]) -> list[Path]: + input_paths = [Path(fastq) for fastq in input_paths] + if len(input_paths) == 1 and input_paths[0].is_dir(): + logger.info( + "Detected a directory in input paths, " + "traversing to see if we can detect any FASTQ files." + ) + input_paths = [ + input_path + for input_path in input_paths[0].rglob("*") + if re.match(fastq_regex, input_path.name) + ] + + # check input fastq files + for input_path in input_paths: + assert re.match(fastq_regex, input_path.name) is not None, ( + f"File name of --input '{input_path}' should match regex {fastq_regex}." + ) + + return input_paths + + +def make_paths_absolute(par: dict[str, Any], config: Path | str): + with open(config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "file": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + def make_path_absolute(file: str | Path) -> Path: + logger.info("Making path %s absolute", file) + return Path(file).resolve() + + new_arg = ( + [make_path_absolute(file) for file in par_value] + if is_multiple + else make_path_absolute(par_value) + ) + par[arg_name] = new_arg + return par + + +def handle_integers_not_set(par: dict[str, Any], viash_config: Path | str) -> str: + """ + Allow to use \`-1\` to define a 'not set' value for arguments of \`type: integer\` with \`multiple: true\`. + """ + with open(viash_config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "integer": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + if not is_multiple: + continue + + def replace_notset_values(integer_value: int) -> int | None: + return None if integer_value == -1 else integer_value + + # Use an extension array to handle "None" values, otherwise int + NA + # values would be converted to a "float" dtype + new_arg = pd.array( + [replace_notset_values(value) for value in par_value], dtype="Int64" + ) + par[arg_name] = new_arg + return par + + +def process_params(par: dict[str, Any], viash_config: Path | str) -> str: + if par["input"]: + assert len(par["library_type"]) > 0, ( + "--library_type must be defined when passing input to --input" + ) + assert len(par["library_id"]) > 0, ( + "--library_id must be defined when passing input to --input" + ) + + # if par["input"] is a directory, look for fastq files + par["input"] = resolve_input_directories_to_fastq_paths(par["input"]) + + # add helper input + helper_input = transform_helper_inputs(par) + for key in ["input", "library_id", "library_type"]: + par[key] = (par[key] if par[key] else []) + helper_input[key] + + assert len(par[key]) > 0, ( + f"Either --{key} or feature type-specific input (e.g. --gex_input, --abc_input, ...) must be defined" + ) + + # check lengths of libraries metadata + library_dict = subset_dict(par, LIBRARY_PARAMS) + check_subset_dict_equal_length("Library", library_dict) + + samples_dict = subset_dict(par, SAMPLE_PARAMS) + check_subset_dict_equal_length("Samples", samples_dict) + + # Allow using -1 to indicate unset integers for arguments + # that accept multiple integers. + par = handle_integers_not_set(par, viash_config) + + # use absolute paths + return make_paths_absolute(par, viash_config) + + +def generate_csv_category(name: str, args: dict[str, str], orient: str) -> list[str]: + assert orient in ("index", "columns") + if not args: + return [] + title = [f"[{name}]"] + # Which index to include in csv section is based on orientation + to_csv_args = {"index": (orient == "index"), "header": (orient == "columns")} + values = [pd.DataFrame.from_dict(args, orient=orient).to_csv(**to_csv_args).strip()] + return title + values + [""] + + +def generate_config(par: dict[str, Any], fastq_dir: str) -> str: + content_list = [] + par["fastqs"] = fastq_dir + libraries = dict(LIBRARY_CONFIG_KEYS, **{"fastqs": "fastqs"}) + # TODO: use the union (|) operator when python is updated to 3.9 + all_sections = REFERENCE_SECTIONS | { + "libraries": (libraries, "columns"), + "samples": (SAMPLE_PARAMS_CONFIG_KEYS, "columns"), + } + for section_name, (section_params, orientation) in all_sections.items(): + reference_pars = subset_dict(par, section_params) + content_list += generate_csv_category( + section_name, reference_pars, orient=orientation + ) + + return "\\n".join(content_list) + + +def untar(tar_file, output_directory): + logger.info("Looking at %s to check if it needs decompressing", tar_file) + if tar_file and Path(tar_file).is_file() and tarfile.is_tarfile(tar_file): + extaction_dir_name = Path( + tar_file.stem + ).stem # Remove two extensions (if they exist) + unpacked_directory = output_directory / extaction_dir_name + logger.info("Extracting %s to %s", tar_file, unpacked_directory) + + with tarfile.open(tar_file, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_directory, members=members_to_move) + return unpacked_directory + return tar_file + + +def main(par: dict[str, Any], meta: dict[str, Any]): + logger.info("Processing params") + par = process_params(par, meta["config"]) + logger.info(par) + + with tempfile.TemporaryDirectory( + prefix="cellranger_multi-", dir=meta["temp_dir"] + ) as temp_dir: + logger.info("Using temporary directory %s", temp_dir) + temp_dir_path = Path(temp_dir) + for reference_par_name in REFERENCES: + par[reference_par_name] = untar(par[reference_par_name], temp_dir_path) + + logger.info("Creating symbolic links to temporary directory") + # Creating symlinks of fastq files to tempdir + input_symlinks_dir = temp_dir_path / "input_symlinks" + input_symlinks_dir.mkdir() + for fastq in par["input"]: + destination = input_symlinks_dir / fastq.name + destination.symlink_to(fastq) + + logger.info("Creating config file") + config_content = generate_config(par, input_symlinks_dir) + logger.info("Using config: \\n\\t%s", config_content.replace("\\n", "\\n\\t")) + par["output"].mkdir(parents=True, exist_ok=True) + config_file = par["output"] / "config.csv" + with open(config_file, "w") as f: + f.write(config_content) + + logger.info("Creating Cell Ranger argument list") + temp_id = "run" + cmd = [CELL_RANGER_EXEC, "multi", "--disable-ui", "--id", temp_id] + + command_line_parameters = { + "--localcores": int(meta["cpus"]) if meta["cpus"] else cpu_count() - 1, + "--csv": config_file, + } + if meta["memory_gb"]: + command_line_parameters["--localmem"] = ( + int(meta["memory_gb"]) - 2 + if meta["memory_gb"] > 2 + else meta["memory_gb"] + ) + cmd += [ + f"{param}={param_value}" + for param, param_value in command_line_parameters.items() + ] + + logger.info("> " + " ".join(cmd)) + + if par["dryrun"]: + logger.info("Not executing command. Dry run enabled.") + return + + logger.info("Starting Cell Ranger") + with ( + subprocess.Popen( + cmd, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=1, + cwd=temp_dir, + ) as process, + (par["output"] / "cellranger_multi.log").open("w") as open_log, + ): + for line in process.stdout: + print(line.strip(), flush=True) + open_log.write(line) + logger.info("Checking exit code.") + # Popen objects do not have a check_exitcode method. + if process.returncode != 0: + raise RuntimeError("Cell Ranger returned a nonzero exitcode!") + + logger.info("Checking if expected output files are present.") + # look for output dir file + tmp_output_dir = temp_dir_path / temp_id / "outs" + expected_files = { + Path("multi"): Path.is_dir, + Path("per_sample_outs"): Path.is_dir, + Path("config.csv"): Path.is_file, + } + for file_path, type_func in expected_files.items(): + output_path = tmp_output_dir / file_path + if not type_func(output_path): + raise ValueError(f"Could not find expected '{output_path}'") + logger.info("Copying output files to %s", par["output"]) + for output_path in tmp_output_dir.rglob("*"): + if output_path.name != "config.csv": # Already created + shutil.move(str(output_path), par["output"]) + + +if __name__ == "__main__": + logger.info("Component '%s' started.", meta["name"]) + main(par, meta) + logger.info("Finished!") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_GEX_INPUT" ]; then + unset VIASH_TEST_GEX_INPUT + IFS=';' + for var in $VIASH_PAR_GEX_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_GEX_INPUT" ]; then + VIASH_TEST_GEX_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GEX_INPUT="$VIASH_TEST_GEX_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GEX_INPUT="$VIASH_TEST_GEX_INPUT" + fi + if [ ! -z "$VIASH_PAR_ABC_INPUT" ]; then + unset VIASH_TEST_ABC_INPUT + IFS=';' + for var in $VIASH_PAR_ABC_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_ABC_INPUT" ]; then + VIASH_TEST_ABC_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_ABC_INPUT="$VIASH_TEST_ABC_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_ABC_INPUT="$VIASH_TEST_ABC_INPUT" + fi + if [ ! -z "$VIASH_PAR_CGC_INPUT" ]; then + unset VIASH_TEST_CGC_INPUT + IFS=';' + for var in $VIASH_PAR_CGC_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_CGC_INPUT" ]; then + VIASH_TEST_CGC_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_CGC_INPUT="$VIASH_TEST_CGC_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_CGC_INPUT="$VIASH_TEST_CGC_INPUT" + fi + if [ ! -z "$VIASH_PAR_MUX_INPUT" ]; then + unset VIASH_TEST_MUX_INPUT + IFS=';' + for var in $VIASH_PAR_MUX_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_MUX_INPUT" ]; then + VIASH_TEST_MUX_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_MUX_INPUT="$VIASH_TEST_MUX_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_MUX_INPUT="$VIASH_TEST_MUX_INPUT" + fi + if [ ! -z "$VIASH_PAR_VDJ_INPUT" ]; then + unset VIASH_TEST_VDJ_INPUT + IFS=';' + for var in $VIASH_PAR_VDJ_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_VDJ_INPUT" ]; then + VIASH_TEST_VDJ_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_VDJ_INPUT="$VIASH_TEST_VDJ_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_VDJ_INPUT="$VIASH_TEST_VDJ_INPUT" + fi + if [ ! -z "$VIASH_PAR_VDJ_T_INPUT" ]; then + unset VIASH_TEST_VDJ_T_INPUT + IFS=';' + for var in $VIASH_PAR_VDJ_T_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_VDJ_T_INPUT" ]; then + VIASH_TEST_VDJ_T_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_VDJ_T_INPUT="$VIASH_TEST_VDJ_T_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_VDJ_T_INPUT="$VIASH_TEST_VDJ_T_INPUT" + fi + if [ ! -z "$VIASH_PAR_VDJ_T_GD_INPUT" ]; then + unset VIASH_TEST_VDJ_T_GD_INPUT + IFS=';' + for var in $VIASH_PAR_VDJ_T_GD_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_VDJ_T_GD_INPUT" ]; then + VIASH_TEST_VDJ_T_GD_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_VDJ_T_GD_INPUT="$VIASH_TEST_VDJ_T_GD_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_VDJ_T_GD_INPUT="$VIASH_TEST_VDJ_T_GD_INPUT" + fi + if [ ! -z "$VIASH_PAR_VDJ_B_INPUT" ]; then + unset VIASH_TEST_VDJ_B_INPUT + IFS=';' + for var in $VIASH_PAR_VDJ_B_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_VDJ_B_INPUT" ]; then + VIASH_TEST_VDJ_B_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_VDJ_B_INPUT="$VIASH_TEST_VDJ_B_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_VDJ_B_INPUT="$VIASH_TEST_VDJ_B_INPUT" + fi + if [ ! -z "$VIASH_PAR_AGC_INPUT" ]; then + unset VIASH_TEST_AGC_INPUT + IFS=';' + for var in $VIASH_PAR_AGC_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_AGC_INPUT" ]; then + VIASH_TEST_AGC_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_AGC_INPUT="$VIASH_TEST_AGC_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_AGC_INPUT="$VIASH_TEST_AGC_INPUT" + fi + if [ ! -z "$VIASH_PAR_FEATURE_REFERENCE" ]; then + VIASH_PAR_FEATURE_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_FEATURE_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_GEX_REFERENCE" ]; then + VIASH_PAR_GEX_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_GEX_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH" ]; then + VIASH_PAR_TENX_CLOUD_TOKEN_PATH=$(ViashDockerStripAutomount "$VIASH_PAR_TENX_CLOUD_TOKEN_PATH") + fi + if [ ! -z "$VIASH_PAR_VDJ_REFERENCE" ]; then + VIASH_PAR_VDJ_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS" ]; then + VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS=$(ViashDockerStripAutomount "$VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS") + fi + if [ ! -z "$VIASH_PAR_CMO_SET" ]; then + VIASH_PAR_CMO_SET=$(ViashDockerStripAutomount "$VIASH_PAR_CMO_SET") + fi + if [ ! -z "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT" ]; then + VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT") + fi + if [ ! -z "$VIASH_PAR_PROBE_SET" ]; then + VIASH_PAR_PROBE_SET=$(ViashDockerStripAutomount "$VIASH_PAR_PROBE_SET") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/cellranger_multi/nextflow_labels.config b/target/executable/mapping/cellranger_multi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/cellranger_multi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/cellranger_multi/setup_logger.py b/target/executable/mapping/cellranger_multi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/mapping/cellranger_multi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/mapping/htseq_count/.config.vsh.yaml b/target/executable/mapping/htseq_count/.config.vsh.yaml new file mode 100644 index 00000000..17a8fd21 --- /dev/null +++ b/target/executable/mapping/htseq_count/.config.vsh.yaml @@ -0,0 +1,472 @@ +name: "htseq_count" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Path to the SAM/BAM files containing the mapped reads." + info: + orig_arg: "samfilenames" + example: + - "mysample1.BAM" + - "mysample2.BAM" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "Path to the GTF file containing the features." + info: + orig_arg: "featurefilename" + example: + - "reference.gtf" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Filename to output the counts to." + info: + orig_arg: "--counts_output" + example: + - "htseq-count.tsv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_delimiter" + description: "Column delimiter in output." + info: + orig_arg: "--delimiter" + example: + - "\t" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_sam" + description: "Write out all SAM alignment records into SAM/BAM files (one per\ + \ input file needed), \nannotating each line with its feature assignment (as\ + \ an optional field with tag 'XF'). \nSee the -p option to use BAM instead of\ + \ SAM.\n" + info: + orig_arg: "--samout" + example: + - "mysample1_out.BAM" + - "mysample2_out.BAM" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_sam_format" + description: "Format to use with the --output_sam argument." + info: + orig_arg: "--samout-format" + required: false + choices: + - "sam" + - "bam" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--order" + alternatives: + - "-r" + description: "Sorting order of . Paired-end sequencing data must\ + \ be sorted either by position or\nby read name, and the sorting order must\ + \ be specified. Ignored for single-end data.\n" + info: + orig_arg: "--order" + default: + - "name" + required: false + choices: + - "pos" + - "name" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--stranded" + alternatives: + - "-s" + description: "Whether the data is from a strand-specific assay. 'reverse' means\ + \ 'yes' with reversed strand interpretation." + info: + orig_arg: "--stranded" + default: + - "yes" + required: false + choices: + - "yes" + - "no" + - "reverse" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--minimum_alignment_quality" + alternatives: + - "-a" + - "--minaqual" + description: "Skip all reads with MAPQ alignment quality lower than the given\ + \ minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends\ + \ on the software \nused to map the reads.\n" + info: + orig_arg: "--minaqual" + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--type" + alternatives: + - "-t" + description: "Feature type (3rd column in GTF file) to be used, all features of\ + \ other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + orig_arg: "--type" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_attribute" + alternatives: + - "-i" + description: "GTF attribute to be used as feature ID (default, suitable for Ensembl\ + \ GTF files: gene_id).\nAll feature of the right type (see -t option) within\ + \ the same GTF attribute will be added\ntogether. The typical way of using this\ + \ option is to count all exonic reads from each gene\nand add the exons but\ + \ other uses are possible as well. You can call this option multiple\ntimes:\ + \ in that case, the combination of all attributes separated by colons (:) will\ + \ be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i\ + \ exon_number.\n" + info: + orig_arg: "--idattr" + example: + - "gene_id" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--additional_attributes" + description: "Additional feature attributes (suitable for Ensembl GTF files: gene_name).\ + \ Use multiple times\nfor more than one additional attribute. These attributes\ + \ are only used as annotations in the\noutput, while the determination of how\ + \ the counts are added together is done based on option -i.\n" + info: + orig_arg: "--additional-attr" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--add_chromosome_info" + description: "Store information about the chromosome of each feature as an additional\ + \ attribute\n(e.g. colunm in the TSV output file).\n" + info: + orig_arg: "--add-chromosome-info" + direction: "input" + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "Mode to handle reads overlapping more than one feature." + info: + orig_arg: "--mode" + default: + - "union" + required: false + choices: + - "union" + - "intersection-strict" + - "intersection-nonempty" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_unique" + description: "Whether and how to score reads that are not uniquely aligned or\ + \ ambiguously assigned to features." + info: + orig_arg: "--nonunique" + default: + - "none" + required: false + choices: + - "none" + - "all" + - "fraction" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--secondary_alignments" + description: "Whether to score secondary alignments (0x100 flag)." + info: + orig_arg: "--secondary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--supplementary_alignments" + description: "Whether to score supplementary alignments (0x800 flag)." + info: + orig_arg: "--supplementary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--counts_output_sparse" + description: "Store the counts as a sparse matrix (mtx, h5ad, loom)." + info: + orig_arg: "--counts-output-sparse" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Quantify gene expression for subsequent testing for differential expression.\n\ + \nThis script takes one or more alignment files in SAM/BAM format and a feature\ + \ file in GFF format and calculates for each feature the number of reads mapping\ + \ to it. \n\nSee http://htseq.readthedocs.io/en/master/count.html for details.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "HTSeq" + - "pyyaml" + - "scipy" + - "pandas" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/htseq_count/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/htseq_count" + executable: "target/executable/mapping/htseq_count/htseq_count" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/htseq_count/htseq_count b/target/executable/mapping/htseq_count/htseq_count new file mode 100755 index 00000000..1c3def21 --- /dev/null +++ b/target/executable/mapping/htseq_count/htseq_count @@ -0,0 +1,1792 @@ +#!/usr/bin/env bash + +# htseq_count main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Angela Oliveira Pisco (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="htseq_count" +VIASH_META_FUNCTIONALITY_NAME="htseq_count" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "HTSeq" "pyyaml" "scipy" "pandas" "numpy<2" + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Angela Oliveira Pisco" +LABEL org.opencontainers.image.description="Companion container for running component mapping htseq_count" +LABEL org.opencontainers.image.created="2025-08-22T07:23:23Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "htseq_count main" + echo "" + echo "Quantify gene expression for subsequent testing for differential expression." + echo "" + echo "This script takes one or more alignment files in SAM/BAM format and a feature" + echo "file in GFF format and calculates for each feature the number of reads mapping" + echo "to it." + echo "" + echo "See http://htseq.readthedocs.io/en/master/count.html for details." + echo "" + echo "Input:" + echo " --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: mysample1.BAM;mysample2.BAM" + echo " Path to the SAM/BAM files containing the mapped reads." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: reference.gtf" + echo " Path to the GTF file containing the features." + echo "" + echo "Output:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: htseq-count.tsv" + echo " Filename to output the counts to." + echo "" + echo " --output_delimiter" + echo " type: string" + echo " example:" + echo " Column delimiter in output." + echo "" + echo " --output_sam" + echo " type: file, multiple values allowed, output, file must exist" + echo " example: mysample1_out.BAM;mysample2_out.BAM" + echo " Write out all SAM alignment records into SAM/BAM files (one per input" + echo " file needed)," + echo " annotating each line with its feature assignment (as an optional field" + echo " with tag 'XF')." + echo " See the -p option to use BAM instead of SAM." + echo "" + echo " --output_sam_format" + echo " type: string" + echo " choices: [ sam, bam ]" + echo " Format to use with the --output_sam argument." + echo "" + echo "Arguments:" + echo " -r, --order" + echo " type: string" + echo " default: name" + echo " choices: [ pos, name ]" + echo " Sorting order of . Paired-end sequencing data must be" + echo " sorted either by position or" + echo " by read name, and the sorting order must be specified. Ignored for" + echo " single-end data." + echo "" + echo " -s, --stranded" + echo " type: string" + echo " default: yes" + echo " choices: [ yes, no, reverse ]" + echo " Whether the data is from a strand-specific assay. 'reverse' means 'yes'" + echo " with reversed strand interpretation." + echo "" + echo " -a, --minaqual, --minimum_alignment_quality" + echo " type: integer" + echo " default: 10" + echo " Skip all reads with MAPQ alignment quality lower than the given minimum" + echo " value." + echo " MAPQ is the 5th column of a SAM/BAM file and its usage depends on the" + echo " software" + echo " used to map the reads." + echo "" + echo " -t, --type" + echo " type: string" + echo " example: exon" + echo " Feature type (3rd column in GTF file) to be used, all features of other" + echo " type are ignored (default, suitable for Ensembl GTF files: exon)" + echo "" + echo " -i, --id_attribute" + echo " type: string, multiple values allowed" + echo " example: gene_id" + echo " GTF attribute to be used as feature ID (default, suitable for Ensembl" + echo " GTF files: gene_id)." + echo " All feature of the right type (see -t option) within the same GTF" + echo " attribute will be added" + echo " together. The typical way of using this option is to count all exonic" + echo " reads from each gene" + echo " and add the exons but other uses are possible as well. You can call this" + echo " option multiple" + echo " times: in that case, the combination of all attributes separated by" + echo " colons (:) will be used" + echo " as a unique identifier, e.g. for exons you might use -i gene_id -i" + echo " exon_number." + echo "" + echo " --additional_attributes" + echo " type: string, multiple values allowed" + echo " example: gene_name" + echo " Additional feature attributes (suitable for Ensembl GTF files:" + echo " gene_name). Use multiple times" + echo " for more than one additional attribute. These attributes are only used" + echo " as annotations in the" + echo " output, while the determination of how the counts are added together is" + echo " done based on option -i." + echo "" + echo " --add_chromosome_info" + echo " type: boolean_true" + echo " Store information about the chromosome of each feature as an additional" + echo " attribute" + echo " (e.g. colunm in the TSV output file)." + echo "" + echo " -m, --mode" + echo " type: string" + echo " default: union" + echo " choices: [ union, intersection-strict, intersection-nonempty ]" + echo " Mode to handle reads overlapping more than one feature." + echo "" + echo " --non_unique" + echo " type: string" + echo " default: none" + echo " choices: [ none, all, fraction, random ]" + echo " Whether and how to score reads that are not uniquely aligned or" + echo " ambiguously assigned to features." + echo "" + echo " --secondary_alignments" + echo " type: string" + echo " choices: [ score, ignore ]" + echo " Whether to score secondary alignments (0x100 flag)." + echo "" + echo " --supplementary_alignments" + echo " type: string" + echo " choices: [ score, ignore ]" + echo " Whether to score supplementary alignments (0x800 flag)." + echo "" + echo " --counts_output_sparse" + echo " type: boolean_true" + echo " Store the counts as a sparse matrix (mtx, h5ad, loom)." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "htseq_count main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_delimiter) + [ -n "$VIASH_PAR_OUTPUT_DELIMITER" ] && ViashError Bad arguments for option \'--output_delimiter\': \'$VIASH_PAR_OUTPUT_DELIMITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_DELIMITER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_delimiter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_delimiter=*) + [ -n "$VIASH_PAR_OUTPUT_DELIMITER" ] && ViashError Bad arguments for option \'--output_delimiter=*\': \'$VIASH_PAR_OUTPUT_DELIMITER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_DELIMITER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_sam) + [ -n "$VIASH_PAR_OUTPUT_SAM" ] && ViashError Bad arguments for option \'--output_sam\': \'$VIASH_PAR_OUTPUT_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_sam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_sam=*) + [ -n "$VIASH_PAR_OUTPUT_SAM" ] && ViashError Bad arguments for option \'--output_sam=*\': \'$VIASH_PAR_OUTPUT_SAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_sam_format) + [ -n "$VIASH_PAR_OUTPUT_SAM_FORMAT" ] && ViashError Bad arguments for option \'--output_sam_format\': \'$VIASH_PAR_OUTPUT_SAM_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SAM_FORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_sam_format. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_sam_format=*) + [ -n "$VIASH_PAR_OUTPUT_SAM_FORMAT" ] && ViashError Bad arguments for option \'--output_sam_format=*\': \'$VIASH_PAR_OUTPUT_SAM_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_SAM_FORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --order) + [ -n "$VIASH_PAR_ORDER" ] && ViashError Bad arguments for option \'--order\': \'$VIASH_PAR_ORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --order. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --order=*) + [ -n "$VIASH_PAR_ORDER" ] && ViashError Bad arguments for option \'--order=*\': \'$VIASH_PAR_ORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + -r) + [ -n "$VIASH_PAR_ORDER" ] && ViashError Bad arguments for option \'-r\': \'$VIASH_PAR_ORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -r. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --stranded) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'--stranded\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --stranded. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --stranded=*) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'--stranded=*\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED=$(ViashRemoveFlags "$1") + shift 1 + ;; + -s) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'-s\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minimum_alignment_quality) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minimum_alignment_quality\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --minimum_alignment_quality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minimum_alignment_quality=*) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minimum_alignment_quality=*\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + -a) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'-a\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -a. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minaqual) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minaqual\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --minaqual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --type) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'--type\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --type. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --type=*) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'--type=*\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -t) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'-t\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id_attribute) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE="$2" + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --id_attribute. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id_attribute=*) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE="$2" + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --additional_attributes) + if [ -z "$VIASH_PAR_ADDITIONAL_ATTRIBUTES" ]; then + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$2" + else + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$VIASH_PAR_ADDITIONAL_ATTRIBUTES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --additional_attributes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --additional_attributes=*) + if [ -z "$VIASH_PAR_ADDITIONAL_ATTRIBUTES" ]; then + VIASH_PAR_ADDITIONAL_ATTRIBUTES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$VIASH_PAR_ADDITIONAL_ATTRIBUTES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --add_chromosome_info) + [ -n "$VIASH_PAR_ADD_CHROMOSOME_INFO" ] && ViashError Bad arguments for option \'--add_chromosome_info\': \'$VIASH_PAR_ADD_CHROMOSOME_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ADD_CHROMOSOME_INFO=true + shift 1 + ;; + --mode) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mode=*) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode=*\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -m) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'-m\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -m. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --non_unique) + [ -n "$VIASH_PAR_NON_UNIQUE" ] && ViashError Bad arguments for option \'--non_unique\': \'$VIASH_PAR_NON_UNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NON_UNIQUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --non_unique. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --non_unique=*) + [ -n "$VIASH_PAR_NON_UNIQUE" ] && ViashError Bad arguments for option \'--non_unique=*\': \'$VIASH_PAR_NON_UNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NON_UNIQUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --secondary_alignments) + [ -n "$VIASH_PAR_SECONDARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--secondary_alignments\': \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ALIGNMENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --secondary_alignments. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --secondary_alignments=*) + [ -n "$VIASH_PAR_SECONDARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--secondary_alignments=*\': \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ALIGNMENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --supplementary_alignments) + [ -n "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--supplementary_alignments\': \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --supplementary_alignments. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --supplementary_alignments=*) + [ -n "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--supplementary_alignments=*\': \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --counts_output_sparse) + [ -n "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" ] && ViashError Bad arguments for option \'--counts_output_sparse\': \'$VIASH_PAR_COUNTS_OUTPUT_SPARSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COUNTS_OUTPUT_SPARSE=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/htseq_count:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_ORDER+x} ]; then + VIASH_PAR_ORDER="name" +fi +if [ -z ${VIASH_PAR_STRANDED+x} ]; then + VIASH_PAR_STRANDED="yes" +fi +if [ -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="10" +fi +if [ -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then + VIASH_PAR_ADD_CHROMOSOME_INFO="false" +fi +if [ -z ${VIASH_PAR_MODE+x} ]; then + VIASH_PAR_MODE="union" +fi +if [ -z ${VIASH_PAR_NON_UNIQUE+x} ]; then + VIASH_PAR_NON_UNIQUE="none" +fi +if [ -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then + VIASH_PAR_COUNTS_OUTPUT_SPARSE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_OUTPUT_SAM" ]]; then + if ! [[ "$VIASH_PAR_OUTPUT_SAM" =~ \* ]]; then + ViashError '--output_sam' has to be a path containing a wildcard, e.g. 'output_*.txt'. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--minimum_alignment_quality' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ADD_CHROMOSOME_INFO" ]]; then + if ! [[ "$VIASH_PAR_ADD_CHROMOSOME_INFO" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--add_chromosome_info' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" ]]; then + if ! [[ "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--counts_output_sparse' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_SAM_FORMAT" ]; then + VIASH_PAR_OUTPUT_SAM_FORMAT_CHOICES=("sam;bam") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_SAM_FORMAT_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_SAM_FORMAT;" ]]; then + ViashError '--output_sam_format' specified value of \'$VIASH_PAR_OUTPUT_SAM_FORMAT\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_ORDER" ]; then + VIASH_PAR_ORDER_CHOICES=("pos;name") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_ORDER_CHOICES[*]};" =~ ";$VIASH_PAR_ORDER;" ]]; then + ViashError '--order' specified value of \'$VIASH_PAR_ORDER\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_STRANDED" ]; then + VIASH_PAR_STRANDED_CHOICES=("yes;no;reverse") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_STRANDED_CHOICES[*]};" =~ ";$VIASH_PAR_STRANDED;" ]]; then + ViashError '--stranded' specified value of \'$VIASH_PAR_STRANDED\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_MODE" ]; then + VIASH_PAR_MODE_CHOICES=("union;intersection-strict;intersection-nonempty") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODE_CHOICES[*]};" =~ ";$VIASH_PAR_MODE;" ]]; then + ViashError '--mode' specified value of \'$VIASH_PAR_MODE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_NON_UNIQUE" ]; then + VIASH_PAR_NON_UNIQUE_CHOICES=("none;all;fraction;random") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_NON_UNIQUE_CHOICES[*]};" =~ ";$VIASH_PAR_NON_UNIQUE;" ]]; then + ViashError '--non_unique' specified value of \'$VIASH_PAR_NON_UNIQUE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SECONDARY_ALIGNMENTS" ]; then + VIASH_PAR_SECONDARY_ALIGNMENTS_CHOICES=("score;ignore") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SECONDARY_ALIGNMENTS_CHOICES[*]};" =~ ";$VIASH_PAR_SECONDARY_ALIGNMENTS;" ]]; then + ViashError '--secondary_alignments' specified value of \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ]; then + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS_CHOICES=("score;ignore") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS_CHOICES[*]};" =~ ";$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS;" ]]; then + ViashError '--supplementary_alignments' specified value of \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SAM" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_SAM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_SAM")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SAM" ]; then + VIASH_TEST_OUTPUT_SAM=() + IFS=';' + for var in $VIASH_PAR_OUTPUT_SAM; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_OUTPUT_SAM+=( "$var" ) + VIASH_CHOWN_VARS+=( "$var" ) + done + VIASH_PAR_OUTPUT_SAM=$(IFS=';' ; echo "${VIASH_TEST_OUTPUT_SAM[*]}") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-htseq_count-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil +import yaml + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_delimiter': $( if [ ! -z ${VIASH_PAR_OUTPUT_DELIMITER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_DELIMITER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_sam': $( if [ ! -z ${VIASH_PAR_OUTPUT_SAM+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SAM//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output_sam_format': $( if [ ! -z ${VIASH_PAR_OUTPUT_SAM_FORMAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SAM_FORMAT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'order': $( if [ ! -z ${VIASH_PAR_ORDER+x} ]; then echo "r'${VIASH_PAR_ORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'stranded': $( if [ ! -z ${VIASH_PAR_STRANDED+x} ]; then echo "r'${VIASH_PAR_STRANDED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'minimum_alignment_quality': $( if [ ! -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then echo "int(r'${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'type': $( if [ ! -z ${VIASH_PAR_TYPE+x} ]; then echo "r'${VIASH_PAR_TYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'id_attribute': $( if [ ! -z ${VIASH_PAR_ID_ATTRIBUTE+x} ]; then echo "r'${VIASH_PAR_ID_ATTRIBUTE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'additional_attributes': $( if [ ! -z ${VIASH_PAR_ADDITIONAL_ATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_ADDITIONAL_ATTRIBUTES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'add_chromosome_info': $( if [ ! -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then echo "r'${VIASH_PAR_ADD_CHROMOSOME_INFO//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'mode': $( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "r'${VIASH_PAR_MODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'non_unique': $( if [ ! -z ${VIASH_PAR_NON_UNIQUE+x} ]; then echo "r'${VIASH_PAR_NON_UNIQUE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'secondary_alignments': $( if [ ! -z ${VIASH_PAR_SECONDARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SECONDARY_ALIGNMENTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'supplementary_alignments': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'counts_output_sparse': $( if [ ! -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then echo "r'${VIASH_PAR_COUNTS_OUTPUT_SPARSE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print( + f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print( + f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + orig_arg = arg.get("info", {}).get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +######################## +### Main code ### +######################## + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + + +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + print(">> Constructing command", flush=True) + cmd_args = ["htseq-count"] + generate_args(par, config) + + # manually process cpus parameter + if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--nprocesses", str(meta["cpus"])]) + + print(">> Running htseq-count with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_SAM" ]; then + VIASH_PAR_OUTPUT_SAM=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_SAM") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_SAM" ] && ! compgen -G "$VIASH_PAR_OUTPUT_SAM" > /dev/null; then + ViashError "Output file '$VIASH_PAR_OUTPUT_SAM' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/htseq_count/nextflow_labels.config b/target/executable/mapping/htseq_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/htseq_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/htseq_count_to_h5mu/.config.vsh.yaml b/target/executable/mapping/htseq_count_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..164b8ac4 --- /dev/null +++ b/target/executable/mapping/htseq_count_to_h5mu/.config.vsh.yaml @@ -0,0 +1,283 @@ +name: "htseq_count_to_h5mu" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--input_id" + description: "The obs index for the counts" + info: null + example: + - "foo" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_counts" + description: "The counts as a TSV file as output by HTSeq." + info: null + example: + - "counts.tsv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The GTF file." + info: null + example: + - "gencode_v41_star" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the htseq table to a h5mu.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "gtfparse" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "pyarrow~=18.0.0" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/htseq_count_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/htseq_count_to_h5mu" + executable: "target/executable/mapping/htseq_count_to_h5mu/htseq_count_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/htseq_count_to_h5mu/htseq_count_to_h5mu b/target/executable/mapping/htseq_count_to_h5mu/htseq_count_to_h5mu new file mode 100755 index 00000000..54eae487 --- /dev/null +++ b/target/executable/mapping/htseq_count_to_h5mu/htseq_count_to_h5mu @@ -0,0 +1,1366 @@ +#!/usr/bin/env bash + +# htseq_count_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Angela Oliveira Pisco (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="htseq_count_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="htseq_count_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "gtfparse" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "pyarrow~=18.0.0" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Angela Oliveira Pisco" +LABEL org.opencontainers.image.description="Companion container for running component mapping htseq_count_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:23Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "htseq_count_to_h5mu main" + echo "" + echo "Convert the htseq table to a h5mu." + echo "" + echo "Input:" + echo " --input_id" + echo " type: string, required parameter, multiple values allowed" + echo " example: foo" + echo " The obs index for the counts" + echo "" + echo " --input_counts" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: counts.tsv" + echo " The counts as a TSV file as output by HTSeq." + echo "" + echo " --reference" + echo " type: file, required parameter, file must exist" + echo " example: gencode_v41_star" + echo " The GTF file." + echo "" + echo "Outputs:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "htseq_count_to_h5mu main" + exit + ;; + --input_id) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID="$2" + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id=*) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_counts) + if [ -z "$VIASH_PAR_INPUT_COUNTS" ]; then + VIASH_PAR_INPUT_COUNTS="$2" + else + VIASH_PAR_INPUT_COUNTS="$VIASH_PAR_INPUT_COUNTS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_counts=*) + if [ -z "$VIASH_PAR_INPUT_COUNTS" ]; then + VIASH_PAR_INPUT_COUNTS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_COUNTS="$VIASH_PAR_INPUT_COUNTS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/htseq_count_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_ID+x} ]; then + ViashError '--input_id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_COUNTS+x} ]; then + ViashError '--input_counts' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_COUNTS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT_COUNTS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_COUNTS" ]; then + VIASH_TEST_INPUT_COUNTS=() + IFS=';' + for var in $VIASH_PAR_INPUT_COUNTS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT_COUNTS+=( "$var" ) + done + VIASH_PAR_INPUT_COUNTS=$(IFS=';' ; echo "${VIASH_TEST_INPUT_COUNTS[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-htseq_count_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import tempfile +from pathlib import Path +import tarfile +import gzip +import shutil +import pandas as pd +import mudata as md +import anndata as ad +import polars as pl +import numpy as np +import gtfparse + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'input_counts': $( if [ ! -z ${VIASH_PAR_INPUT_COUNTS+x} ]; then echo "r'${VIASH_PAR_INPUT_COUNTS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +print("> combine counts data", flush=True) +counts_data = [] + +for input_id, input_counts in zip(par["input_id"], par["input_counts"]): + data = pd.read_table( + input_counts, + index_col=0, + names=["gene_ids", input_id], + dtype={"gene_ids": "U", input_id: "i"}, + ).transpose() + counts_data.append(data) + +# combine all counts +counts_and_qc = pd.concat(counts_data, axis=0) + +print("> split qc", flush=True) +idx = counts_and_qc.columns.str.startswith("_") +qc = counts_and_qc.loc[:, idx] +qc.columns = qc.columns.str.replace("^__", "", regex=True) +counts = counts_and_qc.loc[:, ~idx] + +print("> construct var", flush=True) +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + # read_gtf only works on str object, not pathlib.Path + reference = gtfparse.read_gtf(str(par["reference"])) + + +# This is a polars dataframe, not pandas +reference_genes = reference.filter( + (pl.col("feature") == "gene") & (pl.col("gene_id").is_in(list(counts.columns))) +).sort("gene_id") + +var = pd.DataFrame( + data={ + "gene_ids": pd.Index(reference_genes.get_column("gene_id")), + "feature_types": "Gene Expression", + "gene_symbol": reference_genes.get_column("gene_name").to_pandas(), + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData(X=counts, obsm={"qc_htseq": qc}, var=var, dtype=np.int32) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_COUNTS" ]; then + unset VIASH_TEST_INPUT_COUNTS + IFS=';' + for var in $VIASH_PAR_INPUT_COUNTS; do + unset IFS + if [ -z "$VIASH_TEST_INPUT_COUNTS" ]; then + VIASH_TEST_INPUT_COUNTS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT_COUNTS="$VIASH_TEST_INPUT_COUNTS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT_COUNTS="$VIASH_TEST_INPUT_COUNTS" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/htseq_count_to_h5mu/nextflow_labels.config b/target/executable/mapping/htseq_count_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/htseq_count_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/multi_star/.config.vsh.yaml b/target/executable/mapping/multi_star/.config.vsh.yaml new file mode 100644 index 00000000..817f0555 --- /dev/null +++ b/target/executable/mapping/multi_star/.config.vsh.yaml @@ -0,0 +1,2953 @@ +name: "multi_star" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "string" + name: "--input_id" + description: "The ID of the sample being processed. This vector should have the\ + \ same length as the `--input_r1` argument." + info: null + example: + - "mysample" + - "mysample" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r1" + description: "Paths to the sequences to be mapped. If using Illumina paired-end\ + \ reads, only the R1 files should be passed." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L002_R1_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Paths to the sequences to be mapped. If using Illumina paired-end\ + \ reads, only the R2 files should be passed." + info: null + example: + - "mysample_S1_L001_R2_001.fastq.gz" + - "mysample_S1_L002_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference_index" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir argument in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference_gtf" + description: "Path to the gtf reference file." + info: null + example: + - "genes.gtf" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ argument in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Processing arguments" + arguments: + - type: "boolean" + name: "--run_htseq_count" + description: "Whether or not to also run htseq-count after STAR." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--run_multiqc" + description: "Whether or not to also run MultiQC at the end." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_success_rate" + description: "Fail when the success rate is below this threshold." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: + step: "star" + orig_arg: "--runRNGseed" + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: + step: "star" + orig_arg: "--genomeFastaFiles" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: + step: "star" + orig_arg: "--sjdbFileChrStartEnd" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: + step: "star" + orig_arg: "--sjdbGTFfile" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: + step: "star" + orig_arg: "--sjdbGTFchrPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: + step: "star" + orig_arg: "--sjdbGTFfeatureExon" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentTranscript" + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGene" + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGeneName" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGeneType" + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: + step: "star" + orig_arg: "--sjdbOverhang" + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: + step: "star" + orig_arg: "--sjdbScore" + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: + step: "star" + orig_arg: "--sjdbInsertSave" + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: + step: "star" + orig_arg: "--varVCFfile" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: + step: "star" + orig_arg: "--readFilesType" + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: + step: "star" + orig_arg: "--readFilesSAMattrKeep" + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: + step: "star" + orig_arg: "--readFilesManifest" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: + step: "star" + orig_arg: "--readFilesPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: + step: "star" + orig_arg: "--readFilesCommand" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: + step: "star" + orig_arg: "--readMapNumber" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: + step: "star" + orig_arg: "--readMatesLengthsIn" + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: + step: "star" + orig_arg: "--readNameSeparator" + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: + step: "star" + orig_arg: "--readQualityScoreBase" + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: + step: "star" + orig_arg: "--clipAdapterType" + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: + step: "star" + orig_arg: "--clip3pAdapterSeq" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pAdapterMMp" + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pAfterAdapterNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip5pNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: + step: "star" + orig_arg: "--limitGenomeGenerateRAM" + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: + step: "star" + orig_arg: "--limitIObufferSize" + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: + step: "star" + orig_arg: "--limitOutSAMoneReadBytes" + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: + step: "star" + orig_arg: "--limitOutSJoneRead" + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: + step: "star" + orig_arg: "--limitOutSJcollapsed" + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: + step: "star" + orig_arg: "--limitBAMsortRAM" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: + step: "star" + orig_arg: "--limitSjdbInsertNsj" + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: + step: "star" + orig_arg: "--limitNreadsSoft" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: + step: "star" + orig_arg: "--outTmpKeep" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: + step: "star" + orig_arg: "--outStd" + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: + step: "star" + orig_arg: "--outReadsUnmapped" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: + step: "star" + orig_arg: "--outQSconversionAdd" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: + step: "star" + orig_arg: "--outMultimapperOrder" + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: + step: "star" + orig_arg: "--outSAMmode" + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: + step: "star" + orig_arg: "--outSAMstrandField" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: + step: "star" + orig_arg: "--outSAMattributes" + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: + step: "star" + orig_arg: "--outSAMattrIHstart" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: + step: "star" + orig_arg: "--outSAMunmapped" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: + step: "star" + orig_arg: "--outSAMorder" + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: + step: "star" + orig_arg: "--outSAMprimaryFlag" + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: + step: "star" + orig_arg: "--outSAMreadID" + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: + step: "star" + orig_arg: "--outSAMmapqUnique" + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: + step: "star" + orig_arg: "--outSAMflagOR" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: + step: "star" + orig_arg: "--outSAMflagAND" + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: + step: "star" + orig_arg: "--outSAMattrRGline" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: + step: "star" + orig_arg: "--outSAMheaderHD" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: + step: "star" + orig_arg: "--outSAMheaderPG" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: + step: "star" + orig_arg: "--outSAMheaderCommentFile" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: + step: "star" + orig_arg: "--outSAMfilter" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: + step: "star" + orig_arg: "--outSAMmultNmax" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: + step: "star" + orig_arg: "--outSAMtlen" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: + step: "star" + orig_arg: "--outBAMcompression" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: + step: "star" + orig_arg: "--outBAMsortingThreadN" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: + step: "star" + orig_arg: "--outBAMsortingBinsN" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: + step: "star" + orig_arg: "--bamRemoveDuplicatesType" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: + step: "star" + orig_arg: "--bamRemoveDuplicatesMate2basesN" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: + step: "star" + orig_arg: "--outWigType" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: + step: "star" + orig_arg: "--outWigStrand" + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: + step: "star" + orig_arg: "--outWigReferencesPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: + step: "star" + orig_arg: "--outWigNorm" + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: + step: "star" + orig_arg: "--outFilterType" + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: + step: "star" + orig_arg: "--outFilterMultimapScoreRange" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: + step: "star" + orig_arg: "--outFilterMultimapNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNoverLmax" + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNoverReadLmax" + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: + step: "star" + orig_arg: "--outFilterScoreMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: + step: "star" + orig_arg: "--outFilterScoreMinOverLread" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMatchNmin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: + step: "star" + orig_arg: "--outFilterMatchNminOverLread" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: + step: "star" + orig_arg: "--outFilterIntronMotifs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: + step: "star" + orig_arg: "--outFilterIntronStrands" + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: + step: "star" + orig_arg: "--outSJtype" + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: + step: "star" + orig_arg: "--outSJfilterReads" + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterOverhangMin" + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterCountUniqueMin" + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterCountTotalMin" + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterDistToOtherSJmin" + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterIntronMaxVsReadN" + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: + step: "star" + orig_arg: "--scoreGap" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapNoncan" + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapGCAG" + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapATAC" + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: + step: "star" + orig_arg: "--scoreGenomicLengthLog2scale" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: + step: "star" + orig_arg: "--scoreDelOpen" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: + step: "star" + orig_arg: "--scoreDelBase" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: + step: "star" + orig_arg: "--scoreInsOpen" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: + step: "star" + orig_arg: "--scoreInsBase" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: + step: "star" + orig_arg: "--scoreStitchSJshift" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: + step: "star" + orig_arg: "--seedSearchStartLmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: + step: "star" + orig_arg: "--seedSearchStartLmaxOverLread" + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: + step: "star" + orig_arg: "--seedSearchLmax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: + step: "star" + orig_arg: "--seedMultimapNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: + step: "star" + orig_arg: "--seedPerReadNmax" + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: + step: "star" + orig_arg: "--seedPerWindowNmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: + step: "star" + orig_arg: "--seedNoneLociPerWindow" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: + step: "star" + orig_arg: "--seedSplitMin" + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: + step: "star" + orig_arg: "--seedMapMin" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: + step: "star" + orig_arg: "--alignIntronMin" + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: + step: "star" + orig_arg: "--alignIntronMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: + step: "star" + orig_arg: "--alignMatesGapMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: + step: "star" + orig_arg: "--alignSJoverhangMin" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: + step: "star" + orig_arg: "--alignSJstitchMismatchNmax" + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: + step: "star" + orig_arg: "--alignSJDBoverhangMin" + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: + step: "star" + orig_arg: "--alignSplicedMateMapLmin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: + step: "star" + orig_arg: "--alignSplicedMateMapLminOverLmate" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: + step: "star" + orig_arg: "--alignWindowsPerReadNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: + step: "star" + orig_arg: "--alignTranscriptsPerWindowNmax" + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: + step: "star" + orig_arg: "--alignTranscriptsPerReadNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: + step: "star" + orig_arg: "--alignEndsType" + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: + step: "star" + orig_arg: "--alignEndsProtrude" + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: + step: "star" + orig_arg: "--alignSoftClipAtReferenceEnds" + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: + step: "star" + orig_arg: "--alignInsertionFlush" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: + step: "star" + orig_arg: "--peOverlapNbasesMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: + step: "star" + orig_arg: "--peOverlapMMp" + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: + step: "star" + orig_arg: "--winAnchorMultimapNmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: + step: "star" + orig_arg: "--winBinNbits" + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: + step: "star" + orig_arg: "--winAnchorDistNbins" + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: + step: "star" + orig_arg: "--winFlankNbins" + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: + step: "star" + orig_arg: "--winReadCoverageRelativeMin" + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: + step: "star" + orig_arg: "--winReadCoverageBasesMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: + step: "star" + orig_arg: "--chimOutType" + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: + step: "star" + orig_arg: "--chimSegmentMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: + step: "star" + orig_arg: "--chimScoreMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: + step: "star" + orig_arg: "--chimScoreDropMax" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: + step: "star" + orig_arg: "--chimScoreSeparation" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: + step: "star" + orig_arg: "--chimScoreJunctionNonGTAG" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: + step: "star" + orig_arg: "--chimJunctionOverhangMin" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: + step: "star" + orig_arg: "--chimSegmentReadGapMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: + step: "star" + orig_arg: "--chimFilter" + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: + step: "star" + orig_arg: "--chimMainSegmentMultNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: + step: "star" + orig_arg: "--chimMultimapNmax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: + step: "star" + orig_arg: "--chimMultimapScoreRange" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: + step: "star" + orig_arg: "--chimNonchimScoreDropMin" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: + step: "star" + orig_arg: "--chimOutJunctionFormat" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: + step: "star" + orig_arg: "--quantMode" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: + step: "star" + orig_arg: "--quantTranscriptomeBAMcompression" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: + step: "star" + orig_arg: "--quantTranscriptomeBan" + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: + step: "star" + orig_arg: "--twopassMode" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: + step: "star" + orig_arg: "--twopass1readsN" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: + step: "star" + orig_arg: "--waspOutputMode" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: + step: "star" + orig_arg: "--soloType" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: + step: "star" + orig_arg: "--soloCBwhitelist" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: + step: "star" + orig_arg: "--soloCBstart" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: + step: "star" + orig_arg: "--soloCBlen" + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: + step: "star" + orig_arg: "--soloUMIstart" + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: + step: "star" + orig_arg: "--soloUMIlen" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: + step: "star" + orig_arg: "--soloBarcodeReadLength" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: + step: "star" + orig_arg: "--soloBarcodeMate" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: + step: "star" + orig_arg: "--soloCBposition" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: + step: "star" + orig_arg: "--soloUMIposition" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: + step: "star" + orig_arg: "--soloAdapterSequence" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: + step: "star" + orig_arg: "--soloAdapterMismatchesNmax" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: + step: "star" + orig_arg: "--soloCBmatchWLtype" + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: + step: "star" + orig_arg: "--soloInputSAMattrBarcodeSeq" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: + step: "star" + orig_arg: "--soloInputSAMattrBarcodeQual" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: + step: "star" + orig_arg: "--soloStrand" + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: + step: "star" + orig_arg: "--soloFeatures" + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: + step: "star" + orig_arg: "--soloMultiMappers" + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: + step: "star" + orig_arg: "--soloUMIdedup" + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: + step: "star" + orig_arg: "--soloUMIfiltering" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: + step: "star" + orig_arg: "--soloOutFileNames" + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: + step: "star" + orig_arg: "--soloCellFilter" + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: + step: "star" + orig_arg: "--soloOutFormatFeaturesGeneField3" + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: + step: "star" + orig_arg: "--soloCellReadStats" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HTSeq arguments" + arguments: + - type: "string" + name: "--stranded" + alternatives: + - "-s" + description: "Whether the data is from a strand-specific assay. 'reverse' means\ + \ 'yes' with reversed strand interpretation." + info: + step: "htseq" + orig_arg: "--stranded" + default: + - "yes" + required: false + choices: + - "yes" + - "no" + - "reverse" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--minimum_alignment_quality" + alternatives: + - "-a" + - "--minaqual" + description: "Skip all reads with MAPQ alignment quality lower than the given\ + \ minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends\ + \ on the software \nused to map the reads.\n" + info: + step: "htseq" + orig_arg: "--minaqual" + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--type" + alternatives: + - "-t" + description: "Feature type (3rd column in GTF file) to be used, all features of\ + \ other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + step: "htseq" + orig_arg: "--type" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_attribute" + alternatives: + - "-i" + description: "GTF attribute to be used as feature ID (default, suitable for Ensembl\ + \ GTF files: gene_id).\nAll feature of the right type (see -t option) within\ + \ the same GTF attribute will be added\ntogether. The typical way of using this\ + \ option is to count all exonic reads from each gene\nand add the exons but\ + \ other uses are possible as well. You can call this option multiple\ntimes:\ + \ in that case, the combination of all attributes separated by colons (:) will\ + \ be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i\ + \ exon_number.\n" + info: + step: "htseq" + orig_arg: "--idattr" + example: + - "gene_id" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--additional_attributes" + description: "Additional feature attributes (suitable for Ensembl GTF files: gene_name).\ + \ Use multiple times\nfor more than one additional attribute. These attributes\ + \ are only used as annotations in the\noutput, while the determination of how\ + \ the counts are added together is done based on option -i.\n" + info: + step: "htseq" + orig_arg: "--additional-attr" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--add_chromosome_info" + description: "Store information about the chromosome of each feature as an additional\ + \ attribute\n(e.g. colunm in the TSV output file).\n" + info: + step: "htseq" + orig_arg: "--add-chromosome-info" + direction: "input" + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "Mode to handle reads overlapping more than one feature." + info: + step: "htseq" + orig_arg: "--mode" + default: + - "union" + required: false + choices: + - "union" + - "intersection-strict" + - "intersection-nonempty" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_unique" + description: "Whether and how to score reads that are not uniquely aligned or\ + \ ambiguously assigned to features." + info: + step: "htseq" + orig_arg: "--nonunique" + default: + - "none" + required: false + choices: + - "none" + - "all" + - "fraction" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--secondary_alignments" + description: "Whether to score secondary alignments (0x100 flag)." + info: + step: "htseq" + orig_arg: "--secondary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--supplementary_alignments" + description: "Whether to score supplementary alignments (0x800 flag)." + info: + step: "htseq" + orig_arg: "--supplementary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--counts_output_sparse" + description: "Store the counts as a sparse matrix (mtx, h5ad, loom)." + info: + step: "htseq" + orig_arg: "--counts-output-sparse" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + - type: "apt" + packages: + - "samtools" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + - "HTSeq" + - "multiprocess" + - "gtfparse" + - "pandas" + - "numpy<2" + - "multiqc~=1.15.0" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "pytest" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/multi_star/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/multi_star" + executable: "target/executable/mapping/multi_star/multi_star" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/multi_star/multi_star b/target/executable/mapping/multi_star/multi_star new file mode 100755 index 00000000..730bfbc4 --- /dev/null +++ b/target/executable/mapping/multi_star/multi_star @@ -0,0 +1,6595 @@ +#!/usr/bin/env bash + +# multi_star main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="multi_star" +VIASH_META_FUNCTIONALITY_NAME="multi_star" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +ENV STAR_VERSION 2.7.10b +ENV PACKAGES gcc g++ make wget zlib1g-dev unzip +RUN apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y samtools procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pyyaml" "HTSeq" "multiprocess" "gtfparse" "pandas" "numpy<2" "multiqc~=1.15.0" + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component mapping multi_star" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "multi_star main" + echo "" + echo "Align fastq files using STAR." + echo "" + echo "Input/Output:" + echo " --input_id" + echo " type: string, required parameter, multiple values allowed" + echo " example: mysample;mysample" + echo " The ID of the sample being processed. This vector should have the same" + echo " length as the \`--input_r1\` argument." + echo "" + echo " --input_r1" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L002_R1_001.fastq.gz" + echo " Paths to the sequences to be mapped. If using Illumina paired-end reads," + echo " only the R1 files should be passed." + echo "" + echo " --input_r2" + echo " type: file, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R2_001.fastq.gz;mysample_S1_L002_R2_001.fastq.gz" + echo " Paths to the sequences to be mapped. If using Illumina paired-end reads," + echo " only the R2 files should be passed." + echo "" + echo " --genomeDir, --reference_index" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/reference" + echo " Path to the reference built by star_build_reference. Corresponds to the" + echo " --genomeDir argument in the STAR command." + echo "" + echo " --reference_gtf" + echo " type: file, required parameter, file must exist" + echo " example: genes.gtf" + echo " Path to the gtf reference file." + echo "" + echo " --outFileNamePrefix, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/foo" + echo " Path to output directory. Corresponds to the --outFileNamePrefix" + echo " argument in the STAR command." + echo "" + echo "Processing arguments:" + echo " --run_htseq_count" + echo " type: boolean" + echo " default: true" + echo " Whether or not to also run htseq-count after STAR." + echo "" + echo " --run_multiqc" + echo " type: boolean" + echo " default: true" + echo " Whether or not to also run MultiQC at the end." + echo "" + echo " --min_success_rate" + echo " type: double" + echo " default: 0.5" + echo " Fail when the success rate is below this threshold." + echo "" + echo "Run Parameters:" + echo " --runRNGseed" + echo " type: integer" + echo " example: 777" + echo " random number generator seed." + echo "" + echo "Genome Parameters:" + echo " --genomeFastaFiles" + echo " type: file, multiple values allowed, file must exist" + echo " path(s) to the fasta files with the genome sequences, separated by" + echo " spaces. These files should be plain text FASTA files, they *cannot* be" + echo " zipped." + echo " Required for the genome generation (--runMode genomeGenerate). Can also" + echo " be used in the mapping (--runMode alignReads) to add extra (new)" + echo " sequences to the genome (e.g. spike-ins)." + echo "" + echo "Splice Junctions Database:" + echo " --sjdbFileChrStartEnd" + echo " type: string, multiple values allowed" + echo " path to the files with genomic coordinates (chr start end" + echo " strand) for the splice junction introns. Multiple files can be" + echo " supplied and will be concatenated." + echo "" + echo " --sjdbGTFfile" + echo " type: file, file must exist" + echo " path to the GTF file with annotations" + echo "" + echo " --sjdbGTFchrPrefix" + echo " type: string" + echo " prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL" + echo " annotations with UCSC genomes)" + echo "" + echo " --sjdbGTFfeatureExon" + echo " type: string" + echo " example: exon" + echo " feature type in GTF file to be used as exons for building transcripts" + echo "" + echo " --sjdbGTFtagExonParentTranscript" + echo " type: string" + echo " example: transcript_id" + echo " GTF attribute name for parent transcript ID (default \"transcript_id\"" + echo " works for GTF files)" + echo "" + echo " --sjdbGTFtagExonParentGene" + echo " type: string" + echo " example: gene_id" + echo " GTF attribute name for parent gene ID (default \"gene_id\" works for GTF" + echo " files)" + echo "" + echo " --sjdbGTFtagExonParentGeneName" + echo " type: string, multiple values allowed" + echo " example: gene_name" + echo " GTF attribute name for parent gene name" + echo "" + echo " --sjdbGTFtagExonParentGeneType" + echo " type: string, multiple values allowed" + echo " example: gene_type;gene_biotype" + echo " GTF attribute name for parent gene type" + echo "" + echo " --sjdbOverhang" + echo " type: integer" + echo " example: 100" + echo " length of the donor/acceptor sequence on each side of the junctions," + echo " ideally = (mate_length - 1)" + echo "" + echo " --sjdbScore" + echo " type: integer" + echo " example: 2" + echo " extra alignment score for alignments that cross database junctions" + echo "" + echo " --sjdbInsertSave" + echo " type: string" + echo " example: Basic" + echo " which files to save when sjdb junctions are inserted on the fly at the" + echo " mapping step" + echo " - Basic ... only small junction / transcript files" + echo " - All ... all files including big Genome, SA and SAindex - this will" + echo " create a complete genome directory" + echo "" + echo "Variation parameters:" + echo " --varVCFfile" + echo " type: string" + echo " path to the VCF file that contains variation data. The 10th column" + echo " should contain the genotype information, e.g. 0/1" + echo "" + echo "Read Parameters:" + echo " --readFilesType" + echo " type: string" + echo " example: Fastx" + echo " format of input read files" + echo " - Fastx ... FASTA or FASTQ" + echo " - SAM SE ... SAM or BAM single-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo " - SAM PE ... SAM or BAM paired-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo "" + echo " --readFilesSAMattrKeep" + echo " type: string, multiple values allowed" + echo " example: All" + echo " for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM," + echo " e.g.: --readFilesSAMtagsKeep RG PL" + echo " - All ... keep all tags" + echo " - None ... do not keep any tags" + echo "" + echo " --readFilesManifest" + echo " type: file, file must exist" + echo " path to the \"manifest\" file with the names of read files. The manifest" + echo " file should contain 3 tab-separated columns:" + echo " paired-end reads: read1_file_name \$tab\$ read2_file_name \$tab\$" + echo " read_group_line." + echo " single-end reads: read1_file_name \$tab\$ - \$tab\$" + echo " read_group_line." + echo " Spaces, but not tabs are allowed in file names." + echo " If read_group_line does not start with ID:, it can only contain one ID" + echo " field, and ID: will be added to it." + echo " If read_group_line starts with ID:, it can contain several fields" + echo " separated by \$tab\$, and all fields will be be copied verbatim into SAM" + echo " @RG header line." + echo "" + echo " --readFilesPrefix" + echo " type: string" + echo " prefix for the read files names, i.e. it will be added in front of the" + echo " strings in --readFilesIn" + echo "" + echo " --readFilesCommand" + echo " type: string, multiple values allowed" + echo " command line to execute for each of the input file. This command should" + echo " generate FASTA or FASTQ text and send it to stdout" + echo " For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2" + echo " files, etc." + echo "" + echo " --readMapNumber" + echo " type: integer" + echo " example: -1" + echo " number of reads to map from the beginning of the file" + echo " -1: map all reads" + echo "" + echo " --readMatesLengthsIn" + echo " type: string" + echo " example: NotEqual" + echo " Equal/NotEqual - lengths of names,sequences,qualities for both mates are" + echo " the same / not the same. NotEqual is safe in all situations." + echo "" + echo " --readNameSeparator" + echo " type: string, multiple values allowed" + echo " example: /" + echo " character(s) separating the part of the read names that will be trimmed" + echo " in output (read name after space is always trimmed)" + echo "" + echo " --readQualityScoreBase" + echo " type: integer" + echo " example: 33" + echo " number to be subtracted from the ASCII code to get Phred quality score" + echo "" + echo "Read Clipping:" + echo " --clipAdapterType" + echo " type: string" + echo " example: Hamming" + echo " adapter clipping type" + echo " - Hamming ... adapter clipping based on Hamming distance, with the" + echo " number of mismatches controlled by --clip5pAdapterMMp" + echo " - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4." + echo " Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal" + echo " - None ... no adapter clipping, all other clip* parameters are" + echo " disregarded" + echo "" + echo " --clip3pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo " --clip3pAdapterSeq" + echo " type: string, multiple values allowed" + echo " adapter sequences to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo " - polyA ... polyA sequence with the length equal to read length" + echo "" + echo " --clip3pAdapterMMp" + echo " type: double, multiple values allowed" + echo " example: 0.1" + echo " max proportion of mismatches for 3p adapter clipping for each mate. If" + echo " one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip3pAfterAdapterNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number of bases to clip from 3p of each mate after the adapter clipping." + echo " If one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip5pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 5p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo "Limits:" + echo " --limitGenomeGenerateRAM" + echo " type: long" + echo " example: 31000000000" + echo " maximum available RAM (bytes) for genome generation" + echo "" + echo " --limitIObufferSize" + echo " type: long, multiple values allowed" + echo " example: 30000000;50000000" + echo " max available buffers size (bytes) for input/output, per thread" + echo "" + echo " --limitOutSAMoneReadBytes" + echo " type: long" + echo " example: 100000" + echo " max size of the SAM record (bytes) for one read. Recommended value:" + echo " >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + echo "" + echo " --limitOutSJoneRead" + echo " type: integer" + echo " example: 1000" + echo " max number of junctions for one read (including all multi-mappers)" + echo "" + echo " --limitOutSJcollapsed" + echo " type: integer" + echo " example: 1000000" + echo " max number of collapsed junctions" + echo "" + echo " --limitBAMsortRAM" + echo " type: long" + echo " example: 0" + echo " maximum available RAM (bytes) for sorting BAM. If =0, it will be set to" + echo " the genome index size. 0 value can only be used with --genomeLoad" + echo " NoSharedMemory option." + echo "" + echo " --limitSjdbInsertNsj" + echo " type: integer" + echo " example: 1000000" + echo " maximum number of junctions to be inserted to the genome on the fly at" + echo " the mapping stage, including those from annotations and those detected" + echo " in the 1st step of the 2-pass run" + echo "" + echo " --limitNreadsSoft" + echo " type: integer" + echo " example: -1" + echo " soft limit on the number of reads" + echo "" + echo "Output: general:" + echo " --outTmpKeep" + echo " type: string" + echo " whether to keep the temporary files after STAR runs is finished" + echo " - None ... remove all temporary files" + echo " - All ... keep all files" + echo "" + echo " --outStd" + echo " type: string" + echo " example: Log" + echo " which output will be directed to stdout (standard out)" + echo " - Log ... log messages" + echo " - SAM ... alignments in SAM format (which normally" + echo " are output to Aligned.out.sam file), normal standard output will go into" + echo " Log.std.out" + echo " - BAM_Unsorted ... alignments in BAM format, unsorted." + echo " Requires --outSAMtype BAM Unsorted" + echo " - BAM_SortedByCoordinate ... alignments in BAM format, sorted by" + echo " coordinate. Requires --outSAMtype BAM SortedByCoordinate" + echo " - BAM_Quant ... alignments to transcriptome in BAM format," + echo " unsorted. Requires --quantMode TranscriptomeSAM" + echo "" + echo " --outReadsUnmapped" + echo " type: string" + echo " output of unmapped and partially mapped (i.e. mapped only one mate of a" + echo " paired end read) reads in separate file(s)." + echo " - None ... no output" + echo " - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + echo "" + echo " --outQSconversionAdd" + echo " type: integer" + echo " example: 0" + echo " add this number to the quality score (e.g. to convert from Illumina to" + echo " Sanger, use -31)" + echo "" + echo " --outMultimapperOrder" + echo " type: string" + echo " example: Old_2.4" + echo " order of multimapping alignments in the output files" + echo " - Old_2.4 ... quasi-random order used before 2.5.0" + echo " - Random ... random order of alignments for each" + echo " multi-mapper. Read mates (pairs) are always adjacent, all alignment for" + echo " each read stay together. This option will become default in the future" + echo " releases." + echo "" + echo "Output: SAM and BAM:" + echo " --outSAMmode" + echo " type: string" + echo " example: Full" + echo " mode of SAM output" + echo " - None ... no SAM output" + echo " - Full ... full SAM output" + echo " - NoQS ... full SAM but without quality scores" + echo "" + echo " --outSAMstrandField" + echo " type: string" + echo " Cufflinks-like strand field flag" + echo " - None ... not used" + echo " - intronMotif ... strand derived from the intron motif. This option" + echo " changes the output alignments: reads with inconsistent and/or" + echo " non-canonical introns are filtered out." + echo "" + echo " --outSAMattributes" + echo " type: string, multiple values allowed" + echo " example: Standard" + echo " a string of desired SAM attributes, in the order desired for the output" + echo " SAM. Tags can be listed in any combination/order." + echo " ***Presets:" + echo " - None ... no attributes" + echo " - Standard ... NH HI AS nM" + echo " - All ... NH HI AS nM NM MD jM jI MC ch" + echo " ***Alignment:" + echo " - NH ... number of loci the reads maps to: =1 for unique" + echo " mappers, >1 for multimappers. Standard SAM tag." + echo " - HI ... multiple alignment index, starts with" + echo " --outSAMattrIHstart (=1 by default). Standard SAM tag." + echo " - AS ... local alignment score, +1/-1 for matches/mismateches," + echo " score* penalties for indels and gaps. For PE reads, total score for two" + echo " mates. Stadnard SAM tag." + echo " - nM ... number of mismatches. For PE reads, sum over two" + echo " mates." + echo " - NM ... edit distance to the reference (number of mismatched +" + echo " inserted + deleted bases) for each mate. Standard SAM tag." + echo " - MD ... string encoding mismatched and deleted reference bases" + echo " (see standard SAM specifications). Standard SAM tag." + echo " - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0:" + echo " non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6:" + echo " GT/AT. If splice junctions database is used, and a junction is" + echo " annotated, 20 is added to its motif value." + echo " - jI ... start and end of introns for all junctions (1-based)." + echo " - XS ... alignment strand according to --outSAMstrandField." + echo " - MC ... mate's CIGAR string. Standard SAM tag." + echo " - ch ... marks all segment of all chimeric alingments for" + echo " --chimOutType WithinBAM output." + echo " - cN ... number of bases clipped from the read ends: 5' and 3'" + echo " ***Variation:" + echo " - vA ... variant allele" + echo " - vG ... genomic coordinate of the variant overlapped by the" + echo " read." + echo " - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 -" + echo " alignment does not pass WASP filtering. Requires --waspOutputMode" + echo " SAMtag." + echo " ***STARsolo:" + echo " - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs" + echo " for the solo* demultiplexing." + echo " - GX GN ... gene ID and gene name for unique-gene reads." + echo " - gx gn ... gene IDs and gene names for unique- and multi-gene" + echo " reads." + echo " - CB UB ... error-corrected cell barcodes and UMIs for solo*" + echo " demultiplexing. Requires --outSAMtype BAM SortedByCoordinate." + echo " - sM ... assessment of CB and UMI." + echo " - sS ... sequence of the entire barcode (CB,UMI,adapter)." + echo " - sQ ... quality of the entire barcode." + echo " ***Unsupported/undocumented:" + echo " - ha ... haplotype (1/2) when mapping to the diploid genome." + echo " Requires genome generated with --genomeTransformType Diploid ." + echo " - rB ... alignment block read/genomic coordinates." + echo " - vR ... read coordinate of the variant." + echo "" + echo " --outSAMattrIHstart" + echo " type: integer" + echo " example: 1" + echo " start value for the IH attribute. 0 may be required by some downstream" + echo " software, such as Cufflinks or StringTie." + echo "" + echo " --outSAMunmapped" + echo " type: string, multiple values allowed" + echo " output of unmapped reads in the SAM format" + echo " 1st word:" + echo " - None ... no output" + echo " - Within ... output unmapped reads within the main SAM file (i.e." + echo " Aligned.out.sam)" + echo " 2nd word:" + echo " - KeepPairs ... record unmapped mate for each alignment, and, in case of" + echo " unsorted output, keep it adjacent to its mapped mate. Only affects" + echo " multi-mapping reads." + echo "" + echo " --outSAMorder" + echo " type: string" + echo " example: Paired" + echo " type of sorting for the SAM output" + echo " Paired: one mate after the other for all paired alignments" + echo " PairedKeepInputOrder: one mate after the other for all paired" + echo " alignments, the order is kept the same as in the input FASTQ files" + echo "" + echo " --outSAMprimaryFlag" + echo " type: string" + echo " example: OneBestScore" + echo " which alignments are considered primary - all others will be marked with" + echo " 0x100 bit in the FLAG" + echo " - OneBestScore ... only one alignment with the best score is primary" + echo " - AllBestScore ... all alignments with the best score are primary" + echo "" + echo " --outSAMreadID" + echo " type: string" + echo " example: Standard" + echo " read ID record type" + echo " - Standard ... first word (until space) from the FASTx read ID line," + echo " removing /1,/2 from the end" + echo " - Number ... read number (index) in the FASTx file" + echo "" + echo " --outSAMmapqUnique" + echo " type: integer" + echo " example: 255" + echo " 0 to 255: the MAPQ value for unique mappers" + echo "" + echo " --outSAMflagOR" + echo " type: integer" + echo " example: 0" + echo " 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e." + echo " FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, and after outSAMflagAND. Can be used to set specific bits that" + echo " are not set otherwise." + echo "" + echo " --outSAMflagAND" + echo " type: integer" + echo " example: 65535" + echo " 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e." + echo " FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, but before outSAMflagOR. Can be used to unset specific bits" + echo " that are not set otherwise." + echo "" + echo " --outSAMattrRGline" + echo " type: string, multiple values allowed" + echo " SAM/BAM read group line. The first word contains the read group" + echo " identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx" + echo " CN:yy \"DS:z z z\"." + echo " xxx will be added as RG tag to each output alignment. Any spaces in the" + echo " tag values have to be double quoted." + echo " Comma separated RG lines correspons to different (comma separated) input" + echo " files in --readFilesIn. Commas have to be surrounded by spaces, e.g." + echo " --outSAMattrRGline ID:xxx , ID:zzz \"DS:z z\" , ID:yyy DS:yyyy" + echo "" + echo " --outSAMheaderHD" + echo " type: string, multiple values allowed" + echo " @HD (header) line of the SAM header" + echo "" + echo " --outSAMheaderPG" + echo " type: string, multiple values allowed" + echo " extra @PG (software) line of the SAM header (in addition to STAR)" + echo "" + echo " --outSAMheaderCommentFile" + echo " type: string" + echo " path to the file with @CO (comment) lines of the SAM header" + echo "" + echo " --outSAMfilter" + echo " type: string, multiple values allowed" + echo " filter the output into main SAM/BAM files" + echo " - KeepOnlyAddedReferences ... only keep the reads for which all" + echo " alignments are to the extra reference sequences added with" + echo " --genomeFastaFiles at the mapping stage." + echo " - KeepAllAddedReferences ... keep all alignments to the extra reference" + echo " sequences added with --genomeFastaFiles at the mapping stage." + echo "" + echo " --outSAMmultNmax" + echo " type: integer" + echo " example: -1" + echo " max number of multiple alignments for a read that will be output to the" + echo " SAM/BAM files. Note that if this value is not equal to -1, the top" + echo " scoring alignment will be output first" + echo " - -1 ... all alignments (up to --outFilterMultimapNmax) will be output" + echo "" + echo " --outSAMtlen" + echo " type: integer" + echo " example: 1" + echo " calculation method for the TLEN field in the SAM/BAM files" + echo " - 1 ... leftmost base of the (+)strand mate to rightmost base of the" + echo " (-)mate. (+)sign for the (+)strand mate" + echo " - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign" + echo " for the mate with the leftmost base. This is different from 1 for" + echo " overlapping mates with protruding ends" + echo "" + echo " --outBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -1 to 10 BAM compression level, -1=default compression (6?), 0=no" + echo " compression, 10=maximum compression" + echo "" + echo " --outBAMsortingThreadN" + echo " type: integer" + echo " example: 0" + echo " >=0: number of threads for BAM sorting. 0 will default to" + echo " min(6,--runThreadN)." + echo "" + echo " --outBAMsortingBinsN" + echo " type: integer" + echo " example: 50" + echo " >0: number of genome bins for coordinate-sorting" + echo "" + echo "BAM processing:" + echo " --bamRemoveDuplicatesType" + echo " type: string" + echo " mark duplicates in the BAM file, for now only works with (i) sorted BAM" + echo " fed with inputBAMfile, and (ii) for paired-end alignments only" + echo " - - ... no duplicate removal/marking" + echo " - UniqueIdentical ... mark all multimappers, and duplicate" + echo " unique mappers. The coordinates, FLAG, CIGAR must be identical" + echo " - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not" + echo " multimappers." + echo "" + echo " --bamRemoveDuplicatesMate2basesN" + echo " type: integer" + echo " example: 0" + echo " number of bases from the 5' of mate 2 to use in collapsing (e.g. for" + echo " RAMPAGE)" + echo "" + echo "Output Wiggle:" + echo " --outWigType" + echo " type: string, multiple values allowed" + echo " type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\". Requires" + echo " sorted BAM: --outSAMtype BAM SortedByCoordinate ." + echo " 1st word:" + echo " - None ... no signal output" + echo " - bedGraph ... bedGraph format" + echo " - wiggle ... wiggle format" + echo " 2nd word:" + echo " - read1_5p ... signal from only 5' of the 1st read, useful for" + echo " CAGE/RAMPAGE etc" + echo " - read2 ... signal from only 2nd read" + echo "" + echo " --outWigStrand" + echo " type: string" + echo " example: Stranded" + echo " strandedness of wiggle/bedGraph output" + echo " - Stranded ... separate strands, str1 and str2" + echo " - Unstranded ... collapsed strands" + echo "" + echo " --outWigReferencesPrefix" + echo " type: string" + echo " prefix matching reference names to include in the output wiggle file," + echo " e.g. \"chr\", default \"-\" - include all references" + echo "" + echo " --outWigNorm" + echo " type: string" + echo " example: RPM" + echo " type of normalization for the signal" + echo " - RPM ... reads per million of mapped reads" + echo " - None ... no normalization, \"raw\" counts" + echo "" + echo "Output Filtering:" + echo " --outFilterType" + echo " type: string" + echo " example: Normal" + echo " type of filtering" + echo " - Normal ... standard filtering using only current alignment" + echo " - BySJout ... keep only those reads that contain junctions that passed" + echo " filtering into SJ.out.tab" + echo "" + echo " --outFilterMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range below the maximum score for multimapping alignments" + echo "" + echo " --outFilterMultimapNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of loci the read is allowed to map to. Alignments (all of" + echo " them) will be output only if the read maps to no more loci than this" + echo " value." + echo " Otherwise no alignments will be output, and the read will be counted as" + echo " \"mapped to too many loci\" in the Log.final.out ." + echo "" + echo " --outFilterMismatchNmax" + echo " type: integer" + echo " example: 10" + echo " alignment will be output only if it has no more mismatches than this" + echo " value." + echo "" + echo " --outFilterMismatchNoverLmax" + echo " type: double" + echo " example: 0.3" + echo " alignment will be output only if its ratio of mismatches to *mapped*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterMismatchNoverReadLmax" + echo " type: double" + echo " example: 1.0" + echo " alignment will be output only if its ratio of mismatches to *read*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterScoreMin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if its score is higher than or equal to" + echo " this value." + echo "" + echo " --outFilterScoreMinOverLread" + echo " type: double" + echo " example: 0.66" + echo " same as outFilterScoreMin, but normalized to read length (sum of mates'" + echo " lengths for paired-end reads)" + echo "" + echo " --outFilterMatchNmin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if the number of matched bases is higher" + echo " than or equal to this value." + echo "" + echo " --outFilterMatchNminOverLread" + echo " type: double" + echo " example: 0.66" + echo " sam as outFilterMatchNmin, but normalized to the read length (sum of" + echo " mates' lengths for paired-end reads)." + echo "" + echo " --outFilterIntronMotifs" + echo " type: string" + echo " filter alignment using their motifs" + echo " - None ... no filtering" + echo " - RemoveNoncanonical ... filter out alignments that contain" + echo " non-canonical junctions" + echo " - RemoveNoncanonicalUnannotated ... filter out alignments that contain" + echo " non-canonical unannotated junctions when using annotated splice" + echo " junctions database. The annotated non-canonical junctions will be kept." + echo "" + echo " --outFilterIntronStrands" + echo " type: string" + echo " example: RemoveInconsistentStrands" + echo " filter alignments" + echo " - RemoveInconsistentStrands ... remove alignments that have" + echo " junctions with inconsistent strands" + echo " - None ... no filtering" + echo "" + echo "Output splice junctions (SJ.out.tab):" + echo " --outSJtype" + echo " type: string" + echo " example: Standard" + echo " type of splice junction output" + echo " - Standard ... standard SJ.out.tab output" + echo " - None ... no splice junction output" + echo "" + echo "Output Filtering: Splice Junctions:" + echo " --outSJfilterReads" + echo " type: string" + echo " example: All" + echo " which reads to consider for collapsed splice junctions output" + echo " - All ... all reads, unique- and multi-mappers" + echo " - Unique ... uniquely mapping reads only" + echo "" + echo " --outSJfilterOverhangMin" + echo " type: integer, multiple values allowed" + echo " example: 30;12;12;12" + echo " minimum overhang length for splice junctions on both sides for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountUniqueMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum uniquely mapping read count per junction for: (1) non-canonical" + echo " motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC" + echo " and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountTotalMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum total (multi-mapping+unique) read count per junction for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterDistToOtherSJmin" + echo " type: integer, multiple values allowed" + echo " example: 10;0;5;10" + echo " minimum allowed distance to other junctions' donor/acceptor" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterIntronMaxVsReadN" + echo " type: integer, multiple values allowed" + echo " example: 50000;100000;200000" + echo " maximum gap allowed for junctions supported by 1,2,3,,,N reads" + echo " i.e. by default junctions supported by 1 read can have gaps <=50000b, by" + echo " 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap" + echo " <=alignIntronMax" + echo " does not apply to annotated junctions" + echo "" + echo "Scoring:" + echo " --scoreGap" + echo " type: integer" + echo " example: 0" + echo " splice junction penalty (independent on intron motif)" + echo "" + echo " --scoreGapNoncan" + echo " type: integer" + echo " example: -8" + echo " non-canonical junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapGCAG" + echo " type: integer" + echo " example: -4" + echo " GC/AG and CT/GC junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapATAC" + echo " type: integer" + echo " example: -8" + echo " AT/AC and GT/AT junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGenomicLengthLog2scale" + echo " type: integer" + echo " example: 0" + echo " extra score logarithmically scaled with genomic length of the alignment:" + echo " scoreGenomicLengthLog2scale*log2(genomicLength)" + echo "" + echo " --scoreDelOpen" + echo " type: integer" + echo " example: -2" + echo " deletion open penalty" + echo "" + echo " --scoreDelBase" + echo " type: integer" + echo " example: -2" + echo " deletion extension penalty per base (in addition to scoreDelOpen)" + echo "" + echo " --scoreInsOpen" + echo " type: integer" + echo " example: -2" + echo " insertion open penalty" + echo "" + echo " --scoreInsBase" + echo " type: integer" + echo " example: -2" + echo " insertion extension penalty per base (in addition to scoreInsOpen)" + echo "" + echo " --scoreStitchSJshift" + echo " type: integer" + echo " example: 1" + echo " maximum score reduction while searching for SJ boundaries in the" + echo " stitching step" + echo "" + echo "Alignments and Seeding:" + echo " --seedSearchStartLmax" + echo " type: integer" + echo " example: 50" + echo " defines the search start point through the read - the read is split into" + echo " pieces no longer than this value" + echo "" + echo " --seedSearchStartLmaxOverLread" + echo " type: double" + echo " example: 1.0" + echo " seedSearchStartLmax normalized to read length (sum of mates' lengths for" + echo " paired-end reads)" + echo "" + echo " --seedSearchLmax" + echo " type: integer" + echo " example: 0" + echo " defines the maximum length of the seeds, if =0 seed length is not" + echo " limited" + echo "" + echo " --seedMultimapNmax" + echo " type: integer" + echo " example: 10000" + echo " only pieces that map fewer than this value are utilized in the stitching" + echo " procedure" + echo "" + echo " --seedPerReadNmax" + echo " type: integer" + echo " example: 1000" + echo " max number of seeds per read" + echo "" + echo " --seedPerWindowNmax" + echo " type: integer" + echo " example: 50" + echo " max number of seeds per window" + echo "" + echo " --seedNoneLociPerWindow" + echo " type: integer" + echo " example: 10" + echo " max number of one seed loci per window" + echo "" + echo " --seedSplitMin" + echo " type: integer" + echo " example: 12" + echo " min length of the seed sequences split by Ns or mate gap" + echo "" + echo " --seedMapMin" + echo " type: integer" + echo " example: 5" + echo " min length of seeds to be mapped" + echo "" + echo " --alignIntronMin" + echo " type: integer" + echo " example: 21" + echo " minimum intron size, genomic gap is considered intron if its" + echo " length>=alignIntronMin, otherwise it is considered Deletion" + echo "" + echo " --alignIntronMax" + echo " type: integer" + echo " example: 0" + echo " maximum intron size, if 0, max intron size will be determined by" + echo " (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignMatesGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap between two mates, if 0, max intron gap will be determined" + echo " by (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignSJoverhangMin" + echo " type: integer" + echo " example: 5" + echo " minimum overhang (i.e. block size) for spliced alignments" + echo "" + echo " --alignSJstitchMismatchNmax" + echo " type: integer, multiple values allowed" + echo " example: 0;-1;0;0" + echo " maximum number of mismatches for stitching of the splice junctions (-1:" + echo " no limit)." + echo " (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif." + echo "" + echo " --alignSJDBoverhangMin" + echo " type: integer" + echo " example: 3" + echo " minimum overhang (i.e. block size) for annotated (sjdb) spliced" + echo " alignments" + echo "" + echo " --alignSplicedMateMapLmin" + echo " type: integer" + echo " example: 0" + echo " minimum mapped length for a read mate that is spliced" + echo "" + echo " --alignSplicedMateMapLminOverLmate" + echo " type: double" + echo " example: 0.66" + echo " alignSplicedMateMapLmin normalized to mate length" + echo "" + echo " --alignWindowsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of windows per read" + echo "" + echo " --alignTranscriptsPerWindowNmax" + echo " type: integer" + echo " example: 100" + echo " max number of transcripts per window" + echo "" + echo " --alignTranscriptsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of different alignments per read to consider" + echo "" + echo " --alignEndsType" + echo " type: string" + echo " example: Local" + echo " type of read ends alignment" + echo " - Local ... standard local alignment with soft-clipping" + echo " allowed" + echo " - EndToEnd ... force end-to-end read alignment, do not" + echo " soft-clip" + echo " - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other" + echo " ends: local alignment" + echo " - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and" + echo " read2, all other ends: local alignment" + echo "" + echo " --alignEndsProtrude" + echo " type: string" + echo " example: 0 ConcordantPair" + echo " allow protrusion of alignment ends, i.e. start (end) of the +strand mate" + echo " downstream of the start (end) of the -strand mate" + echo " 1st word: int: maximum number of protrusion bases allowed" + echo " 2nd word: string:" + echo " - ConcordantPair ... report alignments with non-zero" + echo " protrusion as concordant pairs" + echo " - DiscordantPair ... report alignments with non-zero" + echo " protrusion as discordant pairs" + echo "" + echo " --alignSoftClipAtReferenceEnds" + echo " type: string" + echo " example: Yes" + echo " allow the soft-clipping of the alignments past the end of the" + echo " chromosomes" + echo " - Yes ... allow" + echo " - No ... prohibit, useful for compatibility with Cufflinks" + echo "" + echo " --alignInsertionFlush" + echo " type: string" + echo " how to flush ambiguous insertion positions" + echo " - None ... insertions are not flushed" + echo " - Right ... insertions are flushed to the right" + echo "" + echo "Paired-End reads:" + echo " --peOverlapNbasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of overlapping bases to trigger mates merging and" + echo " realignment. Specify >0 value to switch on the \"merginf of overlapping" + echo " mates\" algorithm." + echo "" + echo " --peOverlapMMp" + echo " type: double" + echo " example: 0.01" + echo " maximum proportion of mismatched bases in the overlap area" + echo "" + echo "Windows, Anchors, Binning:" + echo " --winAnchorMultimapNmax" + echo " type: integer" + echo " example: 50" + echo " max number of loci anchors are allowed to map to" + echo "" + echo " --winBinNbits" + echo " type: integer" + echo " example: 16" + echo " =log2(winBin), where winBin is the size of the bin for the" + echo " windows/clustering, each window will occupy an integer number of bins." + echo "" + echo " --winAnchorDistNbins" + echo " type: integer" + echo " example: 9" + echo " max number of bins between two anchors that allows aggregation of" + echo " anchors into one window" + echo "" + echo " --winFlankNbins" + echo " type: integer" + echo " example: 4" + echo " log2(winFlank), where win Flank is the size of the left and right" + echo " flanking regions for each window" + echo "" + echo " --winReadCoverageRelativeMin" + echo " type: double" + echo " example: 0.5" + echo " minimum relative coverage of the read sequence by the seeds in a window," + echo " for STARlong algorithm only." + echo "" + echo " --winReadCoverageBasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of bases covered by the seeds in a window , for STARlong" + echo " algorithm only." + echo "" + echo "Chimeric Alignments:" + echo " --chimOutType" + echo " type: string, multiple values allowed" + echo " example: Junctions" + echo " type of chimeric output" + echo " - Junctions ... Chimeric.out.junction" + echo " - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file" + echo " - WithinBAM ... output into main aligned BAM files (Aligned.*.bam)" + echo " - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for" + echo " supplemental chimeric alignments (default if no 2nd word is present)" + echo " - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental" + echo " chimeric alignments" + echo "" + echo " --chimSegmentMin" + echo " type: integer" + echo " example: 0" + echo " minimum length of chimeric segment length, if ==0, no chimeric output" + echo "" + echo " --chimScoreMin" + echo " type: integer" + echo " example: 0" + echo " minimum total (summed) score of the chimeric segments" + echo "" + echo " --chimScoreDropMax" + echo " type: integer" + echo " example: 20" + echo " max drop (difference) of chimeric score (the sum of scores of all" + echo " chimeric segments) from the read length" + echo "" + echo " --chimScoreSeparation" + echo " type: integer" + echo " example: 10" + echo " minimum difference (separation) between the best chimeric score and the" + echo " next one" + echo "" + echo " --chimScoreJunctionNonGTAG" + echo " type: integer" + echo " example: -1" + echo " penalty for a non-GT/AG chimeric junction" + echo "" + echo " --chimJunctionOverhangMin" + echo " type: integer" + echo " example: 20" + echo " minimum overhang for a chimeric junction" + echo "" + echo " --chimSegmentReadGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap in the read sequence between chimeric segments" + echo "" + echo " --chimFilter" + echo " type: string, multiple values allowed" + echo " example: banGenomicN" + echo " different filters for chimeric alignments" + echo " - None ... no filtering" + echo " - banGenomicN ... Ns are not allowed in the genome sequence around the" + echo " chimeric junction" + echo "" + echo " --chimMainSegmentMultNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of multi-alignments for the main chimeric segment. =1" + echo " will prohibit multimapping main segments." + echo "" + echo " --chimMultimapNmax" + echo " type: integer" + echo " example: 0" + echo " maximum number of chimeric multi-alignments" + echo " - 0 ... use the old scheme for chimeric detection which only considered" + echo " unique alignments" + echo "" + echo " --chimMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range for multi-mapping chimeras below the best chimeric" + echo " score. Only works with --chimMultimapNmax > 1" + echo "" + echo " --chimNonchimScoreDropMin" + echo " type: integer" + echo " example: 20" + echo " to trigger chimeric detection, the drop in the best non-chimeric" + echo " alignment score with respect to the read length has to be greater than" + echo " this value" + echo "" + echo " --chimOutJunctionFormat" + echo " type: integer" + echo " example: 0" + echo " formatting type for the Chimeric.out.junction file" + echo " - 0 ... no comment lines/headers" + echo " - 1 ... comment lines at the end of the file: command line and Nreads:" + echo " total, unique/multi-mapping" + echo "" + echo "Quantification of Annotations:" + echo " --quantMode" + echo " type: string, multiple values allowed" + echo " types of quantification requested" + echo " - - ... none" + echo " - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a" + echo " separate file" + echo " - GeneCounts ... count reads per gene" + echo "" + echo " --quantTranscriptomeBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -2 to 10 transcriptome BAM compression level" + echo " - -2 ... no BAM output" + echo " - -1 ... default compression (6?)" + echo " - 0 ... no compression" + echo " - 10 ... maximum compression" + echo "" + echo " --quantTranscriptomeBan" + echo " type: string" + echo " example: IndelSoftclipSingleend" + echo " prohibit various alignment type" + echo " - IndelSoftclipSingleend ... prohibit indels, soft clipping and" + echo " single-end alignments - compatible with RSEM" + echo " - Singleend ... prohibit single-end alignments" + echo "" + echo "2-pass Mapping:" + echo " --twopassMode" + echo " type: string" + echo " 2-pass mapping mode." + echo " - None ... 1-pass mapping" + echo " - Basic ... basic 2-pass mapping, with all 1st pass junctions" + echo " inserted into the genome indices on the fly" + echo "" + echo " --twopass1readsN" + echo " type: integer" + echo " example: -1" + echo " number of reads to process for the 1st step. Use very large number (or" + echo " default -1) to map all reads in the first step." + echo "" + echo "WASP parameters:" + echo " --waspOutputMode" + echo " type: string" + echo " WASP allele-specific output type. This is re-implementation of the" + echo " original WASP mappability filtering by Bryce van de Geijn, Graham" + echo " McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original" + echo " WASP paper: Nature Methods 12, 1061-1063 (2015)," + echo " https://www.nature.com/articles/nmeth.3582 ." + echo " - SAMtag ... add WASP tags to the alignments that pass WASP" + echo " filtering" + echo "" + echo "STARsolo (single cell RNA-seq) parameters:" + echo " --soloType" + echo " type: string, multiple values allowed" + echo " type of single-cell RNA-seq" + echo " - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of" + echo " fixed length in read2, e.g. Drop-seq and 10X Chromium." + echo " - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI" + echo " of fixed length and one adapter sequence of fixed length are allowed in" + echo " read2 only (e.g. inDrop, ddSeq)." + echo " - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No" + echo " UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end]" + echo " CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or" + echo " SortedByCoordinate]" + echo " - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired-" + echo " or single-end), barcodes are corresponding read-groups, no UMI" + echo " sequences, alignments deduplicated according to alignment start and end" + echo " (after extending soft-clipped bases)" + echo "" + echo " --soloCBwhitelist" + echo " type: string, multiple values allowed" + echo " file(s) with whitelist(s) of cell barcodes. Only --soloType" + echo " CB_UMI_Complex allows more than one whitelist file." + echo " - None ... no whitelist: all cell barcodes are allowed" + echo "" + echo " --soloCBstart" + echo " type: integer" + echo " example: 1" + echo " cell barcode start base" + echo "" + echo " --soloCBlen" + echo " type: integer" + echo " example: 16" + echo " cell barcode length" + echo "" + echo " --soloUMIstart" + echo " type: integer" + echo " example: 17" + echo " UMI start base" + echo "" + echo " --soloUMIlen" + echo " type: integer" + echo " example: 10" + echo " UMI length" + echo "" + echo " --soloBarcodeReadLength" + echo " type: integer" + echo " example: 1" + echo " length of the barcode read" + echo " - 1 ... equal to sum of soloCBlen+soloUMIlen" + echo " - 0 ... not defined, do not check" + echo "" + echo " --soloBarcodeMate" + echo " type: integer" + echo " example: 0" + echo " identifies which read mate contains the barcode (CB+UMI) sequence" + echo " - 0 ... barcode sequence is on separate read, which should always be" + echo " the last file in the --readFilesIn listed" + echo " - 1 ... barcode sequence is a part of mate 1" + echo " - 2 ... barcode sequence is a part of mate 2" + echo "" + echo " --soloCBposition" + echo " type: string, multiple values allowed" + echo " position of Cell Barcode(s) on the barcode read." + echo " Presently only works with --soloType CB_UMI_Complex, and barcodes are" + echo " assumed to be on Read2." + echo " Format for each barcode: startAnchor_startPosition_endAnchor_endPosition" + echo " start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1:" + echo " read end; 2: adapter start; 3: adapter end" + echo " start(end)Position is the 0-based position with of the CB start(end)" + echo " with respect to the Anchor Base" + echo " String for different barcodes are separated by space." + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 0_0_2_-1 3_1_3_8" + echo "" + echo " --soloUMIposition" + echo " type: string" + echo " position of the UMI on the barcode read, same as soloCBposition" + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 3_9_3_14" + echo "" + echo " --soloAdapterSequence" + echo " type: string" + echo " adapter sequence to anchor barcodes. Only one adapter sequence is" + echo " allowed." + echo "" + echo " --soloAdapterMismatchesNmax" + echo " type: integer" + echo " example: 1" + echo " maximum number of mismatches allowed in adapter sequence." + echo "" + echo " --soloCBmatchWLtype" + echo " type: string" + echo " example: 1MM_multi" + echo " matching the Cell Barcodes to the WhiteList" + echo " - Exact ... only exact matches allowed" + echo " - 1MM ... only one match in whitelist with 1" + echo " mismatched base allowed. Allowed CBs have to have at least one read with" + echo " exact match." + echo " - 1MM_multi ... multiple matches in whitelist with" + echo " 1 mismatched base allowed, posterior probability calculation is used" + echo " choose one of the matches." + echo " Allowed CBs have to have at least one read with exact match. This option" + echo " matches best with CellRanger 2.2.0" + echo " - 1MM_multi_pseudocounts ... same as 1MM_Multi, but" + echo " pseudocounts of 1 are added to all whitelist barcodes." + echo " - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts," + echo " multimatching to WL is allowed for CBs with N-bases. This option matches" + echo " best with CellRanger >= 3.0.0" + echo " - EditDist_2 ... allow up to edit distance of 3 fpr" + echo " each of the barcodes. May include one deletion + one insertion. Only" + echo " works with --soloType CB_UMI_Complex. Matches to multiple passlist" + echo " barcdoes are not allowed. Similar to ParseBio Split-seq pipeline." + echo "" + echo " --soloInputSAMattrBarcodeSeq" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode sequence (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeSeq CR UR ." + echo " This parameter is required when running STARsolo with input from SAM." + echo "" + echo " --soloInputSAMattrBarcodeQual" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode qualities (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeQual CY UY ." + echo " If this parameter is '-' (default), the quality 'H' will be assigned to" + echo " all bases." + echo "" + echo " --soloStrand" + echo " type: string" + echo " example: Forward" + echo " strandedness of the solo libraries:" + echo " - Unstranded ... no strand information" + echo " - Forward ... read strand same as the original RNA molecule" + echo " - Reverse ... read strand opposite to the original RNA molecule" + echo "" + echo " --soloFeatures" + echo " type: string, multiple values allowed" + echo " example: Gene" + echo " genomic features for which the UMI counts per Cell Barcode are collected" + echo " - Gene ... genes: reads match the gene transcript" + echo " - SJ ... splice junctions: reported in SJ.out.tab" + echo " - GeneFull ... full gene (pre-mRNA): count all reads overlapping" + echo " genes' exons and introns" + echo " - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads" + echo " overlapping genes' exons and introns: prioritize 100% overlap with exons" + echo " - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads" + echo " overlapping genes' exons and introns: prioritize >50% overlap with" + echo " exons. Do not count reads with 100% exonic overlap in the antisense" + echo " direction." + echo "" + echo " --soloMultiMappers" + echo " type: string, multiple values allowed" + echo " example: Unique" + echo " counting method for reads mapping to multiple genes" + echo " - Unique ... count only reads that map to unique genes" + echo " - Uniform ... uniformly distribute multi-genic UMIs to all genes" + echo " - Rescue ... distribute UMIs proportionally to unique+uniform counts" + echo " (~ first iteration of EM)" + echo " - PropUnique ... distribute UMIs proportionally to unique mappers, if" + echo " present, and uniformly if not." + echo " - EM ... multi-gene UMIs are distributed using Expectation" + echo " Maximization algorithm" + echo "" + echo " --soloUMIdedup" + echo " type: string, multiple values allowed" + echo " example: 1MM_All" + echo " type of UMI deduplication (collapsing) algorithm" + echo " - 1MM_All ... all UMIs with 1 mismatch distance to" + echo " each other are collapsed (i.e. counted once)." + echo " - 1MM_Directional_UMItools ... follows the \"directional\" method from" + echo " the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017)." + echo " - 1MM_Directional ... same as 1MM_Directional_UMItools, but" + echo " with more stringent criteria for duplicate UMIs" + echo " - Exact ... only exactly matching UMIs are" + echo " collapsed." + echo " - NoDedup ... no deduplication of UMIs, count all" + echo " reads." + echo " - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI" + echo " collapsing." + echo "" + echo " --soloUMIfiltering" + echo " type: string, multiple values allowed" + echo " type of UMI filtering (for reads uniquely mapping to genes)" + echo " - - ... basic filtering: remove UMIs with N and" + echo " homopolymers (similar to CellRanger 2.2.0)." + echo " - MultiGeneUMI ... basic + remove lower-count UMIs that map to" + echo " more than one gene." + echo " - MultiGeneUMI_All ... basic + remove all UMIs that map to more than" + echo " one gene." + echo " - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to" + echo " more than one gene, matching CellRanger > 3.0.0 ." + echo " Only works with --soloUMIdedup 1MM_CR" + echo "" + echo " --soloOutFileNames" + echo " type: string, multiple values allowed" + echo " example: Solo.out/;features.tsv;barcodes.tsv;matrix.mtx" + echo " file names for STARsolo output:" + echo " file_name_prefix gene_names barcode_sequences" + echo " cell_feature_count_matrix" + echo "" + echo " --soloCellFilter" + echo " type: string, multiple values allowed" + echo " example: CellRanger2.2;3000;0.99;10" + echo " cell filtering type and parameters" + echo " - None ... do not output filtered cells" + echo " - TopCells ... only report top cells by UMI count, followed by" + echo " the exact number of cells" + echo " - CellRanger2.2 ... simple filtering of CellRanger 2.2." + echo " Can be followed by numbers: number of expected cells, robust maximum" + echo " percentile for UMI count, maximum to minimum ratio for UMI count" + echo " The harcoded values are from CellRanger: nExpectedCells=3000;" + echo " maxPercentile=0.99; maxMinRatio=10" + echo " - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please" + echo " cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20," + echo " 63 (2019):" + echo " " + echo "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y" + echo " Can be followed by 10 numeric parameters: nExpectedCells" + echo " maxPercentile maxMinRatio indMin indMax umiMin" + echo " umiMinFracMedian candMaxN FDR simN" + echo " The harcoded values are from CellRanger: 3000" + echo " 0.99 10 45000 90000 500 0.01" + echo " 20000 0.01 10000" + echo "" + echo " --soloOutFormatFeaturesGeneField3" + echo " type: string, multiple values allowed" + echo " example: Gene Expression" + echo " field 3 in the Gene features.tsv file. If \"-\", then no 3rd field is" + echo " output." + echo "" + echo " --soloCellReadStats" + echo " type: string" + echo " Output reads statistics for each CB" + echo " - Standard ... standard output" + echo "" + echo "HTSeq arguments:" + echo " -s, --stranded" + echo " type: string" + echo " default: yes" + echo " choices: [ yes, no, reverse ]" + echo " Whether the data is from a strand-specific assay. 'reverse' means 'yes'" + echo " with reversed strand interpretation." + echo "" + echo " -a, --minaqual, --minimum_alignment_quality" + echo " type: integer" + echo " default: 10" + echo " Skip all reads with MAPQ alignment quality lower than the given minimum" + echo " value." + echo " MAPQ is the 5th column of a SAM/BAM file and its usage depends on the" + echo " software" + echo " used to map the reads." + echo "" + echo " -t, --type" + echo " type: string" + echo " example: exon" + echo " Feature type (3rd column in GTF file) to be used, all features of other" + echo " type are ignored (default, suitable for Ensembl GTF files: exon)" + echo "" + echo " -i, --id_attribute" + echo " type: string, multiple values allowed" + echo " example: gene_id" + echo " GTF attribute to be used as feature ID (default, suitable for Ensembl" + echo " GTF files: gene_id)." + echo " All feature of the right type (see -t option) within the same GTF" + echo " attribute will be added" + echo " together. The typical way of using this option is to count all exonic" + echo " reads from each gene" + echo " and add the exons but other uses are possible as well. You can call this" + echo " option multiple" + echo " times: in that case, the combination of all attributes separated by" + echo " colons (:) will be used" + echo " as a unique identifier, e.g. for exons you might use -i gene_id -i" + echo " exon_number." + echo "" + echo " --additional_attributes" + echo " type: string, multiple values allowed" + echo " example: gene_name" + echo " Additional feature attributes (suitable for Ensembl GTF files:" + echo " gene_name). Use multiple times" + echo " for more than one additional attribute. These attributes are only used" + echo " as annotations in the" + echo " output, while the determination of how the counts are added together is" + echo " done based on option -i." + echo "" + echo " --add_chromosome_info" + echo " type: boolean_true" + echo " Store information about the chromosome of each feature as an additional" + echo " attribute" + echo " (e.g. colunm in the TSV output file)." + echo "" + echo " -m, --mode" + echo " type: string" + echo " default: union" + echo " choices: [ union, intersection-strict, intersection-nonempty ]" + echo " Mode to handle reads overlapping more than one feature." + echo "" + echo " --non_unique" + echo " type: string" + echo " default: none" + echo " choices: [ none, all, fraction, random ]" + echo " Whether and how to score reads that are not uniquely aligned or" + echo " ambiguously assigned to features." + echo "" + echo " --secondary_alignments" + echo " type: string" + echo " choices: [ score, ignore ]" + echo " Whether to score secondary alignments (0x100 flag)." + echo "" + echo " --supplementary_alignments" + echo " type: string" + echo " choices: [ score, ignore ]" + echo " Whether to score supplementary alignments (0x800 flag)." + echo "" + echo " --counts_output_sparse" + echo " type: boolean_true" + echo " Store the counts as a sparse matrix (mtx, h5ad, loom)." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "multi_star main" + exit + ;; + --input_id) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID="$2" + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id=*) + if [ -z "$VIASH_PAR_INPUT_ID" ]; then + VIASH_PAR_INPUT_ID=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_ID="$VIASH_PAR_INPUT_ID;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_r1) + if [ -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_PAR_INPUT_R1="$2" + else + VIASH_PAR_INPUT_R1="$VIASH_PAR_INPUT_R1;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_r1. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_r1=*) + if [ -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_PAR_INPUT_R1=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_R1="$VIASH_PAR_INPUT_R1;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_r2) + if [ -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_PAR_INPUT_R2="$2" + else + VIASH_PAR_INPUT_R2="$VIASH_PAR_INPUT_R2;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_r2. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_r2=*) + if [ -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_PAR_INPUT_R2=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_R2="$VIASH_PAR_INPUT_R2;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference_index) + [ -n "$VIASH_PAR_REFERENCE_INDEX" ] && ViashError Bad arguments for option \'--reference_index\': \'$VIASH_PAR_REFERENCE_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_index. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_index=*) + [ -n "$VIASH_PAR_REFERENCE_INDEX" ] && ViashError Bad arguments for option \'--reference_index=*\': \'$VIASH_PAR_REFERENCE_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_INDEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeDir) + [ -n "$VIASH_PAR_REFERENCE_INDEX" ] && ViashError Bad arguments for option \'--genomeDir\': \'$VIASH_PAR_REFERENCE_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_INDEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_gtf) + [ -n "$VIASH_PAR_REFERENCE_GTF" ] && ViashError Bad arguments for option \'--reference_gtf\': \'$VIASH_PAR_REFERENCE_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_gtf=*) + [ -n "$VIASH_PAR_REFERENCE_GTF" ] && ViashError Bad arguments for option \'--reference_gtf=*\': \'$VIASH_PAR_REFERENCE_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFileNamePrefix) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--outFileNamePrefix\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFileNamePrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_htseq_count) + [ -n "$VIASH_PAR_RUN_HTSEQ_COUNT" ] && ViashError Bad arguments for option \'--run_htseq_count\': \'$VIASH_PAR_RUN_HTSEQ_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_HTSEQ_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --run_htseq_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_htseq_count=*) + [ -n "$VIASH_PAR_RUN_HTSEQ_COUNT" ] && ViashError Bad arguments for option \'--run_htseq_count=*\': \'$VIASH_PAR_RUN_HTSEQ_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_HTSEQ_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --run_multiqc) + [ -n "$VIASH_PAR_RUN_MULTIQC" ] && ViashError Bad arguments for option \'--run_multiqc\': \'$VIASH_PAR_RUN_MULTIQC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_MULTIQC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --run_multiqc. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_multiqc=*) + [ -n "$VIASH_PAR_RUN_MULTIQC" ] && ViashError Bad arguments for option \'--run_multiqc=*\': \'$VIASH_PAR_RUN_MULTIQC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_MULTIQC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_success_rate) + [ -n "$VIASH_PAR_MIN_SUCCESS_RATE" ] && ViashError Bad arguments for option \'--min_success_rate\': \'$VIASH_PAR_MIN_SUCCESS_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUCCESS_RATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_success_rate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_success_rate=*) + [ -n "$VIASH_PAR_MIN_SUCCESS_RATE" ] && ViashError Bad arguments for option \'--min_success_rate=*\': \'$VIASH_PAR_MIN_SUCCESS_RATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SUCCESS_RATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --runRNGseed) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --runRNGseed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runRNGseed=*) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed=*\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeFastaFiles) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES="$2" + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFastaFiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeFastaFiles=*) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbFileChrStartEnd) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND="$2" + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbFileChrStartEnd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbFileChrStartEnd=*) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFfile) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfile=*) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile=*\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFchrPrefix) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFchrPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFchrPrefix=*) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix=*\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFfeatureExon) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfeatureExon. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfeatureExon=*) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon=*\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentTranscript) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentTranscript. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentTranscript=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGene) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGene. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGene=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGeneName) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneName. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneName=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFtagExonParentGeneType) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneType=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbOverhang) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbOverhang. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbOverhang=*) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang=*\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbScore) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbScore. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbScore=*) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore=*\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbInsertSave) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbInsertSave. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbInsertSave=*) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave=*\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varVCFfile) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varVCFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varVCFfile=*) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile=*\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesType) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesType=*) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType=*\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesSAMattrKeep) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP="$2" + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesSAMattrKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesSAMattrKeep=*) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readFilesManifest) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesManifest. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesManifest=*) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest=*\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesPrefix) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesPrefix=*) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix=*\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesCommand) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND="$2" + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesCommand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesCommand=*) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readMapNumber) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMapNumber. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMapNumber=*) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber=*\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readMatesLengthsIn) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMatesLengthsIn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMatesLengthsIn=*) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn=*\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readNameSeparator) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR="$2" + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readNameSeparator. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readNameSeparator=*) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readQualityScoreBase) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readQualityScoreBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readQualityScoreBase=*) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase=*\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clipAdapterType) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clipAdapterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clipAdapterType=*) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType=*\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clip3pNbases) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES="$2" + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pNbases=*) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterSeq) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ="$2" + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterSeq=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterMMp) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP="$2" + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterMMp=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAfterAdapterNbases) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$2" + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAfterAdapterNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAfterAdapterNbases=*) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip5pNbases) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES="$2" + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip5pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip5pNbases=*) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitGenomeGenerateRAM) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitGenomeGenerateRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitGenomeGenerateRAM=*) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM=*\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitIObufferSize) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE="$2" + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitIObufferSize. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitIObufferSize=*) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitOutSAMoneReadBytes) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSAMoneReadBytes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSAMoneReadBytes=*) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes=*\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJoneRead) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJoneRead. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJoneRead=*) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead=*\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJcollapsed) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJcollapsed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJcollapsed=*) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed=*\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitBAMsortRAM) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitBAMsortRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitBAMsortRAM=*) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM=*\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitSjdbInsertNsj) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitSjdbInsertNsj. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitSjdbInsertNsj=*) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj=*\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitNreadsSoft) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitNreadsSoft. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitNreadsSoft=*) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft=*\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outTmpKeep) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outTmpKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outTmpKeep=*) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep=*\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outStd) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outStd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outStd=*) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd=*\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outReadsUnmapped) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outReadsUnmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outReadsUnmapped=*) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped=*\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outQSconversionAdd) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outQSconversionAdd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outQSconversionAdd=*) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd=*\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outMultimapperOrder) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outMultimapperOrder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outMultimapperOrder=*) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder=*\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMmode) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmode=*) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode=*\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMstrandField) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMstrandField. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMstrandField=*) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField=*\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattributes) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES="$2" + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattributes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattributes=*) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMattrIHstart) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrIHstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrIHstart=*) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart=*\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMunmapped) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED="$2" + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMunmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMunmapped=*) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMorder) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMorder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMorder=*) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder=*\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMprimaryFlag) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMprimaryFlag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMprimaryFlag=*) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag=*\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMreadID) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMreadID. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMreadID=*) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID=*\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMmapqUnique) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmapqUnique. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmapqUnique=*) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique=*\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagOR) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagOR. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagOR=*) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR=*\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagAND) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagAND. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagAND=*) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND=*\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattrRGline) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE="$2" + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrRGline. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrRGline=*) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderHD) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD="$2" + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderHD. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderHD=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderPG) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG="$2" + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderPG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderPG=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderCommentFile) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderCommentFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderCommentFile=*) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile=*\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMfilter) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER="$2" + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMfilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMfilter=*) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMmultNmax) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmultNmax=*) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax=*\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMtlen) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMtlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMtlen=*) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen=*\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMcompression) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMcompression=*) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression=*\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingThreadN) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingThreadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingThreadN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN=*\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingBinsN) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingBinsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingBinsN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN=*\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesType) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesType=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesMate2basesN) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesMate2basesN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesMate2basesN=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigType) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE="$2" + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigType=*) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outWigStrand) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigStrand=*) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand=*\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigReferencesPrefix) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigReferencesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigReferencesPrefix=*) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix=*\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigNorm) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigNorm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigNorm=*) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm=*\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterType) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterType=*) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType=*\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapScoreRange) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapScoreRange=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapNmax) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverReadLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverReadLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverReadLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMin) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMin=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin=*\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMinOverLread) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMinOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMinOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread=*\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNmin) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNmin=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin=*\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNminOverLread) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNminOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNminOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread=*\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronMotifs) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronMotifs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronMotifs=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs=*\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronStrands) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronStrands. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronStrands=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands=*\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJtype) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJtype=*) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype=*\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterReads) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterReads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterReads=*) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads=*\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterOverhangMin) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$2" + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterOverhangMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountUniqueMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountUniqueMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountUniqueMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountTotalMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountTotalMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountTotalMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterDistToOtherSJmin) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$2" + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterDistToOtherSJmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterDistToOtherSJmin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterIntronMaxVsReadN) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$2" + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterIntronMaxVsReadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterIntronMaxVsReadN=*) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --scoreGap) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGap=*) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap=*\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapNoncan) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapNoncan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapNoncan=*) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan=*\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapGCAG) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapGCAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapGCAG=*) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG=*\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapATAC) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapATAC. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapATAC=*) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC=*\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGenomicLengthLog2scale) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGenomicLengthLog2scale. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGenomicLengthLog2scale=*) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale=*\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelOpen) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelOpen=*) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen=*\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelBase) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelBase=*) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase=*\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsOpen) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsOpen=*) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen=*\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsBase) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsBase=*) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase=*\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreStitchSJshift) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreStitchSJshift. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreStitchSJshift=*) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift=*\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmax) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmaxOverLread) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmaxOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmaxOverLread=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchLmax) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax=*\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMultimapNmax) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMultimapNmax=*) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax=*\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerReadNmax) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerReadNmax=*) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax=*\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerWindowNmax) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerWindowNmax=*) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax=*\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedNoneLociPerWindow) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedNoneLociPerWindow. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedNoneLociPerWindow=*) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow=*\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSplitMin) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSplitMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSplitMin=*) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin=*\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMapMin) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMapMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMapMin=*) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin=*\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMin) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMin=*) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin=*\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMax) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMax=*) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax=*\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignMatesGapMax) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignMatesGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignMatesGapMax=*) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax=*\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin=*\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJstitchMismatchNmax) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$2" + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJstitchMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJstitchMismatchNmax=*) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --alignSJDBoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJDBoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJDBoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin=*\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLmin) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLmin=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLminOverLmate) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLminOverLmate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLminOverLmate=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignWindowsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignWindowsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignWindowsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax=*\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerWindowNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerWindowNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsType) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsType=*) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType=*\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsProtrude) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsProtrude. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsProtrude=*) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude=*\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSoftClipAtReferenceEnds) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSoftClipAtReferenceEnds. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSoftClipAtReferenceEnds=*) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds=*\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignInsertionFlush) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignInsertionFlush. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignInsertionFlush=*) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush=*\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapNbasesMin) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapNbasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapNbasesMin=*) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin=*\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapMMp) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapMMp=*) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp=*\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorMultimapNmax) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorMultimapNmax=*) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax=*\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winBinNbits) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winBinNbits. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winBinNbits=*) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits=*\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorDistNbins) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorDistNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorDistNbins=*) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins=*\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winFlankNbins) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winFlankNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winFlankNbins=*) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins=*\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageRelativeMin) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageRelativeMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageRelativeMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin=*\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageBasesMin) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageBasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageBasesMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin=*\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutType) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE="$2" + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutType=*) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimSegmentMin) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentMin=*) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin=*\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreMin) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreMin=*) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin=*\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreDropMax) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreDropMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreDropMax=*) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax=*\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreSeparation) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreSeparation. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreSeparation=*) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation=*\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreJunctionNonGTAG) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreJunctionNonGTAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreJunctionNonGTAG=*) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG=*\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimJunctionOverhangMin) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimJunctionOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimJunctionOverhangMin=*) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin=*\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimSegmentReadGapMax) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentReadGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentReadGapMax=*) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax=*\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimFilter) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER="$2" + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimFilter=*) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimMainSegmentMultNmax) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMainSegmentMultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMainSegmentMultNmax=*) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax=*\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapNmax) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapNmax=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax=*\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapScoreRange) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapScoreRange=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange=*\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimNonchimScoreDropMin) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimNonchimScoreDropMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimNonchimScoreDropMin=*) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin=*\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutJunctionFormat) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutJunctionFormat. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutJunctionFormat=*) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat=*\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantMode) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE="$2" + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantMode=*) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --quantTranscriptomeBAMcompression) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBAMcompression=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantTranscriptomeBan) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBan=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopassMode) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopassMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopassMode=*) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode=*\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopass1readsN) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopass1readsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopass1readsN=*) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN=*\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --waspOutputMode) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --waspOutputMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --waspOutputMode=*) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode=*\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloType) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE="$2" + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloType=*) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBwhitelist) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST="$2" + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBwhitelist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBwhitelist=*) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBstart) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBstart=*) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart=*\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBlen) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBlen=*) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen=*\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIstart) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIstart=*) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart=*\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIlen) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIlen=*) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen=*\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeReadLength) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeReadLength. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeReadLength=*) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength=*\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeMate) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeMate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeMate=*) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate=*\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBposition) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION="$2" + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBposition=*) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIposition) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIposition=*) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition=*\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterSequence) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterSequence. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterSequence=*) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence=*\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterMismatchesNmax) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterMismatchesNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterMismatchesNmax=*) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax=*\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBmatchWLtype) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBmatchWLtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBmatchWLtype=*) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype=*\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloInputSAMattrBarcodeSeq) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeSeq=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloInputSAMattrBarcodeQual) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeQual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeQual=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloStrand) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloStrand=*) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand=*\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloFeatures) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES="$2" + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloFeatures. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloFeatures=*) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloMultiMappers) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS="$2" + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloMultiMappers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloMultiMappers=*) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIdedup) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP="$2" + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIdedup. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIdedup=*) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIfiltering) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING="$2" + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIfiltering. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIfiltering=*) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFileNames) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES="$2" + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFileNames. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFileNames=*) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellFilter) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER="$2" + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellFilter=*) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFormatFeaturesGeneField3) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$2" + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFormatFeaturesGeneField3. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFormatFeaturesGeneField3=*) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellReadStats) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellReadStats. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellReadStats=*) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats=*\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --stranded) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'--stranded\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --stranded. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --stranded=*) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'--stranded=*\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED=$(ViashRemoveFlags "$1") + shift 1 + ;; + -s) + [ -n "$VIASH_PAR_STRANDED" ] && ViashError Bad arguments for option \'-s\': \'$VIASH_PAR_STRANDED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STRANDED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -s. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minimum_alignment_quality) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minimum_alignment_quality\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --minimum_alignment_quality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minimum_alignment_quality=*) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minimum_alignment_quality=*\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + -a) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'-a\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -a. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minaqual) + [ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ] && ViashError Bad arguments for option \'--minaqual\': \'$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --minaqual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --type) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'--type\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --type. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --type=*) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'--type=*\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -t) + [ -n "$VIASH_PAR_TYPE" ] && ViashError Bad arguments for option \'-t\': \'$VIASH_PAR_TYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id_attribute) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE="$2" + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --id_attribute. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id_attribute=*) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_ID_ATTRIBUTE" ]; then + VIASH_PAR_ID_ATTRIBUTE="$2" + else + VIASH_PAR_ID_ATTRIBUTE="$VIASH_PAR_ID_ATTRIBUTE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --additional_attributes) + if [ -z "$VIASH_PAR_ADDITIONAL_ATTRIBUTES" ]; then + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$2" + else + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$VIASH_PAR_ADDITIONAL_ATTRIBUTES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --additional_attributes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --additional_attributes=*) + if [ -z "$VIASH_PAR_ADDITIONAL_ATTRIBUTES" ]; then + VIASH_PAR_ADDITIONAL_ATTRIBUTES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ADDITIONAL_ATTRIBUTES="$VIASH_PAR_ADDITIONAL_ATTRIBUTES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --add_chromosome_info) + [ -n "$VIASH_PAR_ADD_CHROMOSOME_INFO" ] && ViashError Bad arguments for option \'--add_chromosome_info\': \'$VIASH_PAR_ADD_CHROMOSOME_INFO\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ADD_CHROMOSOME_INFO=true + shift 1 + ;; + --mode) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mode=*) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode=*\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -m) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'-m\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -m. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --non_unique) + [ -n "$VIASH_PAR_NON_UNIQUE" ] && ViashError Bad arguments for option \'--non_unique\': \'$VIASH_PAR_NON_UNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NON_UNIQUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --non_unique. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --non_unique=*) + [ -n "$VIASH_PAR_NON_UNIQUE" ] && ViashError Bad arguments for option \'--non_unique=*\': \'$VIASH_PAR_NON_UNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NON_UNIQUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --secondary_alignments) + [ -n "$VIASH_PAR_SECONDARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--secondary_alignments\': \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ALIGNMENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --secondary_alignments. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --secondary_alignments=*) + [ -n "$VIASH_PAR_SECONDARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--secondary_alignments=*\': \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SECONDARY_ALIGNMENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --supplementary_alignments) + [ -n "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--supplementary_alignments\': \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --supplementary_alignments. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --supplementary_alignments=*) + [ -n "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ] && ViashError Bad arguments for option \'--supplementary_alignments=*\': \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --counts_output_sparse) + [ -n "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" ] && ViashError Bad arguments for option \'--counts_output_sparse\': \'$VIASH_PAR_COUNTS_OUTPUT_SPARSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COUNTS_OUTPUT_SPARSE=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/multi_star:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_ID+x} ]; then + ViashError '--input_id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_R1+x} ]; then + ViashError '--input_r1' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_INDEX+x} ]; then + ViashError '--reference_index' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_GTF+x} ]; then + ViashError '--reference_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_RUN_HTSEQ_COUNT+x} ]; then + VIASH_PAR_RUN_HTSEQ_COUNT="true" +fi +if [ -z ${VIASH_PAR_RUN_MULTIQC+x} ]; then + VIASH_PAR_RUN_MULTIQC="true" +fi +if [ -z ${VIASH_PAR_MIN_SUCCESS_RATE+x} ]; then + VIASH_PAR_MIN_SUCCESS_RATE="0.5" +fi +if [ -z ${VIASH_PAR_STRANDED+x} ]; then + VIASH_PAR_STRANDED="yes" +fi +if [ -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then + VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY="10" +fi +if [ -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then + VIASH_PAR_ADD_CHROMOSOME_INFO="false" +fi +if [ -z ${VIASH_PAR_MODE+x} ]; then + VIASH_PAR_MODE="union" +fi +if [ -z ${VIASH_PAR_NON_UNIQUE+x} ]; then + VIASH_PAR_NON_UNIQUE="none" +fi +if [ -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then + VIASH_PAR_COUNTS_OUTPUT_SPARSE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT_R1; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT_R2; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE_INDEX" ] && [ ! -e "$VIASH_PAR_REFERENCE_INDEX" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE_INDEX' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_REFERENCE_GTF" ] && [ ! -e "$VIASH_PAR_REFERENCE_GTF" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE_GTF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ] && [ ! -e "$VIASH_PAR_SJDBGTFFILE" ]; then + ViashError "Input file '$VIASH_PAR_SJDBGTFFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ] && [ ! -e "$VIASH_PAR_READFILESMANIFEST" ]; then + ViashError "Input file '$VIASH_PAR_READFILESMANIFEST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_RUN_HTSEQ_COUNT" ]]; then + if ! [[ "$VIASH_PAR_RUN_HTSEQ_COUNT" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--run_htseq_count' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RUN_MULTIQC" ]]; then + if ! [[ "$VIASH_PAR_RUN_MULTIQC" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--run_multiqc' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SUCCESS_RATE" ]]; then + if ! [[ "$VIASH_PAR_MIN_SUCCESS_RATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--min_success_rate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RUNRNGSEED" ]]; then + if ! [[ "$VIASH_PAR_RUNRNGSEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--runRNGseed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SJDBOVERHANG" ]]; then + if ! [[ "$VIASH_PAR_SJDBOVERHANG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbOverhang' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SJDBSCORE" ]]; then + if ! [[ "$VIASH_PAR_SJDBSCORE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbScore' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READMAPNUMBER" ]]; then + if ! [[ "$VIASH_PAR_READMAPNUMBER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readMapNumber' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READQUALITYSCOREBASE" ]]; then + if ! [[ "$VIASH_PAR_READQUALITYSCOREBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readQualityScoreBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_CLIP3PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PADAPTERMMP; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--clip3pAdapterMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PAFTERADAPTERNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pAfterAdapterNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP5PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP5PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip5pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITGENOMEGENERATERAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitGenomeGenerateRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_LIMITIOBUFFERSIZE; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitIObufferSize' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSAMoneReadBytes' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJONEREAD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJoneRead' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJCOLLAPSED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJcollapsed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITBAMSORTRAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITBAMSORTRAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitBAMsortRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ]]; then + if ! [[ "$VIASH_PAR_LIMITSJDBINSERTNSJ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitSjdbInsertNsj' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITNREADSSOFT" ]]; then + if ! [[ "$VIASH_PAR_LIMITNREADSSOFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitNreadsSoft' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ]]; then + if ! [[ "$VIASH_PAR_OUTQSCONVERSIONADD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outQSconversionAdd' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMATTRIHSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMattrIHstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMAPQUNIQUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmapqUnique' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGOR" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagOR' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGAND" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGAND" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagAND' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMTLEN" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMTLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMtlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGTHREADN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingThreadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGBINSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingBinsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ]]; then + if ! [[ "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--bamRemoveDuplicatesMate2basesN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverReadLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterScoreMinOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMatchNmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMatchNminOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTEROVERHANGMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountUniqueMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountTotalMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterDistToOtherSJmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterIntronMaxVsReadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_SCOREGAP" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPNONCAN" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPNONCAN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapNoncan' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPGCAG" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPGCAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapGCAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPATAC" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPATAC" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapATAC' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ]]; then + if ! [[ "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGenomicLengthLog2scale' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ]]; then + if ! [[ "$VIASH_PAR_SCORESTITCHSJSHIFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreStitchSJshift' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchStartLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--seedSearchStartLmaxOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ]]; then + if ! [[ "$VIASH_PAR_SEEDNONELOCIPERWINDOW" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedNoneLociPerWindow' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSPLITMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDSPLITMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSplitMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMAPMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDMAPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMapMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNMATESGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignMatesGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJstitchMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJDBoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSplicedMateMapLmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alignSplicedMateMapLminOverLmate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignWindowsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPNBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--peOverlapNbasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPMMP" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPMMP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--peOverlapMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINBINNBITS" ]]; then + if ! [[ "$VIASH_PAR_WINBINNBITS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winBinNbits' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORDISTNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORDISTNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorDistNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINFLANKNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINFLANKNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winFlankNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--winReadCoverageRelativeMin' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGEBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winReadCoverageBasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREDROPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreDropMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCORESEPARATION" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCORESEPARATION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreSeparation' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreJunctionNonGTAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimJunctionOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentReadGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMainSegmentMultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimNonchimScoreDropMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ]]; then + if ! [[ "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimOutJunctionFormat' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--quantTranscriptomeBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TWOPASS1READSN" ]]; then + if ! [[ "$VIASH_PAR_TWOPASS1READSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--twopass1readsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBSTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBLEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMISTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMISTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMILEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMILEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEREADLENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeReadLength' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEMATE" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEMATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeMate' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ]]; then + if ! [[ "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloAdapterMismatchesNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" ]]; then + if ! [[ "$VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--minimum_alignment_quality' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ADD_CHROMOSOME_INFO" ]]; then + if ! [[ "$VIASH_PAR_ADD_CHROMOSOME_INFO" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--add_chromosome_info' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" ]]; then + if ! [[ "$VIASH_PAR_COUNTS_OUTPUT_SPARSE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--counts_output_sparse' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_STRANDED" ]; then + VIASH_PAR_STRANDED_CHOICES=("yes;no;reverse") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_STRANDED_CHOICES[*]};" =~ ";$VIASH_PAR_STRANDED;" ]]; then + ViashError '--stranded' specified value of \'$VIASH_PAR_STRANDED\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_MODE" ]; then + VIASH_PAR_MODE_CHOICES=("union;intersection-strict;intersection-nonempty") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODE_CHOICES[*]};" =~ ";$VIASH_PAR_MODE;" ]]; then + ViashError '--mode' specified value of \'$VIASH_PAR_MODE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_NON_UNIQUE" ]; then + VIASH_PAR_NON_UNIQUE_CHOICES=("none;all;fraction;random") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_NON_UNIQUE_CHOICES[*]};" =~ ";$VIASH_PAR_NON_UNIQUE;" ]]; then + ViashError '--non_unique' specified value of \'$VIASH_PAR_NON_UNIQUE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SECONDARY_ALIGNMENTS" ]; then + VIASH_PAR_SECONDARY_ALIGNMENTS_CHOICES=("score;ignore") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SECONDARY_ALIGNMENTS_CHOICES[*]};" =~ ";$VIASH_PAR_SECONDARY_ALIGNMENTS;" ]]; then + ViashError '--secondary_alignments' specified value of \'$VIASH_PAR_SECONDARY_ALIGNMENTS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS" ]; then + VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS_CHOICES=("score;ignore") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS_CHOICES[*]};" =~ ";$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS;" ]]; then + ViashError '--supplementary_alignments' specified value of \'$VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_TEST_INPUT_R1=() + IFS=';' + for var in $VIASH_PAR_INPUT_R1; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT_R1+=( "$var" ) + done + VIASH_PAR_INPUT_R1=$(IFS=';' ; echo "${VIASH_TEST_INPUT_R1[*]}") +fi +if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_TEST_INPUT_R2=() + IFS=';' + for var in $VIASH_PAR_INPUT_R2; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT_R2+=( "$var" ) + done + VIASH_PAR_INPUT_R2=$(IFS=';' ; echo "${VIASH_TEST_INPUT_R2[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE_INDEX" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE_INDEX")" ) + VIASH_PAR_REFERENCE_INDEX=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE_INDEX") +fi +if [ ! -z "$VIASH_PAR_REFERENCE_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE_GTF")" ) + VIASH_PAR_REFERENCE_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE_GTF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES=() + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENOMEFASTAFILES+=( "$var" ) + done + VIASH_PAR_GENOMEFASTAFILES=$(IFS=';' ; echo "${VIASH_TEST_GENOMEFASTAFILES[*]}") +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SJDBGTFFILE")" ) + VIASH_PAR_SJDBGTFFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_SJDBGTFFILE") +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_READFILESMANIFEST")" ) + VIASH_PAR_READFILESMANIFEST=$(ViashDockerAutodetectMount "$VIASH_PAR_READFILESMANIFEST") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-multi_star-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from typing import Any, Dict, List, Tuple +import math +import tempfile +import subprocess +import tarfile +import gzip +import shutil +from pathlib import Path +import yaml +import pandas as pd +from multiprocess import Pool +import gtfparse +import polars as pl + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'input_r1': $( if [ ! -z ${VIASH_PAR_INPUT_R1+x} ]; then echo "r'${VIASH_PAR_INPUT_R1//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'input_r2': $( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "r'${VIASH_PAR_INPUT_R2//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference_index': $( if [ ! -z ${VIASH_PAR_REFERENCE_INDEX+x} ]; then echo "r'${VIASH_PAR_REFERENCE_INDEX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'reference_gtf': $( if [ ! -z ${VIASH_PAR_REFERENCE_GTF+x} ]; then echo "r'${VIASH_PAR_REFERENCE_GTF//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'run_htseq_count': $( if [ ! -z ${VIASH_PAR_RUN_HTSEQ_COUNT+x} ]; then echo "r'${VIASH_PAR_RUN_HTSEQ_COUNT//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'run_multiqc': $( if [ ! -z ${VIASH_PAR_RUN_MULTIQC+x} ]; then echo "r'${VIASH_PAR_RUN_MULTIQC//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'min_success_rate': $( if [ ! -z ${VIASH_PAR_MIN_SUCCESS_RATE+x} ]; then echo "float(r'${VIASH_PAR_MIN_SUCCESS_RATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'stranded': $( if [ ! -z ${VIASH_PAR_STRANDED+x} ]; then echo "r'${VIASH_PAR_STRANDED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'minimum_alignment_quality': $( if [ ! -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then echo "int(r'${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'type': $( if [ ! -z ${VIASH_PAR_TYPE+x} ]; then echo "r'${VIASH_PAR_TYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'id_attribute': $( if [ ! -z ${VIASH_PAR_ID_ATTRIBUTE+x} ]; then echo "r'${VIASH_PAR_ID_ATTRIBUTE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'additional_attributes': $( if [ ! -z ${VIASH_PAR_ADDITIONAL_ATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_ADDITIONAL_ATTRIBUTES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'add_chromosome_info': $( if [ ! -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then echo "r'${VIASH_PAR_ADD_CHROMOSOME_INFO//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'mode': $( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "r'${VIASH_PAR_MODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'non_unique': $( if [ ! -z ${VIASH_PAR_NON_UNIQUE+x} ]; then echo "r'${VIASH_PAR_NON_UNIQUE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'secondary_alignments': $( if [ ! -z ${VIASH_PAR_SECONDARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SECONDARY_ALIGNMENTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'supplementary_alignments': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'counts_output_sparse': $( if [ ! -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then echo "r'${VIASH_PAR_COUNTS_OUTPUT_SPARSE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +def fetch_arguments_info(config: Dict[str, Any]) -> Dict[str, Any]: + """Fetch arguments from config""" + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + return arguments + + +def process_par( + par: Dict[str, Any], + arguments_info: Dict[str, Any], + gz_args: List[str], + temp_dir: Path, +) -> Dict[str, Any]: + """ + Process the Viash par dictionary + + This turns file strings into Path objects and extracting gzipped files if need be. + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by \`fetch_arguments_info\` + gz_args: A list of argument keys which could be gzip files which need to be decompressed. + temp_dir: A temporary directory in which to ungzip files + """ + new_par = {} + for key, value in par.items(): + arg_info = arguments_info[key] + # turn file arguments into paths + if value and arg_info["type"] == "file": + is_multiple = isinstance(value, list) + + if is_multiple: + value = [Path(val) for val in value] + else: + value = Path(value) + + if key in gz_args: + print(f">> Checking compression of --{key}", flush=True) + # turn value into list if need be + if not is_multiple: + value = [value] + + # extract + value = [extract_if_need_be(path, temp_dir) for path in value] + + # unlist if need be + if not is_multiple: + value = value[0] + + new_par[key] = value + return new_par + + +def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False): + """ + Generate command-line arguments by fetching the relevant args + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by \`fetch_arguments_info\` + step_filter: If provided,\`par\` will be filtered to only contain arguments for which + argument.info.step == step_filter. + flatten: If \`False\`, the command for an argument with multiple values will be + \`["--key", "value1", "--key", "value2"]\`, otherwise \`["--key", "value1", "value2"]\`. + """ + cmd_args = [] + + for key, arg in arguments_info.items(): + arg_val = par.get(key) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + step = info.get("step") + if arg_val and orig_arg and (not step_filter or step == step_filter): + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + if flatten: + arg_val = [str(x) for x in [orig_arg] + arg_val] + else: + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +def is_gz_file(path: Path) -> bool: + """Check whether something is a gzip""" + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + """if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path""" + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def load_star_reference(reference_index: str) -> None: + """Load star reference index into memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "LoadAndExit", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def unload_star_reference(reference_index: str) -> None: + """Remove star reference index from memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "Remove", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def star_and_htseq( + group_id: str, + r1_files: List[Path], + r2_files: List[Path], + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> Tuple[int, str]: + star_output = par["output"] / "per" / group_id + temp_dir_group = temp_dir / f"star_tmp_{group_id}" + unsorted_bam = star_output / "Aligned.out.bam" + sorted_bam = star_output / "Aligned.sorted.out.bam" + counts_file = star_output / "htseq-count.txt" + multiqc_path = star_output / "multiqc_data" + + print(f">> Running STAR for group '{group_id}' with command:", flush=True) + star_output.mkdir(parents=True, exist_ok=True) + temp_dir_group.parent.mkdir(parents=True, exist_ok=True) + run_star( + r1_files=r1_files, + r2_files=r2_files, + output_dir=star_output, + temp_dir=temp_dir / f"star_tmp_{group_id}", + par=par, + arguments_info=arguments_info, + num_threads=num_threads, + ) + if not unsorted_bam.exists(): + return (1, f"Could not find unsorted bam at '{unsorted_bam}'") + + if par["run_htseq_count"]: + print( + f">> Running samtools sort for group '{group_id}' with command:", flush=True + ) + run_samtools_sort(unsorted_bam, sorted_bam) + if not sorted_bam.exists(): + return (1, f"Could not find sorted bam at '{unsorted_bam}'") + + print( + f">> Running htseq-count for group '{group_id}' with command:", flush=True + ) + run_htseq_count(sorted_bam, counts_file, par, arguments_info) + if not counts_file.exists(): + return (1, f"Could not find counts at '{counts_file}'") + + if par["run_multiqc"]: + run_multiqc(star_output) + if not multiqc_path.exists(): + return (1, f"Could not find MultiQC output at '{multiqc_path}'") + + return (0, "") + + +def run_star( + r1_files: List[Path], + r2_files: List[Path], + output_dir: Path, + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> None: + """Run star""" + # process manual arguments + r1_pasted = [",".join([str(r1) for r1 in r1_files])] + r2_pasted = [",".join([str(r2) for r2 in r2_files])] if r2_files else [] + manual_par = { + "--genomeDir": [par["reference_index"]], + "--genomeLoad": ["LoadAndRemove"], + "--runThreadN": [str(num_threads)], + "--runMode": ["alignReads"], + "--readFilesIn": r1_pasted + r2_pasted, + # create a tempdir per group + "--outTmpDir": [temp_dir], + # make sure there is a trailing / + "--outFileNamePrefix": [f"{output_dir}/"], + # fix the outSAMtype to return unsorted BAM files + "--outSAMtype": ["BAM", "Unsorted"], + } + manual_cmd = [str(x) for key, values in manual_par.items() for x in [key] + values] + + # process all passthrough star arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "star", flatten=True) + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["STAR"] + manual_cmd + par_cmd] + + # run star + subprocess.run(cmd_args, check=True) + + +def run_samtools_sort(unsorted_bam: Path, sorted_bam: Path) -> None: + "Run samtools sort" + cmd_args = [ + "samtools", + "sort", + "-o", + sorted_bam, + unsorted_bam, + ] + subprocess.run(cmd_args, check=True) + + +def run_htseq_count( + sorted_bam: Path, + counts_file: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], +) -> None: + """Run HTSeq count""" + # process manual arguments + manual_cmd = [sorted_bam, par["reference_gtf"]] + + # process all passthrough htseq arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "htseq") + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["htseq-count"] + manual_cmd + par_cmd] + + # run htseq + with open(counts_file, "w", encoding="utf-8") as file: + subprocess.run(cmd_args, check=True, stdout=file) + + +def get_feature_info(reference_gtf) -> pd.DataFrame: + ref = gtfparse.read_gtf(reference_gtf) + ref_genes = ref.filter((pl.col("feature") == "gene") | (pl.col("source") == "ERCC")) + return pd.DataFrame( + { + "feature_id": pd.Index(ref_genes.get_column("gene_id")), + "feature_type": "Gene Expression", + "feature_name": ref_genes.get_column("gene_name").to_pandas(), + } + ) + + +def run_multiqc(input_dir: Path) -> None: + cmd_args = [ + "multiqc", + str(input_dir), + "--outdir", + str(input_dir), + "--no-report", + "--force", + ] + + # run multiqc + subprocess.run(cmd_args, check=True) + + +######################## +### Main code ### +######################## + + +def main(par, meta): + """Main function""" + + # check input arguments + assert len(par["input_id"]) == len(par["input_r1"]), ( + "--input_r1 should have same length as --input_id" + ) + if par["input_r2"]: + assert len(par["input_id"]) == len(par["input_r2"]), ( + "--input_r2 should have same length as --input_id" + ) + + # read config arguments + with open(meta["config"], "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + # fetch all arguments from the config and turn it into a Dict[str, Argument] + arguments_info = fetch_arguments_info(config) + + # temp_dir = "tmp/" + with tempfile.TemporaryDirectory( + prefix=f"{meta['name']}-", dir=meta["temp_dir"], ignore_cleanup_errors=True + ) as temp_dir: + temp_dir = Path(temp_dir) + temp_dir.mkdir(parents=True, exist_ok=True) + + # turn file strings into Paths and decompress gzip if need be + gz_args = ["input_r1", "input_r2", "reference_index", "reference_gtf"] + par = process_par(par, arguments_info, gz_args, temp_dir) + + # make sure input_r2 has same length as input_r1 + if not par["input_r2"]: + par["input_r2"] = [None for _ in par["input_r1"]] + + # group input_files by input_id + print(">> Group by --input_id", flush=True) + grouped_inputs = {} + for group_id, file_r1, file_r2 in zip( + par["input_id"], par["input_r1"], par["input_r2"] + ): + if group_id not in grouped_inputs: + grouped_inputs[group_id] = ([], []) + grouped_inputs[group_id][0].append(file_r1) + if file_r2: + grouped_inputs[group_id][1].append(file_r2) + + # create output dir if need be + par["output"].mkdir(parents=True, exist_ok=True) + + # store features metadata + feature_info = get_feature_info(str(par["reference_gtf"])) + with open(par["output"] / "feature_info.tsv", "w", encoding="utf-8") as file: + feature_info.to_csv(file, sep="\\t", index=False) + + # try: + # print(">> Loading genome in memory", flush=True) + # load_star_reference(par["reference_index"]) + + cpus = meta.get("cpus", 1) + num_items = len(grouped_inputs) + pool_size = min(cpus, num_items) + num_threads_per_task = math.ceil(cpus / pool_size) + + with Pool(pool_size) as pool: + outs = pool.starmap( + lambda group_id, files: star_and_htseq( + group_id=group_id, + r1_files=files[0], + r2_files=files[1], + temp_dir=temp_dir, + par=par, + arguments_info=arguments_info, + num_threads=num_threads_per_task, + ), + grouped_inputs.items(), + ) + + num_errored = 0 + for exit, msg in outs: + if exit != 0: + print(f"Error: {msg}") + num_errored += 1 + + pct_succeeded = 1.0 - num_errored / len(outs) + print("------------------") + print(f"Success rate: {math.ceil(pct_succeeded * 100)}%") + + assert pct_succeeded >= par["min_success_rate"], ( + f"Success rate should be at least {math.ceil(par['min_success_rate'] * 100)}%" + ) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + unset VIASH_TEST_INPUT_R1 + IFS=';' + for var in $VIASH_PAR_INPUT_R1; do + unset IFS + if [ -z "$VIASH_TEST_INPUT_R1" ]; then + VIASH_TEST_INPUT_R1="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT_R1="$VIASH_TEST_INPUT_R1;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT_R1="$VIASH_TEST_INPUT_R1" + fi + if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + unset VIASH_TEST_INPUT_R2 + IFS=';' + for var in $VIASH_PAR_INPUT_R2; do + unset IFS + if [ -z "$VIASH_TEST_INPUT_R2" ]; then + VIASH_TEST_INPUT_R2="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT_R2="$VIASH_TEST_INPUT_R2;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT_R2="$VIASH_TEST_INPUT_R2" + fi + if [ ! -z "$VIASH_PAR_REFERENCE_INDEX" ]; then + VIASH_PAR_REFERENCE_INDEX=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE_INDEX") + fi + if [ ! -z "$VIASH_PAR_REFERENCE_GTF" ]; then + VIASH_PAR_REFERENCE_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE_GTF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + unset VIASH_TEST_GENOMEFASTAFILES + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ -z "$VIASH_TEST_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES" + fi + if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_PAR_SJDBGTFFILE=$(ViashDockerStripAutomount "$VIASH_PAR_SJDBGTFFILE") + fi + if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_PAR_READFILESMANIFEST=$(ViashDockerStripAutomount "$VIASH_PAR_READFILESMANIFEST") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/multi_star/nextflow_labels.config b/target/executable/mapping/multi_star/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/multi_star/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/multi_star_to_h5mu/.config.vsh.yaml b/target/executable/mapping/multi_star_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..c5d7207d --- /dev/null +++ b/target/executable/mapping/multi_star_to_h5mu/.config.vsh.yaml @@ -0,0 +1,253 @@ +name: "multi_star_to_h5mu" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "The directory created by `multi_star`" + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the output of `multi_star` to a h5mu.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "multi_star" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/multi_star_to_h5mu/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/multi_star_to_h5mu" + executable: "target/executable/mapping/multi_star_to_h5mu/multi_star_to_h5mu" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/multi_star_to_h5mu/multi_star_to_h5mu b/target/executable/mapping/multi_star_to_h5mu/multi_star_to_h5mu new file mode 100755 index 00000000..a389f040 --- /dev/null +++ b/target/executable/mapping/multi_star_to_h5mu/multi_star_to_h5mu @@ -0,0 +1,1226 @@ +#!/usr/bin/env bash + +# multi_star_to_h5mu main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Angela Oliveira Pisco (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="multi_star_to_h5mu" +VIASH_META_FUNCTIONALITY_NAME="multi_star_to_h5mu" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Angela Oliveira Pisco" +LABEL org.opencontainers.image.description="Companion container for running component mapping multi_star_to_h5mu" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "multi_star_to_h5mu main" + echo "" + echo "Convert the output of \`multi_star\` to a h5mu." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/foo" + echo " The directory created by \`multi_star\`" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "multi_star_to_h5mu main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/multi_star_to_h5mu:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-multi_star_to_h5mu-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from pathlib import Path +import pandas as pd +import mudata as md +import anndata as ad +import numpy as np +import json + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# convert to path +input_dir = Path(par["input"]) + +# read counts information +print("> Read counts data", flush=True) +per_obs_data = [] + +for input_counts in (input_dir / "per").glob("**/htseq-count.txt"): + per_obs_dir = input_counts.parent + input_id = per_obs_dir.name + input_multiqc = per_obs_dir / "multiqc_data" / "multiqc_data.json" + + data = pd.read_table( + input_counts, + index_col=0, + names=["cell_id", input_id], + dtype={"cell_id": "U", input_id: "i"}, + ) + data2 = data[~data.index.str.startswith("__")] + + with open(input_multiqc, "r") as file: + qc = json.load(file) + + qc_star = qc.get("report_saved_raw_data", {}).get("multiqc_star", {}).get(input_id) + qc_htseq = ( + qc.get("report_saved_raw_data", {}).get("multiqc_htseq", {}).get("htseq-count") + ) + + per_obs_data.append( + { + "counts": data2.transpose(), + "qc_star": pd.DataFrame(qc_star, index=[input_id]), + "qc_htseq": pd.DataFrame(qc_htseq, index=[input_id]), + } + ) + + +# combine all counts +counts = pd.concat([x["counts"] for x in per_obs_data], axis=0) +qc_star = pd.concat([x["qc_star"] for x in per_obs_data], axis=0) +qc_htseq = pd.concat([x["qc_htseq"] for x in per_obs_data], axis=0) + +# read feature info +feature_info = pd.read_csv(input_dir / "feature_info.tsv", sep="\\t", index_col=0) +feature_info_ord = feature_info.loc[counts.columns] + +var = pd.DataFrame( + data={ + "gene_ids": feature_info_ord.index, + "feature_types": "Gene Expression", + "gene_name": feature_info_ord["feature_name"], + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData( + X=counts, obsm={"qc_star": qc_star, "qc_htseq": qc_htseq}, var=var, dtype=np.int32 +) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/multi_star_to_h5mu/nextflow_labels.config b/target/executable/mapping/multi_star_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/multi_star_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/samtools_sort/.config.vsh.yaml b/target/executable/mapping/samtools_sort/.config.vsh.yaml new file mode 100644 index 00000000..75a48c54 --- /dev/null +++ b/target/executable/mapping/samtools_sort/.config.vsh.yaml @@ -0,0 +1,331 @@ +name: "samtools_sort" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Path to the SAM/BAM/CRAM files containing the mapped reads." + info: + orig_arg: "in_sam" + example: + - "input.bam" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output_bam" + description: "Filename to output the counts to." + info: + orig_arg: "-o" + example: + - "output.bam" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_bai" + description: "BAI-format index for BAM file." + info: null + example: + - "output.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_format" + description: "The output format. By default, samtools tries to select a format\ + \ based on the -o filename extension; if output is to standard output or no\ + \ format can be deduced, bam is selected." + info: + orig_arg: "-O" + example: + - "bam" + required: false + choices: + - "sam" + - "bam" + - "cram" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--compression" + description: "Compression level, from 0 (uncompressed) to 9 (best" + info: + orig_arg: "-l" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean_true" + name: "--minimizer_cluster" + description: "Sort unmapped reads (those in chromosome \"*\") by their sequence\ + \ minimiser (Schleimer et al., 2003; Roberts et al., 2004), \nalso reverse complementing\ + \ as appropriate. This has the effect of collating some similar data together,\ + \ improving the \ncompressibility of the unmapped sequence. The minimiser kmer\ + \ size is adjusted using the -K option. Note data compressed \nin this manner\ + \ may need to be name collated prior to conversion back to fastq.\n\nMapped\ + \ sequences are sorted by chromosome and position. \n" + info: + orig_arg: "-M" + direction: "input" + - type: "integer" + name: "--minimizer_kmer" + description: "Sets the kmer size to be used in the -M option." + info: + orig_arg: "-K" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--sort_by_read_names" + description: "Sort by read names (i.e., the QNAME field) rather than by chromosomal\ + \ coordinates." + info: + orig_arg: "-n" + direction: "input" + - type: "string" + name: "--sort_by" + description: "Sort first by this value in the alignment tag, then by position\ + \ or name (if also using -n)." + info: + orig_arg: "-t" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_pg" + description: "Do not add a @PG line to the header of the output file." + info: + orig_arg: "--no-PG" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Sort and (optionally) index alignments.\n\nReads are sorted by leftmost\ + \ coordinates, or by read name when `--sort_by_read_names` is used.\n\nAn appropriate\ + \ `@HD-SO` sort order header tag will be added or an existing one updated if necessary.\n\ + \nNote that to generate an index file (by specifying `--output_bai`), the default\ + \ coordinate sort must be used.\nThus the `--sort_by_read_names` and `--sort_by\ + \ ` options are incompatible with `--output_bai`. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "samtools" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/samtools_sort/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/samtools_sort" + executable: "target/executable/mapping/samtools_sort/samtools_sort" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/samtools_sort/nextflow_labels.config b/target/executable/mapping/samtools_sort/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/samtools_sort/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/samtools_sort/samtools_sort b/target/executable/mapping/samtools_sort/samtools_sort new file mode 100755 index 00000000..4ec28846 --- /dev/null +++ b/target/executable/mapping/samtools_sort/samtools_sort @@ -0,0 +1,1392 @@ +#!/usr/bin/env bash + +# samtools_sort main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Angela Oliveira Pisco (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="samtools_sort" +VIASH_META_FUNCTIONALITY_NAME="samtools_sort" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y samtools procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pyyaml" + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Angela Oliveira Pisco" +LABEL org.opencontainers.image.description="Companion container for running component mapping samtools_sort" +LABEL org.opencontainers.image.created="2025-08-22T07:23:21Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "samtools_sort main" + echo "" + echo "Sort and (optionally) index alignments." + echo "" + echo "Reads are sorted by leftmost coordinates, or by read name when" + echo "\`--sort_by_read_names\` is used." + echo "" + echo "An appropriate \`@HD-SO\` sort order header tag will be added or an existing one" + echo "updated if necessary." + echo "" + echo "Note that to generate an index file (by specifying \`--output_bai\`), the default" + echo "coordinate sort must be used." + echo "Thus the \`--sort_by_read_names\` and \`--sort_by \` options are incompatible" + echo "with \`--output_bai\`." + echo "" + echo "Input:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.bam" + echo " Path to the SAM/BAM/CRAM files containing the mapped reads." + echo "" + echo "Output:" + echo " --output_bam" + echo " type: file, required parameter, output, file must exist" + echo " example: output.bam" + echo " Filename to output the counts to." + echo "" + echo " --output_bai" + echo " type: file, output, file must exist" + echo " example: output.bam.bai" + echo " BAI-format index for BAM file." + echo "" + echo " --output_format" + echo " type: string" + echo " example: bam" + echo " choices: [ sam, bam, cram ]" + echo " The output format. By default, samtools tries to select a format based" + echo " on the -o filename extension; if output is to standard output or no" + echo " format can be deduced, bam is selected." + echo "" + echo " --compression" + echo " type: integer" + echo " example: 5" + echo " Compression level, from 0 (uncompressed) to 9 (best" + echo "" + echo "Arguments:" + echo " --minimizer_cluster" + echo " type: boolean_true" + echo " Sort unmapped reads (those in chromosome \"*\") by their sequence" + echo " minimiser (Schleimer et al., 2003; Roberts et al., 2004)," + echo " also reverse complementing as appropriate. This has the effect of" + echo " collating some similar data together, improving the" + echo " compressibility of the unmapped sequence. The minimiser kmer size is" + echo " adjusted using the -K option. Note data compressed" + echo " in this manner may need to be name collated prior to conversion back to" + echo " fastq." + echo " Mapped sequences are sorted by chromosome and position." + echo "" + echo " --minimizer_kmer" + echo " type: integer" + echo " example: 20" + echo " Sets the kmer size to be used in the -M option." + echo "" + echo " --sort_by_read_names" + echo " type: boolean_true" + echo " Sort by read names (i.e., the QNAME field) rather than by chromosomal" + echo " coordinates." + echo "" + echo " --sort_by" + echo " type: string" + echo " Sort first by this value in the alignment tag, then by position or name" + echo " (if also using -n)." + echo "" + echo " --no_pg" + echo " type: boolean_true" + echo " Do not add a @PG line to the header of the output file." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "samtools_sort main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_bam) + [ -n "$VIASH_PAR_OUTPUT_BAM" ] && ViashError Bad arguments for option \'--output_bam\': \'$VIASH_PAR_OUTPUT_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_BAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_bam. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_bam=*) + [ -n "$VIASH_PAR_OUTPUT_BAM" ] && ViashError Bad arguments for option \'--output_bam=*\': \'$VIASH_PAR_OUTPUT_BAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_BAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_bai) + [ -n "$VIASH_PAR_OUTPUT_BAI" ] && ViashError Bad arguments for option \'--output_bai\': \'$VIASH_PAR_OUTPUT_BAI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_BAI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_bai. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_bai=*) + [ -n "$VIASH_PAR_OUTPUT_BAI" ] && ViashError Bad arguments for option \'--output_bai=*\': \'$VIASH_PAR_OUTPUT_BAI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_BAI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_format) + [ -n "$VIASH_PAR_OUTPUT_FORMAT" ] && ViashError Bad arguments for option \'--output_format\': \'$VIASH_PAR_OUTPUT_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_format. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_format=*) + [ -n "$VIASH_PAR_OUTPUT_FORMAT" ] && ViashError Bad arguments for option \'--output_format=*\': \'$VIASH_PAR_OUTPUT_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --compression) + [ -n "$VIASH_PAR_COMPRESSION" ] && ViashError Bad arguments for option \'--compression\': \'$VIASH_PAR_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --compression=*) + [ -n "$VIASH_PAR_COMPRESSION" ] && ViashError Bad arguments for option \'--compression=*\': \'$VIASH_PAR_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --minimizer_cluster) + [ -n "$VIASH_PAR_MINIMIZER_CLUSTER" ] && ViashError Bad arguments for option \'--minimizer_cluster\': \'$VIASH_PAR_MINIMIZER_CLUSTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMIZER_CLUSTER=true + shift 1 + ;; + --minimizer_kmer) + [ -n "$VIASH_PAR_MINIMIZER_KMER" ] && ViashError Bad arguments for option \'--minimizer_kmer\': \'$VIASH_PAR_MINIMIZER_KMER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMIZER_KMER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --minimizer_kmer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --minimizer_kmer=*) + [ -n "$VIASH_PAR_MINIMIZER_KMER" ] && ViashError Bad arguments for option \'--minimizer_kmer=*\': \'$VIASH_PAR_MINIMIZER_KMER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MINIMIZER_KMER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sort_by_read_names) + [ -n "$VIASH_PAR_SORT_BY_READ_NAMES" ] && ViashError Bad arguments for option \'--sort_by_read_names\': \'$VIASH_PAR_SORT_BY_READ_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SORT_BY_READ_NAMES=true + shift 1 + ;; + --sort_by) + [ -n "$VIASH_PAR_SORT_BY" ] && ViashError Bad arguments for option \'--sort_by\': \'$VIASH_PAR_SORT_BY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SORT_BY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sort_by. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sort_by=*) + [ -n "$VIASH_PAR_SORT_BY" ] && ViashError Bad arguments for option \'--sort_by=*\': \'$VIASH_PAR_SORT_BY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SORT_BY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --no_pg) + [ -n "$VIASH_PAR_NO_PG" ] && ViashError Bad arguments for option \'--no_pg\': \'$VIASH_PAR_NO_PG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NO_PG=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/samtools_sort:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_BAM+x} ]; then + ViashError '--output_bam' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MINIMIZER_CLUSTER+x} ]; then + VIASH_PAR_MINIMIZER_CLUSTER="false" +fi +if [ -z ${VIASH_PAR_SORT_BY_READ_NAMES+x} ]; then + VIASH_PAR_SORT_BY_READ_NAMES="false" +fi +if [ -z ${VIASH_PAR_NO_PG+x} ]; then + VIASH_PAR_NO_PG="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_COMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_COMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--compression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MINIMIZER_CLUSTER" ]]; then + if ! [[ "$VIASH_PAR_MINIMIZER_CLUSTER" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--minimizer_cluster' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MINIMIZER_KMER" ]]; then + if ! [[ "$VIASH_PAR_MINIMIZER_KMER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--minimizer_kmer' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SORT_BY_READ_NAMES" ]]; then + if ! [[ "$VIASH_PAR_SORT_BY_READ_NAMES" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--sort_by_read_names' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NO_PG" ]]; then + if ! [[ "$VIASH_PAR_NO_PG" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--no_pg' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_FORMAT" ]; then + VIASH_PAR_OUTPUT_FORMAT_CHOICES=("sam;bam;cram") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_FORMAT_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_FORMAT;" ]]; then + ViashError '--output_format' specified value of \'$VIASH_PAR_OUTPUT_FORMAT\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_BAM" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_BAM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_BAM")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_BAI" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_BAI")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_BAI")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_BAM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_BAM")" ) + VIASH_PAR_OUTPUT_BAM=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_BAM") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_BAM" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_BAI" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_BAI")" ) + VIASH_PAR_OUTPUT_BAI=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_BAI") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_BAI" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-samtools_sort-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import tempfile +import subprocess +from pathlib import Path +import yaml + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_bam': $( if [ ! -z ${VIASH_PAR_OUTPUT_BAM+x} ]; then echo "r'${VIASH_PAR_OUTPUT_BAM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_bai': $( if [ ! -z ${VIASH_PAR_OUTPUT_BAI+x} ]; then echo "r'${VIASH_PAR_OUTPUT_BAI//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_format': $( if [ ! -z ${VIASH_PAR_OUTPUT_FORMAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FORMAT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'compression': $( if [ ! -z ${VIASH_PAR_COMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_COMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'minimizer_cluster': $( if [ ! -z ${VIASH_PAR_MINIMIZER_CLUSTER+x} ]; then echo "r'${VIASH_PAR_MINIMIZER_CLUSTER//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'minimizer_kmer': $( if [ ! -z ${VIASH_PAR_MINIMIZER_KMER+x} ]; then echo "int(r'${VIASH_PAR_MINIMIZER_KMER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sort_by_read_names': $( if [ ! -z ${VIASH_PAR_SORT_BY_READ_NAMES+x} ]; then echo "r'${VIASH_PAR_SORT_BY_READ_NAMES//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'sort_by': $( if [ ! -z ${VIASH_PAR_SORT_BY+x} ]; then echo "r'${VIASH_PAR_SORT_BY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'no_pg': $( if [ ! -z ${VIASH_PAR_NO_PG+x} ]; then echo "r'${VIASH_PAR_NO_PG//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +print(">> Constructing command", flush=True) +cmd_args = ["samtools", "sort"] + generate_args(par, config) + +# manually process cpus parameter +if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--threads", str(meta["cpus"])]) +# add memory +if "memory_mb" in meta and meta["memory_mb"]: + import math + + mem_per_thread = math.ceil(meta["memory_mb"] * 0.8 / meta["cpus"]) + cmd_args.extend(["-m", f"{mem_per_thread}M"]) + +with tempfile.TemporaryDirectory(prefix="samtools-", dir=meta["temp_dir"]) as temp_dir: + # add tempdir + cmd_args.extend(["-T", str(temp_dir + "/")]) + + # run command + print(">> Running samtools sort with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + subprocess.run(cmd_args, check=True) + +if par.get("output_bai"): + print(">> Running samtools index with command:", flush=True) + cmd_index_args = ["samtools", "index", "-b", par["output_bam"], par["output_bai"]] + print("+ " + " ".join([str(x) for x in cmd_index_args]), flush=True) + subprocess.run(cmd_index_args, check=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_BAM" ]; then + VIASH_PAR_OUTPUT_BAM=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_BAM") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_BAI" ]; then + VIASH_PAR_OUTPUT_BAI=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_BAI") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_BAM" ] && [ ! -e "$VIASH_PAR_OUTPUT_BAM" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_BAM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_BAI" ] && [ ! -e "$VIASH_PAR_OUTPUT_BAI" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_BAI' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/star_align/.config.vsh.yaml b/target/executable/mapping/star_align/.config.vsh.yaml new file mode 100644 index 00000000..bfe56bb1 --- /dev/null +++ b/target/executable/mapping/star_align/.config.vsh.yaml @@ -0,0 +1,2420 @@ +name: "star_align" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--input" + alternatives: + - "--readFilesIn" + description: "The FASTQ files to be analyzed. Corresponds to the --readFilesIn\ + \ argument in the STAR command." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir argument in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ argument in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: null + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "string" + name: "--genomeLoad" + description: "mode of shared memory usage for the genome files. Only used with\ + \ --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and\ + \ keep it in memory after run\n- LoadAndRemove ... load genome into shared\ + \ but remove it after run\n- LoadAndExit ... load genome into shared memory\ + \ and exit, keeping the genome in memory for future runs\n- Remove \ + \ ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory\ + \ ... do not use shared memory, each job will have its own private copy of\ + \ the genome" + info: null + example: + - "NoSharedMemory" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--genomeFileSizes" + description: "genome files exact sizes in bytes. Typically, this should not be\ + \ defined by the user." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeTransformOutput" + description: "which output to transform back to original genome\n\n- SAM ...\ + \ SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None \ + \ ... no transformation of the output" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeChrSetMitochondrial" + description: "names of the mitochondrial chromosomes. Presently only used for\ + \ STARsolo statistics output/" + info: null + example: + - "chrM" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: null + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: null + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: null + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: null + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: null + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: null + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: null + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: null + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: null + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: null + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: null + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: null + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: null + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: null + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: null + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: null + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: null + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMtype" + description: "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without\ + \ sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n\ + 2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate\ + \ ... sorted by coordinate. This option will allocate extra memory for sorting\ + \ which can be specified by --limitBAMsortRAM." + info: null + example: + - "SAM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: null + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: null + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: null + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: null + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: null + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: null + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: null + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: null + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: null + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: null + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: null + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: null + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: null + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: null + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: null + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: null + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: null + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: null + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: null + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: null + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: null + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: null + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: null + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: null + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: null + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: null + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: null + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: null + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: null + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: null + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: null + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: null + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: null + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: null + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: null + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: null + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: null + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/star_align/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/star_align" + executable: "target/executable/mapping/star_align/star_align" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/star_align/nextflow_labels.config b/target/executable/mapping/star_align/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/star_align/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/star_align/setup_logger.py b/target/executable/mapping/star_align/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/mapping/star_align/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/mapping/star_align/star_align b/target/executable/mapping/star_align/star_align new file mode 100755 index 00000000..2952ef3a --- /dev/null +++ b/target/executable/mapping/star_align/star_align @@ -0,0 +1,5938 @@ +#!/usr/bin/env bash + +# star_align main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="star_align" +VIASH_META_FUNCTIONALITY_NAME="star_align" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +ENV STAR_VERSION 2.7.10b +ENV PACKAGES gcc g++ make wget zlib1g-dev unzip +RUN apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component mapping star_align" +LABEL org.opencontainers.image.created="2025-08-22T07:23:20Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "star_align main" + echo "" + echo "Align fastq files using STAR." + echo "" + echo "Input/Output:" + echo " --readFilesIn, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed. Corresponds to the --readFilesIn" + echo " argument in the STAR command." + echo "" + echo " --genomeDir, --reference" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/reference" + echo " Path to the reference built by star_build_reference. Corresponds to the" + echo " --genomeDir argument in the STAR command." + echo "" + echo " --outFileNamePrefix, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/foo" + echo " Path to output directory. Corresponds to the --outFileNamePrefix" + echo " argument in the STAR command." + echo "" + echo "Run Parameters:" + echo " --runRNGseed" + echo " type: integer" + echo " example: 777" + echo " random number generator seed." + echo "" + echo "Genome Parameters:" + echo " --genomeLoad" + echo " type: string" + echo " example: NoSharedMemory" + echo " mode of shared memory usage for the genome files. Only used with" + echo " --runMode alignReads." + echo " - LoadAndKeep ... load genome into shared and keep it in memory" + echo " after run" + echo " - LoadAndRemove ... load genome into shared but remove it after run" + echo " - LoadAndExit ... load genome into shared memory and exit, keeping" + echo " the genome in memory for future runs" + echo " - Remove ... do not map anything, just remove loaded genome" + echo " from memory" + echo " - NoSharedMemory ... do not use shared memory, each job will have its" + echo " own private copy of the genome" + echo "" + echo " --genomeFastaFiles" + echo " type: file, multiple values allowed, file must exist" + echo " path(s) to the fasta files with the genome sequences, separated by" + echo " spaces. These files should be plain text FASTA files, they *cannot* be" + echo " zipped." + echo " Required for the genome generation (--runMode genomeGenerate). Can also" + echo " be used in the mapping (--runMode alignReads) to add extra (new)" + echo " sequences to the genome (e.g. spike-ins)." + echo "" + echo " --genomeFileSizes" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " genome files exact sizes in bytes. Typically, this should not be defined" + echo " by the user." + echo "" + echo " --genomeTransformOutput" + echo " type: string, multiple values allowed" + echo " which output to transform back to original genome" + echo " - SAM ... SAM/BAM alignments" + echo " - SJ ... splice junctions (SJ.out.tab)" + echo " - None ... no transformation of the output" + echo "" + echo " --genomeChrSetMitochondrial" + echo " type: string, multiple values allowed" + echo " example: chrM;M;MT" + echo " names of the mitochondrial chromosomes. Presently only used for STARsolo" + echo " statistics output/" + echo "" + echo "Splice Junctions Database:" + echo " --sjdbFileChrStartEnd" + echo " type: string, multiple values allowed" + echo " path to the files with genomic coordinates (chr start end" + echo " strand) for the splice junction introns. Multiple files can be" + echo " supplied and will be concatenated." + echo "" + echo " --sjdbGTFfile" + echo " type: file, file must exist" + echo " path to the GTF file with annotations" + echo "" + echo " --sjdbGTFchrPrefix" + echo " type: string" + echo " prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL" + echo " annotations with UCSC genomes)" + echo "" + echo " --sjdbGTFfeatureExon" + echo " type: string" + echo " example: exon" + echo " feature type in GTF file to be used as exons for building transcripts" + echo "" + echo " --sjdbGTFtagExonParentTranscript" + echo " type: string" + echo " example: transcript_id" + echo " GTF attribute name for parent transcript ID (default \"transcript_id\"" + echo " works for GTF files)" + echo "" + echo " --sjdbGTFtagExonParentGene" + echo " type: string" + echo " example: gene_id" + echo " GTF attribute name for parent gene ID (default \"gene_id\" works for GTF" + echo " files)" + echo "" + echo " --sjdbGTFtagExonParentGeneName" + echo " type: string, multiple values allowed" + echo " example: gene_name" + echo " GTF attribute name for parent gene name" + echo "" + echo " --sjdbGTFtagExonParentGeneType" + echo " type: string, multiple values allowed" + echo " example: gene_type;gene_biotype" + echo " GTF attribute name for parent gene type" + echo "" + echo " --sjdbOverhang" + echo " type: integer" + echo " example: 100" + echo " length of the donor/acceptor sequence on each side of the junctions," + echo " ideally = (mate_length - 1)" + echo "" + echo " --sjdbScore" + echo " type: integer" + echo " example: 2" + echo " extra alignment score for alignments that cross database junctions" + echo "" + echo " --sjdbInsertSave" + echo " type: string" + echo " example: Basic" + echo " which files to save when sjdb junctions are inserted on the fly at the" + echo " mapping step" + echo " - Basic ... only small junction / transcript files" + echo " - All ... all files including big Genome, SA and SAindex - this will" + echo " create a complete genome directory" + echo "" + echo "Variation parameters:" + echo " --varVCFfile" + echo " type: string" + echo " path to the VCF file that contains variation data. The 10th column" + echo " should contain the genotype information, e.g. 0/1" + echo "" + echo "Read Parameters:" + echo " --readFilesType" + echo " type: string" + echo " example: Fastx" + echo " format of input read files" + echo " - Fastx ... FASTA or FASTQ" + echo " - SAM SE ... SAM or BAM single-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo " - SAM PE ... SAM or BAM paired-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo "" + echo " --readFilesSAMattrKeep" + echo " type: string, multiple values allowed" + echo " example: All" + echo " for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM," + echo " e.g.: --readFilesSAMtagsKeep RG PL" + echo " - All ... keep all tags" + echo " - None ... do not keep any tags" + echo "" + echo " --readFilesManifest" + echo " type: file, file must exist" + echo " path to the \"manifest\" file with the names of read files. The manifest" + echo " file should contain 3 tab-separated columns:" + echo " paired-end reads: read1_file_name \$tab\$ read2_file_name \$tab\$" + echo " read_group_line." + echo " single-end reads: read1_file_name \$tab\$ - \$tab\$" + echo " read_group_line." + echo " Spaces, but not tabs are allowed in file names." + echo " If read_group_line does not start with ID:, it can only contain one ID" + echo " field, and ID: will be added to it." + echo " If read_group_line starts with ID:, it can contain several fields" + echo " separated by \$tab\$, and all fields will be be copied verbatim into SAM" + echo " @RG header line." + echo "" + echo " --readFilesPrefix" + echo " type: string" + echo " prefix for the read files names, i.e. it will be added in front of the" + echo " strings in --readFilesIn" + echo "" + echo " --readFilesCommand" + echo " type: string, multiple values allowed" + echo " command line to execute for each of the input file. This command should" + echo " generate FASTA or FASTQ text and send it to stdout" + echo " For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2" + echo " files, etc." + echo "" + echo " --readMapNumber" + echo " type: integer" + echo " example: -1" + echo " number of reads to map from the beginning of the file" + echo " -1: map all reads" + echo "" + echo " --readMatesLengthsIn" + echo " type: string" + echo " example: NotEqual" + echo " Equal/NotEqual - lengths of names,sequences,qualities for both mates are" + echo " the same / not the same. NotEqual is safe in all situations." + echo "" + echo " --readNameSeparator" + echo " type: string, multiple values allowed" + echo " example: /" + echo " character(s) separating the part of the read names that will be trimmed" + echo " in output (read name after space is always trimmed)" + echo "" + echo " --readQualityScoreBase" + echo " type: integer" + echo " example: 33" + echo " number to be subtracted from the ASCII code to get Phred quality score" + echo "" + echo "Read Clipping:" + echo " --clipAdapterType" + echo " type: string" + echo " example: Hamming" + echo " adapter clipping type" + echo " - Hamming ... adapter clipping based on Hamming distance, with the" + echo " number of mismatches controlled by --clip5pAdapterMMp" + echo " - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4." + echo " Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal" + echo " - None ... no adapter clipping, all other clip* parameters are" + echo " disregarded" + echo "" + echo " --clip3pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo " --clip3pAdapterSeq" + echo " type: string, multiple values allowed" + echo " adapter sequences to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo " - polyA ... polyA sequence with the length equal to read length" + echo "" + echo " --clip3pAdapterMMp" + echo " type: double, multiple values allowed" + echo " example: 0.1" + echo " max proportion of mismatches for 3p adapter clipping for each mate. If" + echo " one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip3pAfterAdapterNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number of bases to clip from 3p of each mate after the adapter clipping." + echo " If one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip5pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 5p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo "Limits:" + echo " --limitGenomeGenerateRAM" + echo " type: long" + echo " example: 31000000000" + echo " maximum available RAM (bytes) for genome generation" + echo "" + echo " --limitIObufferSize" + echo " type: long, multiple values allowed" + echo " example: 30000000;50000000" + echo " max available buffers size (bytes) for input/output, per thread" + echo "" + echo " --limitOutSAMoneReadBytes" + echo " type: long" + echo " example: 100000" + echo " max size of the SAM record (bytes) for one read. Recommended value:" + echo " >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + echo "" + echo " --limitOutSJoneRead" + echo " type: integer" + echo " example: 1000" + echo " max number of junctions for one read (including all multi-mappers)" + echo "" + echo " --limitOutSJcollapsed" + echo " type: integer" + echo " example: 1000000" + echo " max number of collapsed junctions" + echo "" + echo " --limitBAMsortRAM" + echo " type: long" + echo " example: 0" + echo " maximum available RAM (bytes) for sorting BAM. If =0, it will be set to" + echo " the genome index size. 0 value can only be used with --genomeLoad" + echo " NoSharedMemory option." + echo "" + echo " --limitSjdbInsertNsj" + echo " type: integer" + echo " example: 1000000" + echo " maximum number of junctions to be inserted to the genome on the fly at" + echo " the mapping stage, including those from annotations and those detected" + echo " in the 1st step of the 2-pass run" + echo "" + echo " --limitNreadsSoft" + echo " type: integer" + echo " example: -1" + echo " soft limit on the number of reads" + echo "" + echo "Output: general:" + echo " --outTmpKeep" + echo " type: string" + echo " whether to keep the temporary files after STAR runs is finished" + echo " - None ... remove all temporary files" + echo " - All ... keep all files" + echo "" + echo " --outStd" + echo " type: string" + echo " example: Log" + echo " which output will be directed to stdout (standard out)" + echo " - Log ... log messages" + echo " - SAM ... alignments in SAM format (which normally" + echo " are output to Aligned.out.sam file), normal standard output will go into" + echo " Log.std.out" + echo " - BAM_Unsorted ... alignments in BAM format, unsorted." + echo " Requires --outSAMtype BAM Unsorted" + echo " - BAM_SortedByCoordinate ... alignments in BAM format, sorted by" + echo " coordinate. Requires --outSAMtype BAM SortedByCoordinate" + echo " - BAM_Quant ... alignments to transcriptome in BAM format," + echo " unsorted. Requires --quantMode TranscriptomeSAM" + echo "" + echo " --outReadsUnmapped" + echo " type: string" + echo " output of unmapped and partially mapped (i.e. mapped only one mate of a" + echo " paired end read) reads in separate file(s)." + echo " - None ... no output" + echo " - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + echo "" + echo " --outQSconversionAdd" + echo " type: integer" + echo " example: 0" + echo " add this number to the quality score (e.g. to convert from Illumina to" + echo " Sanger, use -31)" + echo "" + echo " --outMultimapperOrder" + echo " type: string" + echo " example: Old_2.4" + echo " order of multimapping alignments in the output files" + echo " - Old_2.4 ... quasi-random order used before 2.5.0" + echo " - Random ... random order of alignments for each" + echo " multi-mapper. Read mates (pairs) are always adjacent, all alignment for" + echo " each read stay together. This option will become default in the future" + echo " releases." + echo "" + echo "Output: SAM and BAM:" + echo " --outSAMtype" + echo " type: string, multiple values allowed" + echo " example: SAM" + echo " type of SAM/BAM output" + echo " 1st word:" + echo " - BAM ... output BAM without sorting" + echo " - SAM ... output SAM without sorting" + echo " - None ... no SAM/BAM output" + echo " 2nd, 3rd:" + echo " - Unsorted ... standard unsorted" + echo " - SortedByCoordinate ... sorted by coordinate. This option will allocate" + echo " extra memory for sorting which can be specified by --limitBAMsortRAM." + echo "" + echo " --outSAMmode" + echo " type: string" + echo " example: Full" + echo " mode of SAM output" + echo " - None ... no SAM output" + echo " - Full ... full SAM output" + echo " - NoQS ... full SAM but without quality scores" + echo "" + echo " --outSAMstrandField" + echo " type: string" + echo " Cufflinks-like strand field flag" + echo " - None ... not used" + echo " - intronMotif ... strand derived from the intron motif. This option" + echo " changes the output alignments: reads with inconsistent and/or" + echo " non-canonical introns are filtered out." + echo "" + echo " --outSAMattributes" + echo " type: string, multiple values allowed" + echo " example: Standard" + echo " a string of desired SAM attributes, in the order desired for the output" + echo " SAM. Tags can be listed in any combination/order." + echo " ***Presets:" + echo " - None ... no attributes" + echo " - Standard ... NH HI AS nM" + echo " - All ... NH HI AS nM NM MD jM jI MC ch" + echo " ***Alignment:" + echo " - NH ... number of loci the reads maps to: =1 for unique" + echo " mappers, >1 for multimappers. Standard SAM tag." + echo " - HI ... multiple alignment index, starts with" + echo " --outSAMattrIHstart (=1 by default). Standard SAM tag." + echo " - AS ... local alignment score, +1/-1 for matches/mismateches," + echo " score* penalties for indels and gaps. For PE reads, total score for two" + echo " mates. Stadnard SAM tag." + echo " - nM ... number of mismatches. For PE reads, sum over two" + echo " mates." + echo " - NM ... edit distance to the reference (number of mismatched +" + echo " inserted + deleted bases) for each mate. Standard SAM tag." + echo " - MD ... string encoding mismatched and deleted reference bases" + echo " (see standard SAM specifications). Standard SAM tag." + echo " - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0:" + echo " non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6:" + echo " GT/AT. If splice junctions database is used, and a junction is" + echo " annotated, 20 is added to its motif value." + echo " - jI ... start and end of introns for all junctions (1-based)." + echo " - XS ... alignment strand according to --outSAMstrandField." + echo " - MC ... mate's CIGAR string. Standard SAM tag." + echo " - ch ... marks all segment of all chimeric alingments for" + echo " --chimOutType WithinBAM output." + echo " - cN ... number of bases clipped from the read ends: 5' and 3'" + echo " ***Variation:" + echo " - vA ... variant allele" + echo " - vG ... genomic coordinate of the variant overlapped by the" + echo " read." + echo " - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 -" + echo " alignment does not pass WASP filtering. Requires --waspOutputMode" + echo " SAMtag." + echo " ***STARsolo:" + echo " - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs" + echo " for the solo* demultiplexing." + echo " - GX GN ... gene ID and gene name for unique-gene reads." + echo " - gx gn ... gene IDs and gene names for unique- and multi-gene" + echo " reads." + echo " - CB UB ... error-corrected cell barcodes and UMIs for solo*" + echo " demultiplexing. Requires --outSAMtype BAM SortedByCoordinate." + echo " - sM ... assessment of CB and UMI." + echo " - sS ... sequence of the entire barcode (CB,UMI,adapter)." + echo " - sQ ... quality of the entire barcode." + echo " ***Unsupported/undocumented:" + echo " - ha ... haplotype (1/2) when mapping to the diploid genome." + echo " Requires genome generated with --genomeTransformType Diploid ." + echo " - rB ... alignment block read/genomic coordinates." + echo " - vR ... read coordinate of the variant." + echo "" + echo " --outSAMattrIHstart" + echo " type: integer" + echo " example: 1" + echo " start value for the IH attribute. 0 may be required by some downstream" + echo " software, such as Cufflinks or StringTie." + echo "" + echo " --outSAMunmapped" + echo " type: string, multiple values allowed" + echo " output of unmapped reads in the SAM format" + echo " 1st word:" + echo " - None ... no output" + echo " - Within ... output unmapped reads within the main SAM file (i.e." + echo " Aligned.out.sam)" + echo " 2nd word:" + echo " - KeepPairs ... record unmapped mate for each alignment, and, in case of" + echo " unsorted output, keep it adjacent to its mapped mate. Only affects" + echo " multi-mapping reads." + echo "" + echo " --outSAMorder" + echo " type: string" + echo " example: Paired" + echo " type of sorting for the SAM output" + echo " Paired: one mate after the other for all paired alignments" + echo " PairedKeepInputOrder: one mate after the other for all paired" + echo " alignments, the order is kept the same as in the input FASTQ files" + echo "" + echo " --outSAMprimaryFlag" + echo " type: string" + echo " example: OneBestScore" + echo " which alignments are considered primary - all others will be marked with" + echo " 0x100 bit in the FLAG" + echo " - OneBestScore ... only one alignment with the best score is primary" + echo " - AllBestScore ... all alignments with the best score are primary" + echo "" + echo " --outSAMreadID" + echo " type: string" + echo " example: Standard" + echo " read ID record type" + echo " - Standard ... first word (until space) from the FASTx read ID line," + echo " removing /1,/2 from the end" + echo " - Number ... read number (index) in the FASTx file" + echo "" + echo " --outSAMmapqUnique" + echo " type: integer" + echo " example: 255" + echo " 0 to 255: the MAPQ value for unique mappers" + echo "" + echo " --outSAMflagOR" + echo " type: integer" + echo " example: 0" + echo " 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e." + echo " FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, and after outSAMflagAND. Can be used to set specific bits that" + echo " are not set otherwise." + echo "" + echo " --outSAMflagAND" + echo " type: integer" + echo " example: 65535" + echo " 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e." + echo " FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, but before outSAMflagOR. Can be used to unset specific bits" + echo " that are not set otherwise." + echo "" + echo " --outSAMattrRGline" + echo " type: string, multiple values allowed" + echo " SAM/BAM read group line. The first word contains the read group" + echo " identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx" + echo " CN:yy \"DS:z z z\"." + echo " xxx will be added as RG tag to each output alignment. Any spaces in the" + echo " tag values have to be double quoted." + echo " Comma separated RG lines correspons to different (comma separated) input" + echo " files in --readFilesIn. Commas have to be surrounded by spaces, e.g." + echo " --outSAMattrRGline ID:xxx , ID:zzz \"DS:z z\" , ID:yyy DS:yyyy" + echo "" + echo " --outSAMheaderHD" + echo " type: string, multiple values allowed" + echo " @HD (header) line of the SAM header" + echo "" + echo " --outSAMheaderPG" + echo " type: string, multiple values allowed" + echo " extra @PG (software) line of the SAM header (in addition to STAR)" + echo "" + echo " --outSAMheaderCommentFile" + echo " type: string" + echo " path to the file with @CO (comment) lines of the SAM header" + echo "" + echo " --outSAMfilter" + echo " type: string, multiple values allowed" + echo " filter the output into main SAM/BAM files" + echo " - KeepOnlyAddedReferences ... only keep the reads for which all" + echo " alignments are to the extra reference sequences added with" + echo " --genomeFastaFiles at the mapping stage." + echo " - KeepAllAddedReferences ... keep all alignments to the extra reference" + echo " sequences added with --genomeFastaFiles at the mapping stage." + echo "" + echo " --outSAMmultNmax" + echo " type: integer" + echo " example: -1" + echo " max number of multiple alignments for a read that will be output to the" + echo " SAM/BAM files. Note that if this value is not equal to -1, the top" + echo " scoring alignment will be output first" + echo " - -1 ... all alignments (up to --outFilterMultimapNmax) will be output" + echo "" + echo " --outSAMtlen" + echo " type: integer" + echo " example: 1" + echo " calculation method for the TLEN field in the SAM/BAM files" + echo " - 1 ... leftmost base of the (+)strand mate to rightmost base of the" + echo " (-)mate. (+)sign for the (+)strand mate" + echo " - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign" + echo " for the mate with the leftmost base. This is different from 1 for" + echo " overlapping mates with protruding ends" + echo "" + echo " --outBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -1 to 10 BAM compression level, -1=default compression (6?), 0=no" + echo " compression, 10=maximum compression" + echo "" + echo " --outBAMsortingThreadN" + echo " type: integer" + echo " example: 0" + echo " >=0: number of threads for BAM sorting. 0 will default to" + echo " min(6,--runThreadN)." + echo "" + echo " --outBAMsortingBinsN" + echo " type: integer" + echo " example: 50" + echo " >0: number of genome bins for coordinate-sorting" + echo "" + echo "BAM processing:" + echo " --bamRemoveDuplicatesType" + echo " type: string" + echo " mark duplicates in the BAM file, for now only works with (i) sorted BAM" + echo " fed with inputBAMfile, and (ii) for paired-end alignments only" + echo " - - ... no duplicate removal/marking" + echo " - UniqueIdentical ... mark all multimappers, and duplicate" + echo " unique mappers. The coordinates, FLAG, CIGAR must be identical" + echo " - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not" + echo " multimappers." + echo "" + echo " --bamRemoveDuplicatesMate2basesN" + echo " type: integer" + echo " example: 0" + echo " number of bases from the 5' of mate 2 to use in collapsing (e.g. for" + echo " RAMPAGE)" + echo "" + echo "Output Wiggle:" + echo " --outWigType" + echo " type: string, multiple values allowed" + echo " type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\". Requires" + echo " sorted BAM: --outSAMtype BAM SortedByCoordinate ." + echo " 1st word:" + echo " - None ... no signal output" + echo " - bedGraph ... bedGraph format" + echo " - wiggle ... wiggle format" + echo " 2nd word:" + echo " - read1_5p ... signal from only 5' of the 1st read, useful for" + echo " CAGE/RAMPAGE etc" + echo " - read2 ... signal from only 2nd read" + echo "" + echo " --outWigStrand" + echo " type: string" + echo " example: Stranded" + echo " strandedness of wiggle/bedGraph output" + echo " - Stranded ... separate strands, str1 and str2" + echo " - Unstranded ... collapsed strands" + echo "" + echo " --outWigReferencesPrefix" + echo " type: string" + echo " prefix matching reference names to include in the output wiggle file," + echo " e.g. \"chr\", default \"-\" - include all references" + echo "" + echo " --outWigNorm" + echo " type: string" + echo " example: RPM" + echo " type of normalization for the signal" + echo " - RPM ... reads per million of mapped reads" + echo " - None ... no normalization, \"raw\" counts" + echo "" + echo "Output Filtering:" + echo " --outFilterType" + echo " type: string" + echo " example: Normal" + echo " type of filtering" + echo " - Normal ... standard filtering using only current alignment" + echo " - BySJout ... keep only those reads that contain junctions that passed" + echo " filtering into SJ.out.tab" + echo "" + echo " --outFilterMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range below the maximum score for multimapping alignments" + echo "" + echo " --outFilterMultimapNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of loci the read is allowed to map to. Alignments (all of" + echo " them) will be output only if the read maps to no more loci than this" + echo " value." + echo " Otherwise no alignments will be output, and the read will be counted as" + echo " \"mapped to too many loci\" in the Log.final.out ." + echo "" + echo " --outFilterMismatchNmax" + echo " type: integer" + echo " example: 10" + echo " alignment will be output only if it has no more mismatches than this" + echo " value." + echo "" + echo " --outFilterMismatchNoverLmax" + echo " type: double" + echo " example: 0.3" + echo " alignment will be output only if its ratio of mismatches to *mapped*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterMismatchNoverReadLmax" + echo " type: double" + echo " example: 1.0" + echo " alignment will be output only if its ratio of mismatches to *read*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterScoreMin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if its score is higher than or equal to" + echo " this value." + echo "" + echo " --outFilterScoreMinOverLread" + echo " type: double" + echo " example: 0.66" + echo " same as outFilterScoreMin, but normalized to read length (sum of mates'" + echo " lengths for paired-end reads)" + echo "" + echo " --outFilterMatchNmin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if the number of matched bases is higher" + echo " than or equal to this value." + echo "" + echo " --outFilterMatchNminOverLread" + echo " type: double" + echo " example: 0.66" + echo " sam as outFilterMatchNmin, but normalized to the read length (sum of" + echo " mates' lengths for paired-end reads)." + echo "" + echo " --outFilterIntronMotifs" + echo " type: string" + echo " filter alignment using their motifs" + echo " - None ... no filtering" + echo " - RemoveNoncanonical ... filter out alignments that contain" + echo " non-canonical junctions" + echo " - RemoveNoncanonicalUnannotated ... filter out alignments that contain" + echo " non-canonical unannotated junctions when using annotated splice" + echo " junctions database. The annotated non-canonical junctions will be kept." + echo "" + echo " --outFilterIntronStrands" + echo " type: string" + echo " example: RemoveInconsistentStrands" + echo " filter alignments" + echo " - RemoveInconsistentStrands ... remove alignments that have" + echo " junctions with inconsistent strands" + echo " - None ... no filtering" + echo "" + echo "Output splice junctions (SJ.out.tab):" + echo " --outSJtype" + echo " type: string" + echo " example: Standard" + echo " type of splice junction output" + echo " - Standard ... standard SJ.out.tab output" + echo " - None ... no splice junction output" + echo "" + echo "Output Filtering: Splice Junctions:" + echo " --outSJfilterReads" + echo " type: string" + echo " example: All" + echo " which reads to consider for collapsed splice junctions output" + echo " - All ... all reads, unique- and multi-mappers" + echo " - Unique ... uniquely mapping reads only" + echo "" + echo " --outSJfilterOverhangMin" + echo " type: integer, multiple values allowed" + echo " example: 30;12;12;12" + echo " minimum overhang length for splice junctions on both sides for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountUniqueMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum uniquely mapping read count per junction for: (1) non-canonical" + echo " motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC" + echo " and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountTotalMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum total (multi-mapping+unique) read count per junction for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterDistToOtherSJmin" + echo " type: integer, multiple values allowed" + echo " example: 10;0;5;10" + echo " minimum allowed distance to other junctions' donor/acceptor" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterIntronMaxVsReadN" + echo " type: integer, multiple values allowed" + echo " example: 50000;100000;200000" + echo " maximum gap allowed for junctions supported by 1,2,3,,,N reads" + echo " i.e. by default junctions supported by 1 read can have gaps <=50000b, by" + echo " 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap" + echo " <=alignIntronMax" + echo " does not apply to annotated junctions" + echo "" + echo "Scoring:" + echo " --scoreGap" + echo " type: integer" + echo " example: 0" + echo " splice junction penalty (independent on intron motif)" + echo "" + echo " --scoreGapNoncan" + echo " type: integer" + echo " example: -8" + echo " non-canonical junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapGCAG" + echo " type: integer" + echo " example: -4" + echo " GC/AG and CT/GC junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapATAC" + echo " type: integer" + echo " example: -8" + echo " AT/AC and GT/AT junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGenomicLengthLog2scale" + echo " type: integer" + echo " example: 0" + echo " extra score logarithmically scaled with genomic length of the alignment:" + echo " scoreGenomicLengthLog2scale*log2(genomicLength)" + echo "" + echo " --scoreDelOpen" + echo " type: integer" + echo " example: -2" + echo " deletion open penalty" + echo "" + echo " --scoreDelBase" + echo " type: integer" + echo " example: -2" + echo " deletion extension penalty per base (in addition to scoreDelOpen)" + echo "" + echo " --scoreInsOpen" + echo " type: integer" + echo " example: -2" + echo " insertion open penalty" + echo "" + echo " --scoreInsBase" + echo " type: integer" + echo " example: -2" + echo " insertion extension penalty per base (in addition to scoreInsOpen)" + echo "" + echo " --scoreStitchSJshift" + echo " type: integer" + echo " example: 1" + echo " maximum score reduction while searching for SJ boundaries in the" + echo " stitching step" + echo "" + echo "Alignments and Seeding:" + echo " --seedSearchStartLmax" + echo " type: integer" + echo " example: 50" + echo " defines the search start point through the read - the read is split into" + echo " pieces no longer than this value" + echo "" + echo " --seedSearchStartLmaxOverLread" + echo " type: double" + echo " example: 1.0" + echo " seedSearchStartLmax normalized to read length (sum of mates' lengths for" + echo " paired-end reads)" + echo "" + echo " --seedSearchLmax" + echo " type: integer" + echo " example: 0" + echo " defines the maximum length of the seeds, if =0 seed length is not" + echo " limited" + echo "" + echo " --seedMultimapNmax" + echo " type: integer" + echo " example: 10000" + echo " only pieces that map fewer than this value are utilized in the stitching" + echo " procedure" + echo "" + echo " --seedPerReadNmax" + echo " type: integer" + echo " example: 1000" + echo " max number of seeds per read" + echo "" + echo " --seedPerWindowNmax" + echo " type: integer" + echo " example: 50" + echo " max number of seeds per window" + echo "" + echo " --seedNoneLociPerWindow" + echo " type: integer" + echo " example: 10" + echo " max number of one seed loci per window" + echo "" + echo " --seedSplitMin" + echo " type: integer" + echo " example: 12" + echo " min length of the seed sequences split by Ns or mate gap" + echo "" + echo " --seedMapMin" + echo " type: integer" + echo " example: 5" + echo " min length of seeds to be mapped" + echo "" + echo " --alignIntronMin" + echo " type: integer" + echo " example: 21" + echo " minimum intron size, genomic gap is considered intron if its" + echo " length>=alignIntronMin, otherwise it is considered Deletion" + echo "" + echo " --alignIntronMax" + echo " type: integer" + echo " example: 0" + echo " maximum intron size, if 0, max intron size will be determined by" + echo " (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignMatesGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap between two mates, if 0, max intron gap will be determined" + echo " by (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignSJoverhangMin" + echo " type: integer" + echo " example: 5" + echo " minimum overhang (i.e. block size) for spliced alignments" + echo "" + echo " --alignSJstitchMismatchNmax" + echo " type: integer, multiple values allowed" + echo " example: 0;-1;0;0" + echo " maximum number of mismatches for stitching of the splice junctions (-1:" + echo " no limit)." + echo " (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif." + echo "" + echo " --alignSJDBoverhangMin" + echo " type: integer" + echo " example: 3" + echo " minimum overhang (i.e. block size) for annotated (sjdb) spliced" + echo " alignments" + echo "" + echo " --alignSplicedMateMapLmin" + echo " type: integer" + echo " example: 0" + echo " minimum mapped length for a read mate that is spliced" + echo "" + echo " --alignSplicedMateMapLminOverLmate" + echo " type: double" + echo " example: 0.66" + echo " alignSplicedMateMapLmin normalized to mate length" + echo "" + echo " --alignWindowsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of windows per read" + echo "" + echo " --alignTranscriptsPerWindowNmax" + echo " type: integer" + echo " example: 100" + echo " max number of transcripts per window" + echo "" + echo " --alignTranscriptsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of different alignments per read to consider" + echo "" + echo " --alignEndsType" + echo " type: string" + echo " example: Local" + echo " type of read ends alignment" + echo " - Local ... standard local alignment with soft-clipping" + echo " allowed" + echo " - EndToEnd ... force end-to-end read alignment, do not" + echo " soft-clip" + echo " - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other" + echo " ends: local alignment" + echo " - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and" + echo " read2, all other ends: local alignment" + echo "" + echo " --alignEndsProtrude" + echo " type: string" + echo " example: 0 ConcordantPair" + echo " allow protrusion of alignment ends, i.e. start (end) of the +strand mate" + echo " downstream of the start (end) of the -strand mate" + echo " 1st word: int: maximum number of protrusion bases allowed" + echo " 2nd word: string:" + echo " - ConcordantPair ... report alignments with non-zero" + echo " protrusion as concordant pairs" + echo " - DiscordantPair ... report alignments with non-zero" + echo " protrusion as discordant pairs" + echo "" + echo " --alignSoftClipAtReferenceEnds" + echo " type: string" + echo " example: Yes" + echo " allow the soft-clipping of the alignments past the end of the" + echo " chromosomes" + echo " - Yes ... allow" + echo " - No ... prohibit, useful for compatibility with Cufflinks" + echo "" + echo " --alignInsertionFlush" + echo " type: string" + echo " how to flush ambiguous insertion positions" + echo " - None ... insertions are not flushed" + echo " - Right ... insertions are flushed to the right" + echo "" + echo "Paired-End reads:" + echo " --peOverlapNbasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of overlapping bases to trigger mates merging and" + echo " realignment. Specify >0 value to switch on the \"merginf of overlapping" + echo " mates\" algorithm." + echo "" + echo " --peOverlapMMp" + echo " type: double" + echo " example: 0.01" + echo " maximum proportion of mismatched bases in the overlap area" + echo "" + echo "Windows, Anchors, Binning:" + echo " --winAnchorMultimapNmax" + echo " type: integer" + echo " example: 50" + echo " max number of loci anchors are allowed to map to" + echo "" + echo " --winBinNbits" + echo " type: integer" + echo " example: 16" + echo " =log2(winBin), where winBin is the size of the bin for the" + echo " windows/clustering, each window will occupy an integer number of bins." + echo "" + echo " --winAnchorDistNbins" + echo " type: integer" + echo " example: 9" + echo " max number of bins between two anchors that allows aggregation of" + echo " anchors into one window" + echo "" + echo " --winFlankNbins" + echo " type: integer" + echo " example: 4" + echo " log2(winFlank), where win Flank is the size of the left and right" + echo " flanking regions for each window" + echo "" + echo " --winReadCoverageRelativeMin" + echo " type: double" + echo " example: 0.5" + echo " minimum relative coverage of the read sequence by the seeds in a window," + echo " for STARlong algorithm only." + echo "" + echo " --winReadCoverageBasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of bases covered by the seeds in a window , for STARlong" + echo " algorithm only." + echo "" + echo "Chimeric Alignments:" + echo " --chimOutType" + echo " type: string, multiple values allowed" + echo " example: Junctions" + echo " type of chimeric output" + echo " - Junctions ... Chimeric.out.junction" + echo " - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file" + echo " - WithinBAM ... output into main aligned BAM files (Aligned.*.bam)" + echo " - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for" + echo " supplemental chimeric alignments (default if no 2nd word is present)" + echo " - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental" + echo " chimeric alignments" + echo "" + echo " --chimSegmentMin" + echo " type: integer" + echo " example: 0" + echo " minimum length of chimeric segment length, if ==0, no chimeric output" + echo "" + echo " --chimScoreMin" + echo " type: integer" + echo " example: 0" + echo " minimum total (summed) score of the chimeric segments" + echo "" + echo " --chimScoreDropMax" + echo " type: integer" + echo " example: 20" + echo " max drop (difference) of chimeric score (the sum of scores of all" + echo " chimeric segments) from the read length" + echo "" + echo " --chimScoreSeparation" + echo " type: integer" + echo " example: 10" + echo " minimum difference (separation) between the best chimeric score and the" + echo " next one" + echo "" + echo " --chimScoreJunctionNonGTAG" + echo " type: integer" + echo " example: -1" + echo " penalty for a non-GT/AG chimeric junction" + echo "" + echo " --chimJunctionOverhangMin" + echo " type: integer" + echo " example: 20" + echo " minimum overhang for a chimeric junction" + echo "" + echo " --chimSegmentReadGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap in the read sequence between chimeric segments" + echo "" + echo " --chimFilter" + echo " type: string, multiple values allowed" + echo " example: banGenomicN" + echo " different filters for chimeric alignments" + echo " - None ... no filtering" + echo " - banGenomicN ... Ns are not allowed in the genome sequence around the" + echo " chimeric junction" + echo "" + echo " --chimMainSegmentMultNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of multi-alignments for the main chimeric segment. =1" + echo " will prohibit multimapping main segments." + echo "" + echo " --chimMultimapNmax" + echo " type: integer" + echo " example: 0" + echo " maximum number of chimeric multi-alignments" + echo " - 0 ... use the old scheme for chimeric detection which only considered" + echo " unique alignments" + echo "" + echo " --chimMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range for multi-mapping chimeras below the best chimeric" + echo " score. Only works with --chimMultimapNmax > 1" + echo "" + echo " --chimNonchimScoreDropMin" + echo " type: integer" + echo " example: 20" + echo " to trigger chimeric detection, the drop in the best non-chimeric" + echo " alignment score with respect to the read length has to be greater than" + echo " this value" + echo "" + echo " --chimOutJunctionFormat" + echo " type: integer" + echo " example: 0" + echo " formatting type for the Chimeric.out.junction file" + echo " - 0 ... no comment lines/headers" + echo " - 1 ... comment lines at the end of the file: command line and Nreads:" + echo " total, unique/multi-mapping" + echo "" + echo "Quantification of Annotations:" + echo " --quantMode" + echo " type: string, multiple values allowed" + echo " types of quantification requested" + echo " - - ... none" + echo " - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a" + echo " separate file" + echo " - GeneCounts ... count reads per gene" + echo "" + echo " --quantTranscriptomeBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -2 to 10 transcriptome BAM compression level" + echo " - -2 ... no BAM output" + echo " - -1 ... default compression (6?)" + echo " - 0 ... no compression" + echo " - 10 ... maximum compression" + echo "" + echo " --quantTranscriptomeBan" + echo " type: string" + echo " example: IndelSoftclipSingleend" + echo " prohibit various alignment type" + echo " - IndelSoftclipSingleend ... prohibit indels, soft clipping and" + echo " single-end alignments - compatible with RSEM" + echo " - Singleend ... prohibit single-end alignments" + echo "" + echo "2-pass Mapping:" + echo " --twopassMode" + echo " type: string" + echo " 2-pass mapping mode." + echo " - None ... 1-pass mapping" + echo " - Basic ... basic 2-pass mapping, with all 1st pass junctions" + echo " inserted into the genome indices on the fly" + echo "" + echo " --twopass1readsN" + echo " type: integer" + echo " example: -1" + echo " number of reads to process for the 1st step. Use very large number (or" + echo " default -1) to map all reads in the first step." + echo "" + echo "WASP parameters:" + echo " --waspOutputMode" + echo " type: string" + echo " WASP allele-specific output type. This is re-implementation of the" + echo " original WASP mappability filtering by Bryce van de Geijn, Graham" + echo " McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original" + echo " WASP paper: Nature Methods 12, 1061-1063 (2015)," + echo " https://www.nature.com/articles/nmeth.3582 ." + echo " - SAMtag ... add WASP tags to the alignments that pass WASP" + echo " filtering" + echo "" + echo "STARsolo (single cell RNA-seq) parameters:" + echo " --soloType" + echo " type: string, multiple values allowed" + echo " type of single-cell RNA-seq" + echo " - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of" + echo " fixed length in read2, e.g. Drop-seq and 10X Chromium." + echo " - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI" + echo " of fixed length and one adapter sequence of fixed length are allowed in" + echo " read2 only (e.g. inDrop, ddSeq)." + echo " - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No" + echo " UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end]" + echo " CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or" + echo " SortedByCoordinate]" + echo " - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired-" + echo " or single-end), barcodes are corresponding read-groups, no UMI" + echo " sequences, alignments deduplicated according to alignment start and end" + echo " (after extending soft-clipped bases)" + echo "" + echo " --soloCBwhitelist" + echo " type: string, multiple values allowed" + echo " file(s) with whitelist(s) of cell barcodes. Only --soloType" + echo " CB_UMI_Complex allows more than one whitelist file." + echo " - None ... no whitelist: all cell barcodes are allowed" + echo "" + echo " --soloCBstart" + echo " type: integer" + echo " example: 1" + echo " cell barcode start base" + echo "" + echo " --soloCBlen" + echo " type: integer" + echo " example: 16" + echo " cell barcode length" + echo "" + echo " --soloUMIstart" + echo " type: integer" + echo " example: 17" + echo " UMI start base" + echo "" + echo " --soloUMIlen" + echo " type: integer" + echo " example: 10" + echo " UMI length" + echo "" + echo " --soloBarcodeReadLength" + echo " type: integer" + echo " example: 1" + echo " length of the barcode read" + echo " - 1 ... equal to sum of soloCBlen+soloUMIlen" + echo " - 0 ... not defined, do not check" + echo "" + echo " --soloBarcodeMate" + echo " type: integer" + echo " example: 0" + echo " identifies which read mate contains the barcode (CB+UMI) sequence" + echo " - 0 ... barcode sequence is on separate read, which should always be" + echo " the last file in the --readFilesIn listed" + echo " - 1 ... barcode sequence is a part of mate 1" + echo " - 2 ... barcode sequence is a part of mate 2" + echo "" + echo " --soloCBposition" + echo " type: string, multiple values allowed" + echo " position of Cell Barcode(s) on the barcode read." + echo " Presently only works with --soloType CB_UMI_Complex, and barcodes are" + echo " assumed to be on Read2." + echo " Format for each barcode: startAnchor_startPosition_endAnchor_endPosition" + echo " start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1:" + echo " read end; 2: adapter start; 3: adapter end" + echo " start(end)Position is the 0-based position with of the CB start(end)" + echo " with respect to the Anchor Base" + echo " String for different barcodes are separated by space." + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 0_0_2_-1 3_1_3_8" + echo "" + echo " --soloUMIposition" + echo " type: string" + echo " position of the UMI on the barcode read, same as soloCBposition" + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 3_9_3_14" + echo "" + echo " --soloAdapterSequence" + echo " type: string" + echo " adapter sequence to anchor barcodes. Only one adapter sequence is" + echo " allowed." + echo "" + echo " --soloAdapterMismatchesNmax" + echo " type: integer" + echo " example: 1" + echo " maximum number of mismatches allowed in adapter sequence." + echo "" + echo " --soloCBmatchWLtype" + echo " type: string" + echo " example: 1MM_multi" + echo " matching the Cell Barcodes to the WhiteList" + echo " - Exact ... only exact matches allowed" + echo " - 1MM ... only one match in whitelist with 1" + echo " mismatched base allowed. Allowed CBs have to have at least one read with" + echo " exact match." + echo " - 1MM_multi ... multiple matches in whitelist with" + echo " 1 mismatched base allowed, posterior probability calculation is used" + echo " choose one of the matches." + echo " Allowed CBs have to have at least one read with exact match. This option" + echo " matches best with CellRanger 2.2.0" + echo " - 1MM_multi_pseudocounts ... same as 1MM_Multi, but" + echo " pseudocounts of 1 are added to all whitelist barcodes." + echo " - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts," + echo " multimatching to WL is allowed for CBs with N-bases. This option matches" + echo " best with CellRanger >= 3.0.0" + echo " - EditDist_2 ... allow up to edit distance of 3 fpr" + echo " each of the barcodes. May include one deletion + one insertion. Only" + echo " works with --soloType CB_UMI_Complex. Matches to multiple passlist" + echo " barcdoes are not allowed. Similar to ParseBio Split-seq pipeline." + echo "" + echo " --soloInputSAMattrBarcodeSeq" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode sequence (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeSeq CR UR ." + echo " This parameter is required when running STARsolo with input from SAM." + echo "" + echo " --soloInputSAMattrBarcodeQual" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode qualities (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeQual CY UY ." + echo " If this parameter is '-' (default), the quality 'H' will be assigned to" + echo " all bases." + echo "" + echo " --soloStrand" + echo " type: string" + echo " example: Forward" + echo " strandedness of the solo libraries:" + echo " - Unstranded ... no strand information" + echo " - Forward ... read strand same as the original RNA molecule" + echo " - Reverse ... read strand opposite to the original RNA molecule" + echo "" + echo " --soloFeatures" + echo " type: string, multiple values allowed" + echo " example: Gene" + echo " genomic features for which the UMI counts per Cell Barcode are collected" + echo " - Gene ... genes: reads match the gene transcript" + echo " - SJ ... splice junctions: reported in SJ.out.tab" + echo " - GeneFull ... full gene (pre-mRNA): count all reads overlapping" + echo " genes' exons and introns" + echo " - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads" + echo " overlapping genes' exons and introns: prioritize 100% overlap with exons" + echo " - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads" + echo " overlapping genes' exons and introns: prioritize >50% overlap with" + echo " exons. Do not count reads with 100% exonic overlap in the antisense" + echo " direction." + echo "" + echo " --soloMultiMappers" + echo " type: string, multiple values allowed" + echo " example: Unique" + echo " counting method for reads mapping to multiple genes" + echo " - Unique ... count only reads that map to unique genes" + echo " - Uniform ... uniformly distribute multi-genic UMIs to all genes" + echo " - Rescue ... distribute UMIs proportionally to unique+uniform counts" + echo " (~ first iteration of EM)" + echo " - PropUnique ... distribute UMIs proportionally to unique mappers, if" + echo " present, and uniformly if not." + echo " - EM ... multi-gene UMIs are distributed using Expectation" + echo " Maximization algorithm" + echo "" + echo " --soloUMIdedup" + echo " type: string, multiple values allowed" + echo " example: 1MM_All" + echo " type of UMI deduplication (collapsing) algorithm" + echo " - 1MM_All ... all UMIs with 1 mismatch distance to" + echo " each other are collapsed (i.e. counted once)." + echo " - 1MM_Directional_UMItools ... follows the \"directional\" method from" + echo " the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017)." + echo " - 1MM_Directional ... same as 1MM_Directional_UMItools, but" + echo " with more stringent criteria for duplicate UMIs" + echo " - Exact ... only exactly matching UMIs are" + echo " collapsed." + echo " - NoDedup ... no deduplication of UMIs, count all" + echo " reads." + echo " - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI" + echo " collapsing." + echo "" + echo " --soloUMIfiltering" + echo " type: string, multiple values allowed" + echo " type of UMI filtering (for reads uniquely mapping to genes)" + echo " - - ... basic filtering: remove UMIs with N and" + echo " homopolymers (similar to CellRanger 2.2.0)." + echo " - MultiGeneUMI ... basic + remove lower-count UMIs that map to" + echo " more than one gene." + echo " - MultiGeneUMI_All ... basic + remove all UMIs that map to more than" + echo " one gene." + echo " - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to" + echo " more than one gene, matching CellRanger > 3.0.0 ." + echo " Only works with --soloUMIdedup 1MM_CR" + echo "" + echo " --soloOutFileNames" + echo " type: string, multiple values allowed" + echo " example: Solo.out/;features.tsv;barcodes.tsv;matrix.mtx" + echo " file names for STARsolo output:" + echo " file_name_prefix gene_names barcode_sequences" + echo " cell_feature_count_matrix" + echo "" + echo " --soloCellFilter" + echo " type: string, multiple values allowed" + echo " example: CellRanger2.2;3000;0.99;10" + echo " cell filtering type and parameters" + echo " - None ... do not output filtered cells" + echo " - TopCells ... only report top cells by UMI count, followed by" + echo " the exact number of cells" + echo " - CellRanger2.2 ... simple filtering of CellRanger 2.2." + echo " Can be followed by numbers: number of expected cells, robust maximum" + echo " percentile for UMI count, maximum to minimum ratio for UMI count" + echo " The harcoded values are from CellRanger: nExpectedCells=3000;" + echo " maxPercentile=0.99; maxMinRatio=10" + echo " - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please" + echo " cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20," + echo " 63 (2019):" + echo " " + echo "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y" + echo " Can be followed by 10 numeric parameters: nExpectedCells" + echo " maxPercentile maxMinRatio indMin indMax umiMin" + echo " umiMinFracMedian candMaxN FDR simN" + echo " The harcoded values are from CellRanger: 3000" + echo " 0.99 10 45000 90000 500 0.01" + echo " 20000 0.01 10000" + echo "" + echo " --soloOutFormatFeaturesGeneField3" + echo " type: string, multiple values allowed" + echo " example: Gene Expression" + echo " field 3 in the Gene features.tsv file. If \"-\", then no 3rd field is" + echo " output." + echo "" + echo " --soloCellReadStats" + echo " type: string" + echo " Output reads statistics for each CB" + echo " - Standard ... standard output" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "star_align main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readFilesIn) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesIn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeDir) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--genomeDir\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFileNamePrefix) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--outFileNamePrefix\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFileNamePrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runRNGseed) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --runRNGseed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runRNGseed=*) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed=*\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeLoad) + [ -n "$VIASH_PAR_GENOMELOAD" ] && ViashError Bad arguments for option \'--genomeLoad\': \'$VIASH_PAR_GENOMELOAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMELOAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeLoad. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeLoad=*) + [ -n "$VIASH_PAR_GENOMELOAD" ] && ViashError Bad arguments for option \'--genomeLoad=*\': \'$VIASH_PAR_GENOMELOAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMELOAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeFastaFiles) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES="$2" + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFastaFiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeFastaFiles=*) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeFileSizes) + if [ -z "$VIASH_PAR_GENOMEFILESIZES" ]; then + VIASH_PAR_GENOMEFILESIZES="$2" + else + VIASH_PAR_GENOMEFILESIZES="$VIASH_PAR_GENOMEFILESIZES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFileSizes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeFileSizes=*) + if [ -z "$VIASH_PAR_GENOMEFILESIZES" ]; then + VIASH_PAR_GENOMEFILESIZES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMEFILESIZES="$VIASH_PAR_GENOMEFILESIZES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeTransformOutput) + if [ -z "$VIASH_PAR_GENOMETRANSFORMOUTPUT" ]; then + VIASH_PAR_GENOMETRANSFORMOUTPUT="$2" + else + VIASH_PAR_GENOMETRANSFORMOUTPUT="$VIASH_PAR_GENOMETRANSFORMOUTPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeTransformOutput. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeTransformOutput=*) + if [ -z "$VIASH_PAR_GENOMETRANSFORMOUTPUT" ]; then + VIASH_PAR_GENOMETRANSFORMOUTPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMETRANSFORMOUTPUT="$VIASH_PAR_GENOMETRANSFORMOUTPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeChrSetMitochondrial) + if [ -z "$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL" ]; then + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$2" + else + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeChrSetMitochondrial. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeChrSetMitochondrial=*) + if [ -z "$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL" ]; then + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbFileChrStartEnd) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND="$2" + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbFileChrStartEnd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbFileChrStartEnd=*) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFfile) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfile=*) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile=*\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFchrPrefix) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFchrPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFchrPrefix=*) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix=*\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFfeatureExon) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfeatureExon. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfeatureExon=*) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon=*\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentTranscript) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentTranscript. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentTranscript=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGene) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGene. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGene=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGeneName) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneName. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneName=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFtagExonParentGeneType) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneType=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbOverhang) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbOverhang. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbOverhang=*) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang=*\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbScore) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbScore. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbScore=*) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore=*\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbInsertSave) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbInsertSave. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbInsertSave=*) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave=*\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varVCFfile) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varVCFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varVCFfile=*) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile=*\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesType) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesType=*) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType=*\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesSAMattrKeep) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP="$2" + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesSAMattrKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesSAMattrKeep=*) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readFilesManifest) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesManifest. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesManifest=*) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest=*\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesPrefix) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesPrefix=*) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix=*\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesCommand) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND="$2" + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesCommand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesCommand=*) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readMapNumber) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMapNumber. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMapNumber=*) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber=*\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readMatesLengthsIn) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMatesLengthsIn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMatesLengthsIn=*) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn=*\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readNameSeparator) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR="$2" + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readNameSeparator. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readNameSeparator=*) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readQualityScoreBase) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readQualityScoreBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readQualityScoreBase=*) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase=*\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clipAdapterType) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clipAdapterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clipAdapterType=*) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType=*\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clip3pNbases) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES="$2" + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pNbases=*) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterSeq) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ="$2" + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterSeq=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterMMp) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP="$2" + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterMMp=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAfterAdapterNbases) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$2" + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAfterAdapterNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAfterAdapterNbases=*) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip5pNbases) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES="$2" + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip5pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip5pNbases=*) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitGenomeGenerateRAM) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitGenomeGenerateRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitGenomeGenerateRAM=*) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM=*\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitIObufferSize) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE="$2" + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitIObufferSize. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitIObufferSize=*) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitOutSAMoneReadBytes) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSAMoneReadBytes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSAMoneReadBytes=*) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes=*\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJoneRead) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJoneRead. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJoneRead=*) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead=*\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJcollapsed) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJcollapsed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJcollapsed=*) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed=*\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitBAMsortRAM) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitBAMsortRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitBAMsortRAM=*) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM=*\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitSjdbInsertNsj) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitSjdbInsertNsj. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitSjdbInsertNsj=*) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj=*\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitNreadsSoft) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitNreadsSoft. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitNreadsSoft=*) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft=*\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outTmpKeep) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outTmpKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outTmpKeep=*) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep=*\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outStd) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outStd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outStd=*) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd=*\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outReadsUnmapped) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outReadsUnmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outReadsUnmapped=*) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped=*\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outQSconversionAdd) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outQSconversionAdd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outQSconversionAdd=*) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd=*\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outMultimapperOrder) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outMultimapperOrder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outMultimapperOrder=*) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder=*\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMtype) + if [ -z "$VIASH_PAR_OUTSAMTYPE" ]; then + VIASH_PAR_OUTSAMTYPE="$2" + else + VIASH_PAR_OUTSAMTYPE="$VIASH_PAR_OUTSAMTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMtype=*) + if [ -z "$VIASH_PAR_OUTSAMTYPE" ]; then + VIASH_PAR_OUTSAMTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMTYPE="$VIASH_PAR_OUTSAMTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMmode) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmode=*) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode=*\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMstrandField) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMstrandField. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMstrandField=*) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField=*\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattributes) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES="$2" + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattributes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattributes=*) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMattrIHstart) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrIHstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrIHstart=*) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart=*\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMunmapped) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED="$2" + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMunmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMunmapped=*) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMorder) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMorder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMorder=*) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder=*\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMprimaryFlag) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMprimaryFlag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMprimaryFlag=*) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag=*\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMreadID) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMreadID. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMreadID=*) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID=*\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMmapqUnique) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmapqUnique. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmapqUnique=*) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique=*\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagOR) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagOR. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagOR=*) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR=*\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagAND) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagAND. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagAND=*) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND=*\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattrRGline) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE="$2" + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrRGline. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrRGline=*) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderHD) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD="$2" + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderHD. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderHD=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderPG) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG="$2" + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderPG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderPG=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderCommentFile) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderCommentFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderCommentFile=*) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile=*\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMfilter) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER="$2" + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMfilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMfilter=*) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMmultNmax) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmultNmax=*) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax=*\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMtlen) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMtlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMtlen=*) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen=*\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMcompression) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMcompression=*) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression=*\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingThreadN) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingThreadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingThreadN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN=*\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingBinsN) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingBinsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingBinsN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN=*\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesType) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesType=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesMate2basesN) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesMate2basesN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesMate2basesN=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigType) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE="$2" + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigType=*) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outWigStrand) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigStrand=*) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand=*\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigReferencesPrefix) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigReferencesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigReferencesPrefix=*) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix=*\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigNorm) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigNorm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigNorm=*) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm=*\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterType) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterType=*) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType=*\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapScoreRange) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapScoreRange=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapNmax) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverReadLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverReadLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverReadLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMin) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMin=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin=*\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMinOverLread) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMinOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMinOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread=*\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNmin) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNmin=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin=*\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNminOverLread) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNminOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNminOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread=*\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronMotifs) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronMotifs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronMotifs=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs=*\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronStrands) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronStrands. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronStrands=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands=*\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJtype) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJtype=*) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype=*\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterReads) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterReads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterReads=*) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads=*\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterOverhangMin) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$2" + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterOverhangMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountUniqueMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountUniqueMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountUniqueMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountTotalMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountTotalMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountTotalMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterDistToOtherSJmin) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$2" + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterDistToOtherSJmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterDistToOtherSJmin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterIntronMaxVsReadN) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$2" + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterIntronMaxVsReadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterIntronMaxVsReadN=*) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --scoreGap) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGap=*) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap=*\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapNoncan) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapNoncan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapNoncan=*) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan=*\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapGCAG) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapGCAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapGCAG=*) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG=*\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapATAC) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapATAC. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapATAC=*) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC=*\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGenomicLengthLog2scale) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGenomicLengthLog2scale. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGenomicLengthLog2scale=*) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale=*\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelOpen) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelOpen=*) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen=*\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelBase) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelBase=*) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase=*\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsOpen) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsOpen=*) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen=*\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsBase) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsBase=*) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase=*\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreStitchSJshift) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreStitchSJshift. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreStitchSJshift=*) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift=*\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmax) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmaxOverLread) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmaxOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmaxOverLread=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchLmax) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax=*\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMultimapNmax) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMultimapNmax=*) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax=*\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerReadNmax) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerReadNmax=*) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax=*\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerWindowNmax) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerWindowNmax=*) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax=*\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedNoneLociPerWindow) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedNoneLociPerWindow. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedNoneLociPerWindow=*) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow=*\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSplitMin) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSplitMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSplitMin=*) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin=*\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMapMin) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMapMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMapMin=*) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin=*\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMin) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMin=*) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin=*\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMax) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMax=*) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax=*\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignMatesGapMax) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignMatesGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignMatesGapMax=*) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax=*\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin=*\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJstitchMismatchNmax) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$2" + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJstitchMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJstitchMismatchNmax=*) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --alignSJDBoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJDBoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJDBoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin=*\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLmin) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLmin=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLminOverLmate) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLminOverLmate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLminOverLmate=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignWindowsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignWindowsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignWindowsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax=*\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerWindowNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerWindowNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsType) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsType=*) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType=*\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsProtrude) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsProtrude. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsProtrude=*) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude=*\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSoftClipAtReferenceEnds) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSoftClipAtReferenceEnds. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSoftClipAtReferenceEnds=*) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds=*\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignInsertionFlush) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignInsertionFlush. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignInsertionFlush=*) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush=*\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapNbasesMin) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapNbasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapNbasesMin=*) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin=*\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapMMp) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapMMp=*) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp=*\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorMultimapNmax) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorMultimapNmax=*) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax=*\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winBinNbits) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winBinNbits. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winBinNbits=*) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits=*\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorDistNbins) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorDistNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorDistNbins=*) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins=*\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winFlankNbins) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winFlankNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winFlankNbins=*) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins=*\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageRelativeMin) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageRelativeMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageRelativeMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin=*\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageBasesMin) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageBasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageBasesMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin=*\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutType) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE="$2" + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutType=*) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimSegmentMin) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentMin=*) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin=*\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreMin) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreMin=*) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin=*\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreDropMax) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreDropMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreDropMax=*) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax=*\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreSeparation) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreSeparation. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreSeparation=*) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation=*\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreJunctionNonGTAG) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreJunctionNonGTAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreJunctionNonGTAG=*) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG=*\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimJunctionOverhangMin) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimJunctionOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimJunctionOverhangMin=*) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin=*\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimSegmentReadGapMax) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentReadGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentReadGapMax=*) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax=*\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimFilter) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER="$2" + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimFilter=*) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimMainSegmentMultNmax) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMainSegmentMultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMainSegmentMultNmax=*) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax=*\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapNmax) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapNmax=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax=*\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapScoreRange) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapScoreRange=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange=*\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimNonchimScoreDropMin) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimNonchimScoreDropMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimNonchimScoreDropMin=*) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin=*\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutJunctionFormat) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutJunctionFormat. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutJunctionFormat=*) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat=*\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantMode) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE="$2" + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantMode=*) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --quantTranscriptomeBAMcompression) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBAMcompression=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantTranscriptomeBan) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBan=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopassMode) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopassMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopassMode=*) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode=*\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopass1readsN) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopass1readsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopass1readsN=*) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN=*\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --waspOutputMode) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --waspOutputMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --waspOutputMode=*) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode=*\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloType) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE="$2" + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloType=*) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBwhitelist) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST="$2" + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBwhitelist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBwhitelist=*) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBstart) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBstart=*) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart=*\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBlen) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBlen=*) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen=*\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIstart) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIstart=*) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart=*\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIlen) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIlen=*) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen=*\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeReadLength) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeReadLength. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeReadLength=*) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength=*\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeMate) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeMate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeMate=*) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate=*\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBposition) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION="$2" + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBposition=*) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIposition) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIposition=*) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition=*\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterSequence) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterSequence. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterSequence=*) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence=*\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterMismatchesNmax) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterMismatchesNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterMismatchesNmax=*) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax=*\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBmatchWLtype) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBmatchWLtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBmatchWLtype=*) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype=*\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloInputSAMattrBarcodeSeq) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeSeq=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloInputSAMattrBarcodeQual) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeQual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeQual=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloStrand) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloStrand=*) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand=*\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloFeatures) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES="$2" + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloFeatures. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloFeatures=*) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloMultiMappers) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS="$2" + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloMultiMappers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloMultiMappers=*) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIdedup) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP="$2" + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIdedup. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIdedup=*) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIfiltering) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING="$2" + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIfiltering. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIfiltering=*) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFileNames) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES="$2" + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFileNames. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFileNames=*) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellFilter) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER="$2" + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellFilter=*) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFormatFeaturesGeneField3) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$2" + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFormatFeaturesGeneField3. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFormatFeaturesGeneField3=*) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellReadStats) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellReadStats. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellReadStats=*) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats=*\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/star_align:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ] && [ ! -e "$VIASH_PAR_SJDBGTFFILE" ]; then + ViashError "Input file '$VIASH_PAR_SJDBGTFFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ] && [ ! -e "$VIASH_PAR_READFILESMANIFEST" ]; then + ViashError "Input file '$VIASH_PAR_READFILESMANIFEST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_RUNRNGSEED" ]]; then + if ! [[ "$VIASH_PAR_RUNRNGSEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--runRNGseed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_GENOMEFILESIZES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_GENOMEFILESIZES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genomeFileSizes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_SJDBOVERHANG" ]]; then + if ! [[ "$VIASH_PAR_SJDBOVERHANG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbOverhang' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SJDBSCORE" ]]; then + if ! [[ "$VIASH_PAR_SJDBSCORE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbScore' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READMAPNUMBER" ]]; then + if ! [[ "$VIASH_PAR_READMAPNUMBER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readMapNumber' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READQUALITYSCOREBASE" ]]; then + if ! [[ "$VIASH_PAR_READQUALITYSCOREBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readQualityScoreBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_CLIP3PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PADAPTERMMP; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--clip3pAdapterMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PAFTERADAPTERNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pAfterAdapterNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP5PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP5PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip5pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITGENOMEGENERATERAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitGenomeGenerateRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_LIMITIOBUFFERSIZE; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitIObufferSize' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSAMoneReadBytes' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJONEREAD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJoneRead' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJCOLLAPSED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJcollapsed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITBAMSORTRAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITBAMSORTRAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitBAMsortRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ]]; then + if ! [[ "$VIASH_PAR_LIMITSJDBINSERTNSJ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitSjdbInsertNsj' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITNREADSSOFT" ]]; then + if ! [[ "$VIASH_PAR_LIMITNREADSSOFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitNreadsSoft' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ]]; then + if ! [[ "$VIASH_PAR_OUTQSCONVERSIONADD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outQSconversionAdd' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMATTRIHSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMattrIHstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMAPQUNIQUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmapqUnique' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGOR" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagOR' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGAND" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGAND" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagAND' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMTLEN" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMTLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMtlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGTHREADN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingThreadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGBINSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingBinsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ]]; then + if ! [[ "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--bamRemoveDuplicatesMate2basesN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverReadLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterScoreMinOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMatchNmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMatchNminOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTEROVERHANGMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountUniqueMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountTotalMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterDistToOtherSJmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterIntronMaxVsReadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_SCOREGAP" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPNONCAN" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPNONCAN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapNoncan' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPGCAG" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPGCAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapGCAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPATAC" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPATAC" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapATAC' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ]]; then + if ! [[ "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGenomicLengthLog2scale' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ]]; then + if ! [[ "$VIASH_PAR_SCORESTITCHSJSHIFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreStitchSJshift' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchStartLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--seedSearchStartLmaxOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ]]; then + if ! [[ "$VIASH_PAR_SEEDNONELOCIPERWINDOW" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedNoneLociPerWindow' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSPLITMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDSPLITMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSplitMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMAPMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDMAPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMapMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNMATESGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignMatesGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJstitchMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJDBoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSplicedMateMapLmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alignSplicedMateMapLminOverLmate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignWindowsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPNBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--peOverlapNbasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPMMP" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPMMP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--peOverlapMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINBINNBITS" ]]; then + if ! [[ "$VIASH_PAR_WINBINNBITS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winBinNbits' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORDISTNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORDISTNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorDistNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINFLANKNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINFLANKNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winFlankNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--winReadCoverageRelativeMin' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGEBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winReadCoverageBasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREDROPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreDropMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCORESEPARATION" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCORESEPARATION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreSeparation' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreJunctionNonGTAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimJunctionOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentReadGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMainSegmentMultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimNonchimScoreDropMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ]]; then + if ! [[ "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimOutJunctionFormat' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--quantTranscriptomeBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TWOPASS1READSN" ]]; then + if ! [[ "$VIASH_PAR_TWOPASS1READSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--twopass1readsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBSTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBLEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMISTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMISTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMILEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMILEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEREADLENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeReadLength' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEMATE" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEMATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeMate' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ]]; then + if ! [[ "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloAdapterMismatchesNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES=() + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENOMEFASTAFILES+=( "$var" ) + done + VIASH_PAR_GENOMEFASTAFILES=$(IFS=';' ; echo "${VIASH_TEST_GENOMEFASTAFILES[*]}") +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SJDBGTFFILE")" ) + VIASH_PAR_SJDBGTFFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_SJDBGTFFILE") +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_READFILESMANIFEST")" ) + VIASH_PAR_READFILESMANIFEST=$(ViashDockerAutodetectMount "$VIASH_PAR_READFILESMANIFEST") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-star_align-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import re +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'genomeLoad': $( if [ ! -z ${VIASH_PAR_GENOMELOAD+x} ]; then echo "r'${VIASH_PAR_GENOMELOAD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'genomeFileSizes': $( if [ ! -z ${VIASH_PAR_GENOMEFILESIZES+x} ]; then echo "list(map(int, r'${VIASH_PAR_GENOMEFILESIZES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'genomeTransformOutput': $( if [ ! -z ${VIASH_PAR_GENOMETRANSFORMOUTPUT+x} ]; then echo "r'${VIASH_PAR_GENOMETRANSFORMOUTPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'genomeChrSetMitochondrial': $( if [ ! -z ${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL+x} ]; then echo "r'${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMtype': $( if [ ! -z ${VIASH_PAR_OUTSAMTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSAMTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + +# regex for matching R[12] fastq(gz) files +# examples: +# - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz +# - tinygex_S1_L001_I1_001.fastq.gz +fastqgz_regex = r"(.+)_(R\\d+)(_\\d+)?\\.fastq(\\.gz)?" + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +# look for fastq files in a directory +def search_fastqs(path: Path) -> list[Path]: + if path.is_dir(): + print( + f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", + flush=True, + ) + value_paths = [ + file for file in path.iterdir() if re.match(fastqgz_regex, file.name) + ] + return value_paths + else: + return [path] + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \`processPar()\` generator needs to be adapted +to_rename = { + "input": "readFilesIn", + "reference": "genomeDir", + "output": "outFileNamePrefix", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \`to_rename\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory( + prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True +) as temp_dir: + print(">> Check whether input files are directories", flush=True) + new_read_files_in = [] + for path in par["readFilesIn"]: + new_read_files_in.extend(search_fastqs(path)) + par["readFilesIn"] = new_read_files_in + print("", flush=True) + + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeDir", "readFilesIn"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print("Grouping R1/R2 input files into pairs", flush=True) + input_grouped = {} + for path in par["readFilesIn"]: + key = re.search(fastqgz_regex, path.name).group(2) + if key not in input_grouped: + input_grouped[key] = [] + input_grouped[key].append(str(path)) + par["readFilesIn"] = [",".join(val) for val in input_grouped.values()] + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "alignReads" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + # make sure there is a trailing / + par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/" + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + unset VIASH_TEST_GENOMEFASTAFILES + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ -z "$VIASH_TEST_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES" + fi + if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_PAR_SJDBGTFFILE=$(ViashDockerStripAutomount "$VIASH_PAR_SJDBGTFFILE") + fi + if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_PAR_READFILESMANIFEST=$(ViashDockerStripAutomount "$VIASH_PAR_READFILESMANIFEST") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/mapping/star_align_v273a/.config.vsh.yaml b/target/executable/mapping/star_align_v273a/.config.vsh.yaml new file mode 100644 index 00000000..2306af44 --- /dev/null +++ b/target/executable/mapping/star_align_v273a/.config.vsh.yaml @@ -0,0 +1,2420 @@ +name: "star_align_v273a" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--input" + alternatives: + - "--readFilesIn" + description: "The FASTQ files to be analyzed. Corresponds to the --readFilesIn\ + \ in the STAR command." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: null + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "string" + name: "--genomeLoad" + description: "mode of shared memory usage for the genome files. Only used with\ + \ --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and\ + \ keep it in memory after run\n- LoadAndRemove ... load genome into shared\ + \ but remove it after run\n- LoadAndExit ... load genome into shared memory\ + \ and exit, keeping the genome in memory for future runs\n- Remove \ + \ ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory\ + \ ... do not use shared memory, each job will have its own private copy of\ + \ the genome" + info: null + example: + - "NoSharedMemory" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--genomeFileSizes" + description: "genome files exact sizes in bytes. Typically, this should not be\ + \ defined by the user." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeTransformOutput" + description: "which output to transform back to original genome\n\n- SAM ...\ + \ SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None \ + \ ... no transformation of the output" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeChrSetMitochondrial" + description: "names of the mitochondrial chromosomes. Presently only used for\ + \ STARsolo statistics output/" + info: null + example: + - "chrM" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: null + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: null + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: null + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: null + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: null + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: null + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: null + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: null + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: null + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: null + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: null + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: null + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: null + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: null + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: null + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: null + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: null + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMtype" + description: "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without\ + \ sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n\ + 2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate\ + \ ... sorted by coordinate. This option will allocate extra memory for sorting\ + \ which can be specified by --limitBAMsortRAM." + info: null + example: + - "SAM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: null + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: null + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: null + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: null + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: null + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: null + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: null + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: null + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: null + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: null + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: null + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: null + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: null + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: null + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: null + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: null + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: null + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: null + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: null + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: null + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: null + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: null + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: null + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: null + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: null + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: null + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: null + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: null + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: null + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: null + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: null + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: null + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: null + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: null + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: null + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: null + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: null + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.3a" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/star_align_v273a/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/mapping/star_align_v273a" + executable: "target/executable/mapping/star_align_v273a/star_align_v273a" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/mapping/star_align_v273a/nextflow_labels.config b/target/executable/mapping/star_align_v273a/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/mapping/star_align_v273a/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/mapping/star_align_v273a/setup_logger.py b/target/executable/mapping/star_align_v273a/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/mapping/star_align_v273a/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/mapping/star_align_v273a/star_align_v273a b/target/executable/mapping/star_align_v273a/star_align_v273a new file mode 100755 index 00000000..de3a2389 --- /dev/null +++ b/target/executable/mapping/star_align_v273a/star_align_v273a @@ -0,0 +1,5938 @@ +#!/usr/bin/env bash + +# star_align_v273a main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="star_align_v273a" +VIASH_META_FUNCTIONALITY_NAME="star_align_v273a" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +ENV STAR_VERSION 2.7.3a +ENV PACKAGES gcc g++ make wget zlib1g-dev unzip +RUN apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component mapping star_align_v273a" +LABEL org.opencontainers.image.created="2025-08-22T07:23:22Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "star_align_v273a main" + echo "" + echo "Align fastq files using STAR." + echo "" + echo "Input/Output:" + echo " --readFilesIn, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example:" + echo "mysample_S1_L001_R1_001.fastq.gz;mysample_S1_L001_R2_001.fastq.gz" + echo " The FASTQ files to be analyzed. Corresponds to the --readFilesIn in the" + echo " STAR command." + echo "" + echo " --genomeDir, --reference" + echo " type: file, required parameter, file must exist" + echo " example: /path/to/reference" + echo " Path to the reference built by star_build_reference. Corresponds to the" + echo " --genomeDir in the STAR command." + echo "" + echo " --outFileNamePrefix, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/foo" + echo " Path to output directory. Corresponds to the --outFileNamePrefix in the" + echo " STAR command." + echo "" + echo "Run Parameters:" + echo " --runRNGseed" + echo " type: integer" + echo " example: 777" + echo " random number generator seed." + echo "" + echo "Genome Parameters:" + echo " --genomeLoad" + echo " type: string" + echo " example: NoSharedMemory" + echo " mode of shared memory usage for the genome files. Only used with" + echo " --runMode alignReads." + echo " - LoadAndKeep ... load genome into shared and keep it in memory" + echo " after run" + echo " - LoadAndRemove ... load genome into shared but remove it after run" + echo " - LoadAndExit ... load genome into shared memory and exit, keeping" + echo " the genome in memory for future runs" + echo " - Remove ... do not map anything, just remove loaded genome" + echo " from memory" + echo " - NoSharedMemory ... do not use shared memory, each job will have its" + echo " own private copy of the genome" + echo "" + echo " --genomeFastaFiles" + echo " type: file, multiple values allowed, file must exist" + echo " path(s) to the fasta files with the genome sequences, separated by" + echo " spaces. These files should be plain text FASTA files, they *cannot* be" + echo " zipped." + echo " Required for the genome generation (--runMode genomeGenerate). Can also" + echo " be used in the mapping (--runMode alignReads) to add extra (new)" + echo " sequences to the genome (e.g. spike-ins)." + echo "" + echo " --genomeFileSizes" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " genome files exact sizes in bytes. Typically, this should not be defined" + echo " by the user." + echo "" + echo " --genomeTransformOutput" + echo " type: string, multiple values allowed" + echo " which output to transform back to original genome" + echo " - SAM ... SAM/BAM alignments" + echo " - SJ ... splice junctions (SJ.out.tab)" + echo " - None ... no transformation of the output" + echo "" + echo " --genomeChrSetMitochondrial" + echo " type: string, multiple values allowed" + echo " example: chrM;M;MT" + echo " names of the mitochondrial chromosomes. Presently only used for STARsolo" + echo " statistics output/" + echo "" + echo "Splice Junctions Database:" + echo " --sjdbFileChrStartEnd" + echo " type: string, multiple values allowed" + echo " path to the files with genomic coordinates (chr start end" + echo " strand) for the splice junction introns. Multiple files can be" + echo " supplied and will be concatenated." + echo "" + echo " --sjdbGTFfile" + echo " type: file, file must exist" + echo " path to the GTF file with annotations" + echo "" + echo " --sjdbGTFchrPrefix" + echo " type: string" + echo " prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL" + echo " annotations with UCSC genomes)" + echo "" + echo " --sjdbGTFfeatureExon" + echo " type: string" + echo " example: exon" + echo " feature type in GTF file to be used as exons for building transcripts" + echo "" + echo " --sjdbGTFtagExonParentTranscript" + echo " type: string" + echo " example: transcript_id" + echo " GTF attribute name for parent transcript ID (default \"transcript_id\"" + echo " works for GTF files)" + echo "" + echo " --sjdbGTFtagExonParentGene" + echo " type: string" + echo " example: gene_id" + echo " GTF attribute name for parent gene ID (default \"gene_id\" works for GTF" + echo " files)" + echo "" + echo " --sjdbGTFtagExonParentGeneName" + echo " type: string, multiple values allowed" + echo " example: gene_name" + echo " GTF attribute name for parent gene name" + echo "" + echo " --sjdbGTFtagExonParentGeneType" + echo " type: string, multiple values allowed" + echo " example: gene_type;gene_biotype" + echo " GTF attribute name for parent gene type" + echo "" + echo " --sjdbOverhang" + echo " type: integer" + echo " example: 100" + echo " length of the donor/acceptor sequence on each side of the junctions," + echo " ideally = (mate_length - 1)" + echo "" + echo " --sjdbScore" + echo " type: integer" + echo " example: 2" + echo " extra alignment score for alignments that cross database junctions" + echo "" + echo " --sjdbInsertSave" + echo " type: string" + echo " example: Basic" + echo " which files to save when sjdb junctions are inserted on the fly at the" + echo " mapping step" + echo " - Basic ... only small junction / transcript files" + echo " - All ... all files including big Genome, SA and SAindex - this will" + echo " create a complete genome directory" + echo "" + echo "Variation parameters:" + echo " --varVCFfile" + echo " type: string" + echo " path to the VCF file that contains variation data. The 10th column" + echo " should contain the genotype information, e.g. 0/1" + echo "" + echo "Read Parameters:" + echo " --readFilesType" + echo " type: string" + echo " example: Fastx" + echo " format of input read files" + echo " - Fastx ... FASTA or FASTQ" + echo " - SAM SE ... SAM or BAM single-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo " - SAM PE ... SAM or BAM paired-end reads; for BAM use" + echo " --readFilesCommand samtools view" + echo "" + echo " --readFilesSAMattrKeep" + echo " type: string, multiple values allowed" + echo " example: All" + echo " for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM," + echo " e.g.: --readFilesSAMtagsKeep RG PL" + echo " - All ... keep all tags" + echo " - None ... do not keep any tags" + echo "" + echo " --readFilesManifest" + echo " type: file, file must exist" + echo " path to the \"manifest\" file with the names of read files. The manifest" + echo " file should contain 3 tab-separated columns:" + echo " paired-end reads: read1_file_name \$tab\$ read2_file_name \$tab\$" + echo " read_group_line." + echo " single-end reads: read1_file_name \$tab\$ - \$tab\$" + echo " read_group_line." + echo " Spaces, but not tabs are allowed in file names." + echo " If read_group_line does not start with ID:, it can only contain one ID" + echo " field, and ID: will be added to it." + echo " If read_group_line starts with ID:, it can contain several fields" + echo " separated by \$tab\$, and all fields will be be copied verbatim into SAM" + echo " @RG header line." + echo "" + echo " --readFilesPrefix" + echo " type: string" + echo " prefix for the read files names, i.e. it will be added in front of the" + echo " strings in --readFilesIn" + echo "" + echo " --readFilesCommand" + echo " type: string, multiple values allowed" + echo " command line to execute for each of the input file. This command should" + echo " generate FASTA or FASTQ text and send it to stdout" + echo " For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2" + echo " files, etc." + echo "" + echo " --readMapNumber" + echo " type: integer" + echo " example: -1" + echo " number of reads to map from the beginning of the file" + echo " -1: map all reads" + echo "" + echo " --readMatesLengthsIn" + echo " type: string" + echo " example: NotEqual" + echo " Equal/NotEqual - lengths of names,sequences,qualities for both mates are" + echo " the same / not the same. NotEqual is safe in all situations." + echo "" + echo " --readNameSeparator" + echo " type: string, multiple values allowed" + echo " example: /" + echo " character(s) separating the part of the read names that will be trimmed" + echo " in output (read name after space is always trimmed)" + echo "" + echo " --readQualityScoreBase" + echo " type: integer" + echo " example: 33" + echo " number to be subtracted from the ASCII code to get Phred quality score" + echo "" + echo "Read Clipping:" + echo " --clipAdapterType" + echo " type: string" + echo " example: Hamming" + echo " adapter clipping type" + echo " - Hamming ... adapter clipping based on Hamming distance, with the" + echo " number of mismatches controlled by --clip5pAdapterMMp" + echo " - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4." + echo " Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal" + echo " - None ... no adapter clipping, all other clip* parameters are" + echo " disregarded" + echo "" + echo " --clip3pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo " --clip3pAdapterSeq" + echo " type: string, multiple values allowed" + echo " adapter sequences to clip from 3p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo " - polyA ... polyA sequence with the length equal to read length" + echo "" + echo " --clip3pAdapterMMp" + echo " type: double, multiple values allowed" + echo " example: 0.1" + echo " max proportion of mismatches for 3p adapter clipping for each mate. If" + echo " one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip3pAfterAdapterNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number of bases to clip from 3p of each mate after the adapter clipping." + echo " If one value is given, it will be assumed the same for both mates." + echo "" + echo " --clip5pNbases" + echo " type: integer, multiple values allowed" + echo " example: 0" + echo " number(s) of bases to clip from 5p of each mate. If one value is given," + echo " it will be assumed the same for both mates." + echo "" + echo "Limits:" + echo " --limitGenomeGenerateRAM" + echo " type: long" + echo " example: 31000000000" + echo " maximum available RAM (bytes) for genome generation" + echo "" + echo " --limitIObufferSize" + echo " type: long, multiple values allowed" + echo " example: 30000000;50000000" + echo " max available buffers size (bytes) for input/output, per thread" + echo "" + echo " --limitOutSAMoneReadBytes" + echo " type: long" + echo " example: 100000" + echo " max size of the SAM record (bytes) for one read. Recommended value:" + echo " >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + echo "" + echo " --limitOutSJoneRead" + echo " type: integer" + echo " example: 1000" + echo " max number of junctions for one read (including all multi-mappers)" + echo "" + echo " --limitOutSJcollapsed" + echo " type: integer" + echo " example: 1000000" + echo " max number of collapsed junctions" + echo "" + echo " --limitBAMsortRAM" + echo " type: long" + echo " example: 0" + echo " maximum available RAM (bytes) for sorting BAM. If =0, it will be set to" + echo " the genome index size. 0 value can only be used with --genomeLoad" + echo " NoSharedMemory option." + echo "" + echo " --limitSjdbInsertNsj" + echo " type: integer" + echo " example: 1000000" + echo " maximum number of junctions to be inserted to the genome on the fly at" + echo " the mapping stage, including those from annotations and those detected" + echo " in the 1st step of the 2-pass run" + echo "" + echo " --limitNreadsSoft" + echo " type: integer" + echo " example: -1" + echo " soft limit on the number of reads" + echo "" + echo "Output: general:" + echo " --outTmpKeep" + echo " type: string" + echo " whether to keep the temporary files after STAR runs is finished" + echo " - None ... remove all temporary files" + echo " - All ... keep all files" + echo "" + echo " --outStd" + echo " type: string" + echo " example: Log" + echo " which output will be directed to stdout (standard out)" + echo " - Log ... log messages" + echo " - SAM ... alignments in SAM format (which normally" + echo " are output to Aligned.out.sam file), normal standard output will go into" + echo " Log.std.out" + echo " - BAM_Unsorted ... alignments in BAM format, unsorted." + echo " Requires --outSAMtype BAM Unsorted" + echo " - BAM_SortedByCoordinate ... alignments in BAM format, sorted by" + echo " coordinate. Requires --outSAMtype BAM SortedByCoordinate" + echo " - BAM_Quant ... alignments to transcriptome in BAM format," + echo " unsorted. Requires --quantMode TranscriptomeSAM" + echo "" + echo " --outReadsUnmapped" + echo " type: string" + echo " output of unmapped and partially mapped (i.e. mapped only one mate of a" + echo " paired end read) reads in separate file(s)." + echo " - None ... no output" + echo " - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + echo "" + echo " --outQSconversionAdd" + echo " type: integer" + echo " example: 0" + echo " add this number to the quality score (e.g. to convert from Illumina to" + echo " Sanger, use -31)" + echo "" + echo " --outMultimapperOrder" + echo " type: string" + echo " example: Old_2.4" + echo " order of multimapping alignments in the output files" + echo " - Old_2.4 ... quasi-random order used before 2.5.0" + echo " - Random ... random order of alignments for each" + echo " multi-mapper. Read mates (pairs) are always adjacent, all alignment for" + echo " each read stay together. This option will become default in the future" + echo " releases." + echo "" + echo "Output: SAM and BAM:" + echo " --outSAMtype" + echo " type: string, multiple values allowed" + echo " example: SAM" + echo " type of SAM/BAM output" + echo " 1st word:" + echo " - BAM ... output BAM without sorting" + echo " - SAM ... output SAM without sorting" + echo " - None ... no SAM/BAM output" + echo " 2nd, 3rd:" + echo " - Unsorted ... standard unsorted" + echo " - SortedByCoordinate ... sorted by coordinate. This option will allocate" + echo " extra memory for sorting which can be specified by --limitBAMsortRAM." + echo "" + echo " --outSAMmode" + echo " type: string" + echo " example: Full" + echo " mode of SAM output" + echo " - None ... no SAM output" + echo " - Full ... full SAM output" + echo " - NoQS ... full SAM but without quality scores" + echo "" + echo " --outSAMstrandField" + echo " type: string" + echo " Cufflinks-like strand field flag" + echo " - None ... not used" + echo " - intronMotif ... strand derived from the intron motif. This option" + echo " changes the output alignments: reads with inconsistent and/or" + echo " non-canonical introns are filtered out." + echo "" + echo " --outSAMattributes" + echo " type: string, multiple values allowed" + echo " example: Standard" + echo " a string of desired SAM attributes, in the order desired for the output" + echo " SAM. Tags can be listed in any combination/order." + echo " ***Presets:" + echo " - None ... no attributes" + echo " - Standard ... NH HI AS nM" + echo " - All ... NH HI AS nM NM MD jM jI MC ch" + echo " ***Alignment:" + echo " - NH ... number of loci the reads maps to: =1 for unique" + echo " mappers, >1 for multimappers. Standard SAM tag." + echo " - HI ... multiple alignment index, starts with" + echo " --outSAMattrIHstart (=1 by default). Standard SAM tag." + echo " - AS ... local alignment score, +1/-1 for matches/mismateches," + echo " score* penalties for indels and gaps. For PE reads, total score for two" + echo " mates. Stadnard SAM tag." + echo " - nM ... number of mismatches. For PE reads, sum over two" + echo " mates." + echo " - NM ... edit distance to the reference (number of mismatched +" + echo " inserted + deleted bases) for each mate. Standard SAM tag." + echo " - MD ... string encoding mismatched and deleted reference bases" + echo " (see standard SAM specifications). Standard SAM tag." + echo " - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0:" + echo " non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6:" + echo " GT/AT. If splice junctions database is used, and a junction is" + echo " annotated, 20 is added to its motif value." + echo " - jI ... start and end of introns for all junctions (1-based)." + echo " - XS ... alignment strand according to --outSAMstrandField." + echo " - MC ... mate's CIGAR string. Standard SAM tag." + echo " - ch ... marks all segment of all chimeric alingments for" + echo " --chimOutType WithinBAM output." + echo " - cN ... number of bases clipped from the read ends: 5' and 3'" + echo " ***Variation:" + echo " - vA ... variant allele" + echo " - vG ... genomic coordinate of the variant overlapped by the" + echo " read." + echo " - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 -" + echo " alignment does not pass WASP filtering. Requires --waspOutputMode" + echo " SAMtag." + echo " ***STARsolo:" + echo " - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs" + echo " for the solo* demultiplexing." + echo " - GX GN ... gene ID and gene name for unique-gene reads." + echo " - gx gn ... gene IDs and gene names for unique- and multi-gene" + echo " reads." + echo " - CB UB ... error-corrected cell barcodes and UMIs for solo*" + echo " demultiplexing. Requires --outSAMtype BAM SortedByCoordinate." + echo " - sM ... assessment of CB and UMI." + echo " - sS ... sequence of the entire barcode (CB,UMI,adapter)." + echo " - sQ ... quality of the entire barcode." + echo " ***Unsupported/undocumented:" + echo " - ha ... haplotype (1/2) when mapping to the diploid genome." + echo " Requires genome generated with --genomeTransformType Diploid ." + echo " - rB ... alignment block read/genomic coordinates." + echo " - vR ... read coordinate of the variant." + echo "" + echo " --outSAMattrIHstart" + echo " type: integer" + echo " example: 1" + echo " start value for the IH attribute. 0 may be required by some downstream" + echo " software, such as Cufflinks or StringTie." + echo "" + echo " --outSAMunmapped" + echo " type: string, multiple values allowed" + echo " output of unmapped reads in the SAM format" + echo " 1st word:" + echo " - None ... no output" + echo " - Within ... output unmapped reads within the main SAM file (i.e." + echo " Aligned.out.sam)" + echo " 2nd word:" + echo " - KeepPairs ... record unmapped mate for each alignment, and, in case of" + echo " unsorted output, keep it adjacent to its mapped mate. Only affects" + echo " multi-mapping reads." + echo "" + echo " --outSAMorder" + echo " type: string" + echo " example: Paired" + echo " type of sorting for the SAM output" + echo " Paired: one mate after the other for all paired alignments" + echo " PairedKeepInputOrder: one mate after the other for all paired" + echo " alignments, the order is kept the same as in the input FASTQ files" + echo "" + echo " --outSAMprimaryFlag" + echo " type: string" + echo " example: OneBestScore" + echo " which alignments are considered primary - all others will be marked with" + echo " 0x100 bit in the FLAG" + echo " - OneBestScore ... only one alignment with the best score is primary" + echo " - AllBestScore ... all alignments with the best score are primary" + echo "" + echo " --outSAMreadID" + echo " type: string" + echo " example: Standard" + echo " read ID record type" + echo " - Standard ... first word (until space) from the FASTx read ID line," + echo " removing /1,/2 from the end" + echo " - Number ... read number (index) in the FASTx file" + echo "" + echo " --outSAMmapqUnique" + echo " type: integer" + echo " example: 255" + echo " 0 to 255: the MAPQ value for unique mappers" + echo "" + echo " --outSAMflagOR" + echo " type: integer" + echo " example: 0" + echo " 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e." + echo " FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, and after outSAMflagAND. Can be used to set specific bits that" + echo " are not set otherwise." + echo "" + echo " --outSAMflagAND" + echo " type: integer" + echo " example: 65535" + echo " 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e." + echo " FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set" + echo " by STAR, but before outSAMflagOR. Can be used to unset specific bits" + echo " that are not set otherwise." + echo "" + echo " --outSAMattrRGline" + echo " type: string, multiple values allowed" + echo " SAM/BAM read group line. The first word contains the read group" + echo " identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx" + echo " CN:yy \"DS:z z z\"." + echo " xxx will be added as RG tag to each output alignment. Any spaces in the" + echo " tag values have to be double quoted." + echo " Comma separated RG lines correspons to different (comma separated) input" + echo " files in --readFilesIn. Commas have to be surrounded by spaces, e.g." + echo " --outSAMattrRGline ID:xxx , ID:zzz \"DS:z z\" , ID:yyy DS:yyyy" + echo "" + echo " --outSAMheaderHD" + echo " type: string, multiple values allowed" + echo " @HD (header) line of the SAM header" + echo "" + echo " --outSAMheaderPG" + echo " type: string, multiple values allowed" + echo " extra @PG (software) line of the SAM header (in addition to STAR)" + echo "" + echo " --outSAMheaderCommentFile" + echo " type: string" + echo " path to the file with @CO (comment) lines of the SAM header" + echo "" + echo " --outSAMfilter" + echo " type: string, multiple values allowed" + echo " filter the output into main SAM/BAM files" + echo " - KeepOnlyAddedReferences ... only keep the reads for which all" + echo " alignments are to the extra reference sequences added with" + echo " --genomeFastaFiles at the mapping stage." + echo " - KeepAllAddedReferences ... keep all alignments to the extra reference" + echo " sequences added with --genomeFastaFiles at the mapping stage." + echo "" + echo " --outSAMmultNmax" + echo " type: integer" + echo " example: -1" + echo " max number of multiple alignments for a read that will be output to the" + echo " SAM/BAM files. Note that if this value is not equal to -1, the top" + echo " scoring alignment will be output first" + echo " - -1 ... all alignments (up to --outFilterMultimapNmax) will be output" + echo "" + echo " --outSAMtlen" + echo " type: integer" + echo " example: 1" + echo " calculation method for the TLEN field in the SAM/BAM files" + echo " - 1 ... leftmost base of the (+)strand mate to rightmost base of the" + echo " (-)mate. (+)sign for the (+)strand mate" + echo " - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign" + echo " for the mate with the leftmost base. This is different from 1 for" + echo " overlapping mates with protruding ends" + echo "" + echo " --outBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -1 to 10 BAM compression level, -1=default compression (6?), 0=no" + echo " compression, 10=maximum compression" + echo "" + echo " --outBAMsortingThreadN" + echo " type: integer" + echo " example: 0" + echo " >=0: number of threads for BAM sorting. 0 will default to" + echo " min(6,--runThreadN)." + echo "" + echo " --outBAMsortingBinsN" + echo " type: integer" + echo " example: 50" + echo " >0: number of genome bins for coordinate-sorting" + echo "" + echo "BAM processing:" + echo " --bamRemoveDuplicatesType" + echo " type: string" + echo " mark duplicates in the BAM file, for now only works with (i) sorted BAM" + echo " fed with inputBAMfile, and (ii) for paired-end alignments only" + echo " - - ... no duplicate removal/marking" + echo " - UniqueIdentical ... mark all multimappers, and duplicate" + echo " unique mappers. The coordinates, FLAG, CIGAR must be identical" + echo " - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not" + echo " multimappers." + echo "" + echo " --bamRemoveDuplicatesMate2basesN" + echo " type: integer" + echo " example: 0" + echo " number of bases from the 5' of mate 2 to use in collapsing (e.g. for" + echo " RAMPAGE)" + echo "" + echo "Output Wiggle:" + echo " --outWigType" + echo " type: string, multiple values allowed" + echo " type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\". Requires" + echo " sorted BAM: --outSAMtype BAM SortedByCoordinate ." + echo " 1st word:" + echo " - None ... no signal output" + echo " - bedGraph ... bedGraph format" + echo " - wiggle ... wiggle format" + echo " 2nd word:" + echo " - read1_5p ... signal from only 5' of the 1st read, useful for" + echo " CAGE/RAMPAGE etc" + echo " - read2 ... signal from only 2nd read" + echo "" + echo " --outWigStrand" + echo " type: string" + echo " example: Stranded" + echo " strandedness of wiggle/bedGraph output" + echo " - Stranded ... separate strands, str1 and str2" + echo " - Unstranded ... collapsed strands" + echo "" + echo " --outWigReferencesPrefix" + echo " type: string" + echo " prefix matching reference names to include in the output wiggle file," + echo " e.g. \"chr\", default \"-\" - include all references" + echo "" + echo " --outWigNorm" + echo " type: string" + echo " example: RPM" + echo " type of normalization for the signal" + echo " - RPM ... reads per million of mapped reads" + echo " - None ... no normalization, \"raw\" counts" + echo "" + echo "Output Filtering:" + echo " --outFilterType" + echo " type: string" + echo " example: Normal" + echo " type of filtering" + echo " - Normal ... standard filtering using only current alignment" + echo " - BySJout ... keep only those reads that contain junctions that passed" + echo " filtering into SJ.out.tab" + echo "" + echo " --outFilterMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range below the maximum score for multimapping alignments" + echo "" + echo " --outFilterMultimapNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of loci the read is allowed to map to. Alignments (all of" + echo " them) will be output only if the read maps to no more loci than this" + echo " value." + echo " Otherwise no alignments will be output, and the read will be counted as" + echo " \"mapped to too many loci\" in the Log.final.out ." + echo "" + echo " --outFilterMismatchNmax" + echo " type: integer" + echo " example: 10" + echo " alignment will be output only if it has no more mismatches than this" + echo " value." + echo "" + echo " --outFilterMismatchNoverLmax" + echo " type: double" + echo " example: 0.3" + echo " alignment will be output only if its ratio of mismatches to *mapped*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterMismatchNoverReadLmax" + echo " type: double" + echo " example: 1.0" + echo " alignment will be output only if its ratio of mismatches to *read*" + echo " length is less than or equal to this value." + echo "" + echo " --outFilterScoreMin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if its score is higher than or equal to" + echo " this value." + echo "" + echo " --outFilterScoreMinOverLread" + echo " type: double" + echo " example: 0.66" + echo " same as outFilterScoreMin, but normalized to read length (sum of mates'" + echo " lengths for paired-end reads)" + echo "" + echo " --outFilterMatchNmin" + echo " type: integer" + echo " example: 0" + echo " alignment will be output only if the number of matched bases is higher" + echo " than or equal to this value." + echo "" + echo " --outFilterMatchNminOverLread" + echo " type: double" + echo " example: 0.66" + echo " sam as outFilterMatchNmin, but normalized to the read length (sum of" + echo " mates' lengths for paired-end reads)." + echo "" + echo " --outFilterIntronMotifs" + echo " type: string" + echo " filter alignment using their motifs" + echo " - None ... no filtering" + echo " - RemoveNoncanonical ... filter out alignments that contain" + echo " non-canonical junctions" + echo " - RemoveNoncanonicalUnannotated ... filter out alignments that contain" + echo " non-canonical unannotated junctions when using annotated splice" + echo " junctions database. The annotated non-canonical junctions will be kept." + echo "" + echo " --outFilterIntronStrands" + echo " type: string" + echo " example: RemoveInconsistentStrands" + echo " filter alignments" + echo " - RemoveInconsistentStrands ... remove alignments that have" + echo " junctions with inconsistent strands" + echo " - None ... no filtering" + echo "" + echo "Output splice junctions (SJ.out.tab):" + echo " --outSJtype" + echo " type: string" + echo " example: Standard" + echo " type of splice junction output" + echo " - Standard ... standard SJ.out.tab output" + echo " - None ... no splice junction output" + echo "" + echo "Output Filtering: Splice Junctions:" + echo " --outSJfilterReads" + echo " type: string" + echo " example: All" + echo " which reads to consider for collapsed splice junctions output" + echo " - All ... all reads, unique- and multi-mappers" + echo " - Unique ... uniquely mapping reads only" + echo "" + echo " --outSJfilterOverhangMin" + echo " type: integer, multiple values allowed" + echo " example: 30;12;12;12" + echo " minimum overhang length for splice junctions on both sides for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountUniqueMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum uniquely mapping read count per junction for: (1) non-canonical" + echo " motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC" + echo " and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterCountTotalMin" + echo " type: integer, multiple values allowed" + echo " example: 3;1;1;1" + echo " minimum total (multi-mapping+unique) read count per junction for: (1)" + echo " non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif" + echo " Junctions are output if one of outSJfilterCountUniqueMin OR" + echo " outSJfilterCountTotalMin conditions are satisfied" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterDistToOtherSJmin" + echo " type: integer, multiple values allowed" + echo " example: 10;0;5;10" + echo " minimum allowed distance to other junctions' donor/acceptor" + echo " does not apply to annotated junctions" + echo "" + echo " --outSJfilterIntronMaxVsReadN" + echo " type: integer, multiple values allowed" + echo " example: 50000;100000;200000" + echo " maximum gap allowed for junctions supported by 1,2,3,,,N reads" + echo " i.e. by default junctions supported by 1 read can have gaps <=50000b, by" + echo " 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap" + echo " <=alignIntronMax" + echo " does not apply to annotated junctions" + echo "" + echo "Scoring:" + echo " --scoreGap" + echo " type: integer" + echo " example: 0" + echo " splice junction penalty (independent on intron motif)" + echo "" + echo " --scoreGapNoncan" + echo " type: integer" + echo " example: -8" + echo " non-canonical junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapGCAG" + echo " type: integer" + echo " example: -4" + echo " GC/AG and CT/GC junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGapATAC" + echo " type: integer" + echo " example: -8" + echo " AT/AC and GT/AT junction penalty (in addition to scoreGap)" + echo "" + echo " --scoreGenomicLengthLog2scale" + echo " type: integer" + echo " example: 0" + echo " extra score logarithmically scaled with genomic length of the alignment:" + echo " scoreGenomicLengthLog2scale*log2(genomicLength)" + echo "" + echo " --scoreDelOpen" + echo " type: integer" + echo " example: -2" + echo " deletion open penalty" + echo "" + echo " --scoreDelBase" + echo " type: integer" + echo " example: -2" + echo " deletion extension penalty per base (in addition to scoreDelOpen)" + echo "" + echo " --scoreInsOpen" + echo " type: integer" + echo " example: -2" + echo " insertion open penalty" + echo "" + echo " --scoreInsBase" + echo " type: integer" + echo " example: -2" + echo " insertion extension penalty per base (in addition to scoreInsOpen)" + echo "" + echo " --scoreStitchSJshift" + echo " type: integer" + echo " example: 1" + echo " maximum score reduction while searching for SJ boundaries in the" + echo " stitching step" + echo "" + echo "Alignments and Seeding:" + echo " --seedSearchStartLmax" + echo " type: integer" + echo " example: 50" + echo " defines the search start point through the read - the read is split into" + echo " pieces no longer than this value" + echo "" + echo " --seedSearchStartLmaxOverLread" + echo " type: double" + echo " example: 1.0" + echo " seedSearchStartLmax normalized to read length (sum of mates' lengths for" + echo " paired-end reads)" + echo "" + echo " --seedSearchLmax" + echo " type: integer" + echo " example: 0" + echo " defines the maximum length of the seeds, if =0 seed length is not" + echo " limited" + echo "" + echo " --seedMultimapNmax" + echo " type: integer" + echo " example: 10000" + echo " only pieces that map fewer than this value are utilized in the stitching" + echo " procedure" + echo "" + echo " --seedPerReadNmax" + echo " type: integer" + echo " example: 1000" + echo " max number of seeds per read" + echo "" + echo " --seedPerWindowNmax" + echo " type: integer" + echo " example: 50" + echo " max number of seeds per window" + echo "" + echo " --seedNoneLociPerWindow" + echo " type: integer" + echo " example: 10" + echo " max number of one seed loci per window" + echo "" + echo " --seedSplitMin" + echo " type: integer" + echo " example: 12" + echo " min length of the seed sequences split by Ns or mate gap" + echo "" + echo " --seedMapMin" + echo " type: integer" + echo " example: 5" + echo " min length of seeds to be mapped" + echo "" + echo " --alignIntronMin" + echo " type: integer" + echo " example: 21" + echo " minimum intron size, genomic gap is considered intron if its" + echo " length>=alignIntronMin, otherwise it is considered Deletion" + echo "" + echo " --alignIntronMax" + echo " type: integer" + echo " example: 0" + echo " maximum intron size, if 0, max intron size will be determined by" + echo " (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignMatesGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap between two mates, if 0, max intron gap will be determined" + echo " by (2^winBinNbits)*winAnchorDistNbins" + echo "" + echo " --alignSJoverhangMin" + echo " type: integer" + echo " example: 5" + echo " minimum overhang (i.e. block size) for spliced alignments" + echo "" + echo " --alignSJstitchMismatchNmax" + echo " type: integer, multiple values allowed" + echo " example: 0;-1;0;0" + echo " maximum number of mismatches for stitching of the splice junctions (-1:" + echo " no limit)." + echo " (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC" + echo " motif, (4) AT/AC and GT/AT motif." + echo "" + echo " --alignSJDBoverhangMin" + echo " type: integer" + echo " example: 3" + echo " minimum overhang (i.e. block size) for annotated (sjdb) spliced" + echo " alignments" + echo "" + echo " --alignSplicedMateMapLmin" + echo " type: integer" + echo " example: 0" + echo " minimum mapped length for a read mate that is spliced" + echo "" + echo " --alignSplicedMateMapLminOverLmate" + echo " type: double" + echo " example: 0.66" + echo " alignSplicedMateMapLmin normalized to mate length" + echo "" + echo " --alignWindowsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of windows per read" + echo "" + echo " --alignTranscriptsPerWindowNmax" + echo " type: integer" + echo " example: 100" + echo " max number of transcripts per window" + echo "" + echo " --alignTranscriptsPerReadNmax" + echo " type: integer" + echo " example: 10000" + echo " max number of different alignments per read to consider" + echo "" + echo " --alignEndsType" + echo " type: string" + echo " example: Local" + echo " type of read ends alignment" + echo " - Local ... standard local alignment with soft-clipping" + echo " allowed" + echo " - EndToEnd ... force end-to-end read alignment, do not" + echo " soft-clip" + echo " - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other" + echo " ends: local alignment" + echo " - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and" + echo " read2, all other ends: local alignment" + echo "" + echo " --alignEndsProtrude" + echo " type: string" + echo " example: 0 ConcordantPair" + echo " allow protrusion of alignment ends, i.e. start (end) of the +strand mate" + echo " downstream of the start (end) of the -strand mate" + echo " 1st word: int: maximum number of protrusion bases allowed" + echo " 2nd word: string:" + echo " - ConcordantPair ... report alignments with non-zero" + echo " protrusion as concordant pairs" + echo " - DiscordantPair ... report alignments with non-zero" + echo " protrusion as discordant pairs" + echo "" + echo " --alignSoftClipAtReferenceEnds" + echo " type: string" + echo " example: Yes" + echo " allow the soft-clipping of the alignments past the end of the" + echo " chromosomes" + echo " - Yes ... allow" + echo " - No ... prohibit, useful for compatibility with Cufflinks" + echo "" + echo " --alignInsertionFlush" + echo " type: string" + echo " how to flush ambiguous insertion positions" + echo " - None ... insertions are not flushed" + echo " - Right ... insertions are flushed to the right" + echo "" + echo "Paired-End reads:" + echo " --peOverlapNbasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of overlapping bases to trigger mates merging and" + echo " realignment. Specify >0 value to switch on the \"merginf of overlapping" + echo " mates\" algorithm." + echo "" + echo " --peOverlapMMp" + echo " type: double" + echo " example: 0.01" + echo " maximum proportion of mismatched bases in the overlap area" + echo "" + echo "Windows, Anchors, Binning:" + echo " --winAnchorMultimapNmax" + echo " type: integer" + echo " example: 50" + echo " max number of loci anchors are allowed to map to" + echo "" + echo " --winBinNbits" + echo " type: integer" + echo " example: 16" + echo " =log2(winBin), where winBin is the size of the bin for the" + echo " windows/clustering, each window will occupy an integer number of bins." + echo "" + echo " --winAnchorDistNbins" + echo " type: integer" + echo " example: 9" + echo " max number of bins between two anchors that allows aggregation of" + echo " anchors into one window" + echo "" + echo " --winFlankNbins" + echo " type: integer" + echo " example: 4" + echo " log2(winFlank), where win Flank is the size of the left and right" + echo " flanking regions for each window" + echo "" + echo " --winReadCoverageRelativeMin" + echo " type: double" + echo " example: 0.5" + echo " minimum relative coverage of the read sequence by the seeds in a window," + echo " for STARlong algorithm only." + echo "" + echo " --winReadCoverageBasesMin" + echo " type: integer" + echo " example: 0" + echo " minimum number of bases covered by the seeds in a window , for STARlong" + echo " algorithm only." + echo "" + echo "Chimeric Alignments:" + echo " --chimOutType" + echo " type: string, multiple values allowed" + echo " example: Junctions" + echo " type of chimeric output" + echo " - Junctions ... Chimeric.out.junction" + echo " - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file" + echo " - WithinBAM ... output into main aligned BAM files (Aligned.*.bam)" + echo " - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for" + echo " supplemental chimeric alignments (default if no 2nd word is present)" + echo " - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental" + echo " chimeric alignments" + echo "" + echo " --chimSegmentMin" + echo " type: integer" + echo " example: 0" + echo " minimum length of chimeric segment length, if ==0, no chimeric output" + echo "" + echo " --chimScoreMin" + echo " type: integer" + echo " example: 0" + echo " minimum total (summed) score of the chimeric segments" + echo "" + echo " --chimScoreDropMax" + echo " type: integer" + echo " example: 20" + echo " max drop (difference) of chimeric score (the sum of scores of all" + echo " chimeric segments) from the read length" + echo "" + echo " --chimScoreSeparation" + echo " type: integer" + echo " example: 10" + echo " minimum difference (separation) between the best chimeric score and the" + echo " next one" + echo "" + echo " --chimScoreJunctionNonGTAG" + echo " type: integer" + echo " example: -1" + echo " penalty for a non-GT/AG chimeric junction" + echo "" + echo " --chimJunctionOverhangMin" + echo " type: integer" + echo " example: 20" + echo " minimum overhang for a chimeric junction" + echo "" + echo " --chimSegmentReadGapMax" + echo " type: integer" + echo " example: 0" + echo " maximum gap in the read sequence between chimeric segments" + echo "" + echo " --chimFilter" + echo " type: string, multiple values allowed" + echo " example: banGenomicN" + echo " different filters for chimeric alignments" + echo " - None ... no filtering" + echo " - banGenomicN ... Ns are not allowed in the genome sequence around the" + echo " chimeric junction" + echo "" + echo " --chimMainSegmentMultNmax" + echo " type: integer" + echo " example: 10" + echo " maximum number of multi-alignments for the main chimeric segment. =1" + echo " will prohibit multimapping main segments." + echo "" + echo " --chimMultimapNmax" + echo " type: integer" + echo " example: 0" + echo " maximum number of chimeric multi-alignments" + echo " - 0 ... use the old scheme for chimeric detection which only considered" + echo " unique alignments" + echo "" + echo " --chimMultimapScoreRange" + echo " type: integer" + echo " example: 1" + echo " the score range for multi-mapping chimeras below the best chimeric" + echo " score. Only works with --chimMultimapNmax > 1" + echo "" + echo " --chimNonchimScoreDropMin" + echo " type: integer" + echo " example: 20" + echo " to trigger chimeric detection, the drop in the best non-chimeric" + echo " alignment score with respect to the read length has to be greater than" + echo " this value" + echo "" + echo " --chimOutJunctionFormat" + echo " type: integer" + echo " example: 0" + echo " formatting type for the Chimeric.out.junction file" + echo " - 0 ... no comment lines/headers" + echo " - 1 ... comment lines at the end of the file: command line and Nreads:" + echo " total, unique/multi-mapping" + echo "" + echo "Quantification of Annotations:" + echo " --quantMode" + echo " type: string, multiple values allowed" + echo " types of quantification requested" + echo " - - ... none" + echo " - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a" + echo " separate file" + echo " - GeneCounts ... count reads per gene" + echo "" + echo " --quantTranscriptomeBAMcompression" + echo " type: integer" + echo " example: 1" + echo " -2 to 10 transcriptome BAM compression level" + echo " - -2 ... no BAM output" + echo " - -1 ... default compression (6?)" + echo " - 0 ... no compression" + echo " - 10 ... maximum compression" + echo "" + echo " --quantTranscriptomeBan" + echo " type: string" + echo " example: IndelSoftclipSingleend" + echo " prohibit various alignment type" + echo " - IndelSoftclipSingleend ... prohibit indels, soft clipping and" + echo " single-end alignments - compatible with RSEM" + echo " - Singleend ... prohibit single-end alignments" + echo "" + echo "2-pass Mapping:" + echo " --twopassMode" + echo " type: string" + echo " 2-pass mapping mode." + echo " - None ... 1-pass mapping" + echo " - Basic ... basic 2-pass mapping, with all 1st pass junctions" + echo " inserted into the genome indices on the fly" + echo "" + echo " --twopass1readsN" + echo " type: integer" + echo " example: -1" + echo " number of reads to process for the 1st step. Use very large number (or" + echo " default -1) to map all reads in the first step." + echo "" + echo "WASP parameters:" + echo " --waspOutputMode" + echo " type: string" + echo " WASP allele-specific output type. This is re-implementation of the" + echo " original WASP mappability filtering by Bryce van de Geijn, Graham" + echo " McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original" + echo " WASP paper: Nature Methods 12, 1061-1063 (2015)," + echo " https://www.nature.com/articles/nmeth.3582 ." + echo " - SAMtag ... add WASP tags to the alignments that pass WASP" + echo " filtering" + echo "" + echo "STARsolo (single cell RNA-seq) parameters:" + echo " --soloType" + echo " type: string, multiple values allowed" + echo " type of single-cell RNA-seq" + echo " - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of" + echo " fixed length in read2, e.g. Drop-seq and 10X Chromium." + echo " - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI" + echo " of fixed length and one adapter sequence of fixed length are allowed in" + echo " read2 only (e.g. inDrop, ddSeq)." + echo " - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No" + echo " UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end]" + echo " CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or" + echo " SortedByCoordinate]" + echo " - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired-" + echo " or single-end), barcodes are corresponding read-groups, no UMI" + echo " sequences, alignments deduplicated according to alignment start and end" + echo " (after extending soft-clipped bases)" + echo "" + echo " --soloCBwhitelist" + echo " type: string, multiple values allowed" + echo " file(s) with whitelist(s) of cell barcodes. Only --soloType" + echo " CB_UMI_Complex allows more than one whitelist file." + echo " - None ... no whitelist: all cell barcodes are allowed" + echo "" + echo " --soloCBstart" + echo " type: integer" + echo " example: 1" + echo " cell barcode start base" + echo "" + echo " --soloCBlen" + echo " type: integer" + echo " example: 16" + echo " cell barcode length" + echo "" + echo " --soloUMIstart" + echo " type: integer" + echo " example: 17" + echo " UMI start base" + echo "" + echo " --soloUMIlen" + echo " type: integer" + echo " example: 10" + echo " UMI length" + echo "" + echo " --soloBarcodeReadLength" + echo " type: integer" + echo " example: 1" + echo " length of the barcode read" + echo " - 1 ... equal to sum of soloCBlen+soloUMIlen" + echo " - 0 ... not defined, do not check" + echo "" + echo " --soloBarcodeMate" + echo " type: integer" + echo " example: 0" + echo " identifies which read mate contains the barcode (CB+UMI) sequence" + echo " - 0 ... barcode sequence is on separate read, which should always be" + echo " the last file in the --readFilesIn listed" + echo " - 1 ... barcode sequence is a part of mate 1" + echo " - 2 ... barcode sequence is a part of mate 2" + echo "" + echo " --soloCBposition" + echo " type: string, multiple values allowed" + echo " position of Cell Barcode(s) on the barcode read." + echo " Presently only works with --soloType CB_UMI_Complex, and barcodes are" + echo " assumed to be on Read2." + echo " Format for each barcode: startAnchor_startPosition_endAnchor_endPosition" + echo " start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1:" + echo " read end; 2: adapter start; 3: adapter end" + echo " start(end)Position is the 0-based position with of the CB start(end)" + echo " with respect to the Anchor Base" + echo " String for different barcodes are separated by space." + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 0_0_2_-1 3_1_3_8" + echo "" + echo " --soloUMIposition" + echo " type: string" + echo " position of the UMI on the barcode read, same as soloCBposition" + echo " Example: inDrop (Zilionis et al, Nat. Protocols, 2017):" + echo " --soloCBposition 3_9_3_14" + echo "" + echo " --soloAdapterSequence" + echo " type: string" + echo " adapter sequence to anchor barcodes. Only one adapter sequence is" + echo " allowed." + echo "" + echo " --soloAdapterMismatchesNmax" + echo " type: integer" + echo " example: 1" + echo " maximum number of mismatches allowed in adapter sequence." + echo "" + echo " --soloCBmatchWLtype" + echo " type: string" + echo " example: 1MM_multi" + echo " matching the Cell Barcodes to the WhiteList" + echo " - Exact ... only exact matches allowed" + echo " - 1MM ... only one match in whitelist with 1" + echo " mismatched base allowed. Allowed CBs have to have at least one read with" + echo " exact match." + echo " - 1MM_multi ... multiple matches in whitelist with" + echo " 1 mismatched base allowed, posterior probability calculation is used" + echo " choose one of the matches." + echo " Allowed CBs have to have at least one read with exact match. This option" + echo " matches best with CellRanger 2.2.0" + echo " - 1MM_multi_pseudocounts ... same as 1MM_Multi, but" + echo " pseudocounts of 1 are added to all whitelist barcodes." + echo " - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts," + echo " multimatching to WL is allowed for CBs with N-bases. This option matches" + echo " best with CellRanger >= 3.0.0" + echo " - EditDist_2 ... allow up to edit distance of 3 fpr" + echo " each of the barcodes. May include one deletion + one insertion. Only" + echo " works with --soloType CB_UMI_Complex. Matches to multiple passlist" + echo " barcdoes are not allowed. Similar to ParseBio Split-seq pipeline." + echo "" + echo " --soloInputSAMattrBarcodeSeq" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode sequence (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeSeq CR UR ." + echo " This parameter is required when running STARsolo with input from SAM." + echo "" + echo " --soloInputSAMattrBarcodeQual" + echo " type: string, multiple values allowed" + echo " when inputting reads from a SAM file (--readsFileType SAM SE/PE), these" + echo " SAM attributes mark the barcode qualities (in proper order)." + echo " For instance, for 10X CellRanger or STARsolo BAMs, use" + echo " --soloInputSAMattrBarcodeQual CY UY ." + echo " If this parameter is '-' (default), the quality 'H' will be assigned to" + echo " all bases." + echo "" + echo " --soloStrand" + echo " type: string" + echo " example: Forward" + echo " strandedness of the solo libraries:" + echo " - Unstranded ... no strand information" + echo " - Forward ... read strand same as the original RNA molecule" + echo " - Reverse ... read strand opposite to the original RNA molecule" + echo "" + echo " --soloFeatures" + echo " type: string, multiple values allowed" + echo " example: Gene" + echo " genomic features for which the UMI counts per Cell Barcode are collected" + echo " - Gene ... genes: reads match the gene transcript" + echo " - SJ ... splice junctions: reported in SJ.out.tab" + echo " - GeneFull ... full gene (pre-mRNA): count all reads overlapping" + echo " genes' exons and introns" + echo " - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads" + echo " overlapping genes' exons and introns: prioritize 100% overlap with exons" + echo " - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads" + echo " overlapping genes' exons and introns: prioritize >50% overlap with" + echo " exons. Do not count reads with 100% exonic overlap in the antisense" + echo " direction." + echo "" + echo " --soloMultiMappers" + echo " type: string, multiple values allowed" + echo " example: Unique" + echo " counting method for reads mapping to multiple genes" + echo " - Unique ... count only reads that map to unique genes" + echo " - Uniform ... uniformly distribute multi-genic UMIs to all genes" + echo " - Rescue ... distribute UMIs proportionally to unique+uniform counts" + echo " (~ first iteration of EM)" + echo " - PropUnique ... distribute UMIs proportionally to unique mappers, if" + echo " present, and uniformly if not." + echo " - EM ... multi-gene UMIs are distributed using Expectation" + echo " Maximization algorithm" + echo "" + echo " --soloUMIdedup" + echo " type: string, multiple values allowed" + echo " example: 1MM_All" + echo " type of UMI deduplication (collapsing) algorithm" + echo " - 1MM_All ... all UMIs with 1 mismatch distance to" + echo " each other are collapsed (i.e. counted once)." + echo " - 1MM_Directional_UMItools ... follows the \"directional\" method from" + echo " the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017)." + echo " - 1MM_Directional ... same as 1MM_Directional_UMItools, but" + echo " with more stringent criteria for duplicate UMIs" + echo " - Exact ... only exactly matching UMIs are" + echo " collapsed." + echo " - NoDedup ... no deduplication of UMIs, count all" + echo " reads." + echo " - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI" + echo " collapsing." + echo "" + echo " --soloUMIfiltering" + echo " type: string, multiple values allowed" + echo " type of UMI filtering (for reads uniquely mapping to genes)" + echo " - - ... basic filtering: remove UMIs with N and" + echo " homopolymers (similar to CellRanger 2.2.0)." + echo " - MultiGeneUMI ... basic + remove lower-count UMIs that map to" + echo " more than one gene." + echo " - MultiGeneUMI_All ... basic + remove all UMIs that map to more than" + echo " one gene." + echo " - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to" + echo " more than one gene, matching CellRanger > 3.0.0 ." + echo " Only works with --soloUMIdedup 1MM_CR" + echo "" + echo " --soloOutFileNames" + echo " type: string, multiple values allowed" + echo " example: Solo.out/;features.tsv;barcodes.tsv;matrix.mtx" + echo " file names for STARsolo output:" + echo " file_name_prefix gene_names barcode_sequences" + echo " cell_feature_count_matrix" + echo "" + echo " --soloCellFilter" + echo " type: string, multiple values allowed" + echo " example: CellRanger2.2;3000;0.99;10" + echo " cell filtering type and parameters" + echo " - None ... do not output filtered cells" + echo " - TopCells ... only report top cells by UMI count, followed by" + echo " the exact number of cells" + echo " - CellRanger2.2 ... simple filtering of CellRanger 2.2." + echo " Can be followed by numbers: number of expected cells, robust maximum" + echo " percentile for UMI count, maximum to minimum ratio for UMI count" + echo " The harcoded values are from CellRanger: nExpectedCells=3000;" + echo " maxPercentile=0.99; maxMinRatio=10" + echo " - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please" + echo " cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20," + echo " 63 (2019):" + echo " " + echo "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y" + echo " Can be followed by 10 numeric parameters: nExpectedCells" + echo " maxPercentile maxMinRatio indMin indMax umiMin" + echo " umiMinFracMedian candMaxN FDR simN" + echo " The harcoded values are from CellRanger: 3000" + echo " 0.99 10 45000 90000 500 0.01" + echo " 20000 0.01 10000" + echo "" + echo " --soloOutFormatFeaturesGeneField3" + echo " type: string, multiple values allowed" + echo " example: Gene Expression" + echo " field 3 in the Gene features.tsv file. If \"-\", then no 3rd field is" + echo " output." + echo "" + echo " --soloCellReadStats" + echo " type: string" + echo " Output reads statistics for each CB" + echo " - Standard ... standard output" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "star_align_v273a main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readFilesIn) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesIn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference=*) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--reference=*\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeDir) + [ -n "$VIASH_PAR_REFERENCE" ] && ViashError Bad arguments for option \'--genomeDir\': \'$VIASH_PAR_REFERENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFileNamePrefix) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--outFileNamePrefix\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFileNamePrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runRNGseed) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --runRNGseed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runRNGseed=*) + [ -n "$VIASH_PAR_RUNRNGSEED" ] && ViashError Bad arguments for option \'--runRNGseed=*\': \'$VIASH_PAR_RUNRNGSEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNRNGSEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeLoad) + [ -n "$VIASH_PAR_GENOMELOAD" ] && ViashError Bad arguments for option \'--genomeLoad\': \'$VIASH_PAR_GENOMELOAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMELOAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeLoad. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeLoad=*) + [ -n "$VIASH_PAR_GENOMELOAD" ] && ViashError Bad arguments for option \'--genomeLoad=*\': \'$VIASH_PAR_GENOMELOAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMELOAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeFastaFiles) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES="$2" + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFastaFiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeFastaFiles=*) + if [ -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_PAR_GENOMEFASTAFILES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMEFASTAFILES="$VIASH_PAR_GENOMEFASTAFILES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeFileSizes) + if [ -z "$VIASH_PAR_GENOMEFILESIZES" ]; then + VIASH_PAR_GENOMEFILESIZES="$2" + else + VIASH_PAR_GENOMEFILESIZES="$VIASH_PAR_GENOMEFILESIZES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFileSizes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeFileSizes=*) + if [ -z "$VIASH_PAR_GENOMEFILESIZES" ]; then + VIASH_PAR_GENOMEFILESIZES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMEFILESIZES="$VIASH_PAR_GENOMEFILESIZES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeTransformOutput) + if [ -z "$VIASH_PAR_GENOMETRANSFORMOUTPUT" ]; then + VIASH_PAR_GENOMETRANSFORMOUTPUT="$2" + else + VIASH_PAR_GENOMETRANSFORMOUTPUT="$VIASH_PAR_GENOMETRANSFORMOUTPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeTransformOutput. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeTransformOutput=*) + if [ -z "$VIASH_PAR_GENOMETRANSFORMOUTPUT" ]; then + VIASH_PAR_GENOMETRANSFORMOUTPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMETRANSFORMOUTPUT="$VIASH_PAR_GENOMETRANSFORMOUTPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeChrSetMitochondrial) + if [ -z "$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL" ]; then + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$2" + else + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeChrSetMitochondrial. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeChrSetMitochondrial=*) + if [ -z "$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL" ]; then + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOMECHRSETMITOCHONDRIAL="$VIASH_PAR_GENOMECHRSETMITOCHONDRIAL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbFileChrStartEnd) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND="$2" + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbFileChrStartEnd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbFileChrStartEnd=*) + if [ -z "$VIASH_PAR_SJDBFILECHRSTARTEND" ]; then + VIASH_PAR_SJDBFILECHRSTARTEND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBFILECHRSTARTEND="$VIASH_PAR_SJDBFILECHRSTARTEND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFfile) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfile=*) + [ -n "$VIASH_PAR_SJDBGTFFILE" ] && ViashError Bad arguments for option \'--sjdbGTFfile=*\': \'$VIASH_PAR_SJDBGTFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFchrPrefix) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFchrPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFchrPrefix=*) + [ -n "$VIASH_PAR_SJDBGTFCHRPREFIX" ] && ViashError Bad arguments for option \'--sjdbGTFchrPrefix=*\': \'$VIASH_PAR_SJDBGTFCHRPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFCHRPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFfeatureExon) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfeatureExon. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFfeatureExon=*) + [ -n "$VIASH_PAR_SJDBGTFFEATUREEXON" ] && ViashError Bad arguments for option \'--sjdbGTFfeatureExon=*\': \'$VIASH_PAR_SJDBGTFFEATUREEXON\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFFEATUREEXON=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentTranscript) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentTranscript. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentTranscript=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentTranscript=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGene) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGene. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGene=*) + [ -n "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE" ] && ViashError Bad arguments for option \'--sjdbGTFtagExonParentGene=*\': \'$VIASH_PAR_SJDBGTFTAGEXONPARENTGENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBGTFTAGEXONPARENTGENE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFtagExonParentGeneName) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneName. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneName=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbGTFtagExonParentGeneType) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$2" + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFtagExonParentGeneType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbGTFtagExonParentGeneType=*) + if [ -z "$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE" ]; then + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE="$VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --sjdbOverhang) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbOverhang. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbOverhang=*) + [ -n "$VIASH_PAR_SJDBOVERHANG" ] && ViashError Bad arguments for option \'--sjdbOverhang=*\': \'$VIASH_PAR_SJDBOVERHANG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBOVERHANG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbScore) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbScore. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbScore=*) + [ -n "$VIASH_PAR_SJDBSCORE" ] && ViashError Bad arguments for option \'--sjdbScore=*\': \'$VIASH_PAR_SJDBSCORE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBSCORE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbInsertSave) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbInsertSave. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --sjdbInsertSave=*) + [ -n "$VIASH_PAR_SJDBINSERTSAVE" ] && ViashError Bad arguments for option \'--sjdbInsertSave=*\': \'$VIASH_PAR_SJDBINSERTSAVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SJDBINSERTSAVE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --varVCFfile) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --varVCFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --varVCFfile=*) + [ -n "$VIASH_PAR_VARVCFFILE" ] && ViashError Bad arguments for option \'--varVCFfile=*\': \'$VIASH_PAR_VARVCFFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VARVCFFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesType) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesType=*) + [ -n "$VIASH_PAR_READFILESTYPE" ] && ViashError Bad arguments for option \'--readFilesType=*\': \'$VIASH_PAR_READFILESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesSAMattrKeep) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP="$2" + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesSAMattrKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesSAMattrKeep=*) + if [ -z "$VIASH_PAR_READFILESSAMATTRKEEP" ]; then + VIASH_PAR_READFILESSAMATTRKEEP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESSAMATTRKEEP="$VIASH_PAR_READFILESSAMATTRKEEP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readFilesManifest) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesManifest. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesManifest=*) + [ -n "$VIASH_PAR_READFILESMANIFEST" ] && ViashError Bad arguments for option \'--readFilesManifest=*\': \'$VIASH_PAR_READFILESMANIFEST\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESMANIFEST=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesPrefix) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesPrefix=*) + [ -n "$VIASH_PAR_READFILESPREFIX" ] && ViashError Bad arguments for option \'--readFilesPrefix=*\': \'$VIASH_PAR_READFILESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READFILESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readFilesCommand) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND="$2" + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readFilesCommand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readFilesCommand=*) + if [ -z "$VIASH_PAR_READFILESCOMMAND" ]; then + VIASH_PAR_READFILESCOMMAND=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READFILESCOMMAND="$VIASH_PAR_READFILESCOMMAND;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readMapNumber) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMapNumber. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMapNumber=*) + [ -n "$VIASH_PAR_READMAPNUMBER" ] && ViashError Bad arguments for option \'--readMapNumber=*\': \'$VIASH_PAR_READMAPNUMBER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMAPNUMBER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readMatesLengthsIn) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readMatesLengthsIn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readMatesLengthsIn=*) + [ -n "$VIASH_PAR_READMATESLENGTHSIN" ] && ViashError Bad arguments for option \'--readMatesLengthsIn=*\': \'$VIASH_PAR_READMATESLENGTHSIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READMATESLENGTHSIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --readNameSeparator) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR="$2" + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readNameSeparator. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readNameSeparator=*) + if [ -z "$VIASH_PAR_READNAMESEPARATOR" ]; then + VIASH_PAR_READNAMESEPARATOR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READNAMESEPARATOR="$VIASH_PAR_READNAMESEPARATOR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --readQualityScoreBase) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --readQualityScoreBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --readQualityScoreBase=*) + [ -n "$VIASH_PAR_READQUALITYSCOREBASE" ] && ViashError Bad arguments for option \'--readQualityScoreBase=*\': \'$VIASH_PAR_READQUALITYSCOREBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_READQUALITYSCOREBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clipAdapterType) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clipAdapterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clipAdapterType=*) + [ -n "$VIASH_PAR_CLIPADAPTERTYPE" ] && ViashError Bad arguments for option \'--clipAdapterType=*\': \'$VIASH_PAR_CLIPADAPTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CLIPADAPTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --clip3pNbases) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES="$2" + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pNbases=*) + if [ -z "$VIASH_PAR_CLIP3PNBASES" ]; then + VIASH_PAR_CLIP3PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PNBASES="$VIASH_PAR_CLIP3PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterSeq) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ="$2" + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterSeq=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERSEQ" ]; then + VIASH_PAR_CLIP3PADAPTERSEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERSEQ="$VIASH_PAR_CLIP3PADAPTERSEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAdapterMMp) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP="$2" + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAdapterMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAdapterMMp=*) + if [ -z "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + VIASH_PAR_CLIP3PADAPTERMMP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PADAPTERMMP="$VIASH_PAR_CLIP3PADAPTERMMP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip3pAfterAdapterNbases) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$2" + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip3pAfterAdapterNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip3pAfterAdapterNbases=*) + if [ -z "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + VIASH_PAR_CLIP3PAFTERADAPTERNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP3PAFTERADAPTERNBASES="$VIASH_PAR_CLIP3PAFTERADAPTERNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --clip5pNbases) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES="$2" + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --clip5pNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --clip5pNbases=*) + if [ -z "$VIASH_PAR_CLIP5PNBASES" ]; then + VIASH_PAR_CLIP5PNBASES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CLIP5PNBASES="$VIASH_PAR_CLIP5PNBASES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitGenomeGenerateRAM) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitGenomeGenerateRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitGenomeGenerateRAM=*) + [ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ] && ViashError Bad arguments for option \'--limitGenomeGenerateRAM=*\': \'$VIASH_PAR_LIMITGENOMEGENERATERAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITGENOMEGENERATERAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitIObufferSize) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE="$2" + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitIObufferSize. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitIObufferSize=*) + if [ -z "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + VIASH_PAR_LIMITIOBUFFERSIZE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LIMITIOBUFFERSIZE="$VIASH_PAR_LIMITIOBUFFERSIZE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --limitOutSAMoneReadBytes) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSAMoneReadBytes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSAMoneReadBytes=*) + [ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ] && ViashError Bad arguments for option \'--limitOutSAMoneReadBytes=*\': \'$VIASH_PAR_LIMITOUTSAMONEREADBYTES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSAMONEREADBYTES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJoneRead) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJoneRead. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJoneRead=*) + [ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ] && ViashError Bad arguments for option \'--limitOutSJoneRead=*\': \'$VIASH_PAR_LIMITOUTSJONEREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJONEREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitOutSJcollapsed) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitOutSJcollapsed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitOutSJcollapsed=*) + [ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ] && ViashError Bad arguments for option \'--limitOutSJcollapsed=*\': \'$VIASH_PAR_LIMITOUTSJCOLLAPSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITOUTSJCOLLAPSED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitBAMsortRAM) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitBAMsortRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitBAMsortRAM=*) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM=*\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitSjdbInsertNsj) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitSjdbInsertNsj. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitSjdbInsertNsj=*) + [ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ] && ViashError Bad arguments for option \'--limitSjdbInsertNsj=*\': \'$VIASH_PAR_LIMITSJDBINSERTNSJ\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITSJDBINSERTNSJ=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitNreadsSoft) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitNreadsSoft. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitNreadsSoft=*) + [ -n "$VIASH_PAR_LIMITNREADSSOFT" ] && ViashError Bad arguments for option \'--limitNreadsSoft=*\': \'$VIASH_PAR_LIMITNREADSSOFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITNREADSSOFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outTmpKeep) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outTmpKeep. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outTmpKeep=*) + [ -n "$VIASH_PAR_OUTTMPKEEP" ] && ViashError Bad arguments for option \'--outTmpKeep=*\': \'$VIASH_PAR_OUTTMPKEEP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTTMPKEEP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outStd) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outStd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outStd=*) + [ -n "$VIASH_PAR_OUTSTD" ] && ViashError Bad arguments for option \'--outStd=*\': \'$VIASH_PAR_OUTSTD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSTD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outReadsUnmapped) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outReadsUnmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outReadsUnmapped=*) + [ -n "$VIASH_PAR_OUTREADSUNMAPPED" ] && ViashError Bad arguments for option \'--outReadsUnmapped=*\': \'$VIASH_PAR_OUTREADSUNMAPPED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTREADSUNMAPPED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outQSconversionAdd) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outQSconversionAdd. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outQSconversionAdd=*) + [ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ] && ViashError Bad arguments for option \'--outQSconversionAdd=*\': \'$VIASH_PAR_OUTQSCONVERSIONADD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTQSCONVERSIONADD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outMultimapperOrder) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outMultimapperOrder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outMultimapperOrder=*) + [ -n "$VIASH_PAR_OUTMULTIMAPPERORDER" ] && ViashError Bad arguments for option \'--outMultimapperOrder=*\': \'$VIASH_PAR_OUTMULTIMAPPERORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTMULTIMAPPERORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMtype) + if [ -z "$VIASH_PAR_OUTSAMTYPE" ]; then + VIASH_PAR_OUTSAMTYPE="$2" + else + VIASH_PAR_OUTSAMTYPE="$VIASH_PAR_OUTSAMTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMtype=*) + if [ -z "$VIASH_PAR_OUTSAMTYPE" ]; then + VIASH_PAR_OUTSAMTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMTYPE="$VIASH_PAR_OUTSAMTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMmode) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmode=*) + [ -n "$VIASH_PAR_OUTSAMMODE" ] && ViashError Bad arguments for option \'--outSAMmode=*\': \'$VIASH_PAR_OUTSAMMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMstrandField) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMstrandField. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMstrandField=*) + [ -n "$VIASH_PAR_OUTSAMSTRANDFIELD" ] && ViashError Bad arguments for option \'--outSAMstrandField=*\': \'$VIASH_PAR_OUTSAMSTRANDFIELD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMSTRANDFIELD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattributes) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES="$2" + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattributes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattributes=*) + if [ -z "$VIASH_PAR_OUTSAMATTRIBUTES" ]; then + VIASH_PAR_OUTSAMATTRIBUTES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRIBUTES="$VIASH_PAR_OUTSAMATTRIBUTES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMattrIHstart) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrIHstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrIHstart=*) + [ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ] && ViashError Bad arguments for option \'--outSAMattrIHstart=*\': \'$VIASH_PAR_OUTSAMATTRIHSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMATTRIHSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMunmapped) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED="$2" + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMunmapped. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMunmapped=*) + if [ -z "$VIASH_PAR_OUTSAMUNMAPPED" ]; then + VIASH_PAR_OUTSAMUNMAPPED=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMUNMAPPED="$VIASH_PAR_OUTSAMUNMAPPED;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMorder) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMorder. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMorder=*) + [ -n "$VIASH_PAR_OUTSAMORDER" ] && ViashError Bad arguments for option \'--outSAMorder=*\': \'$VIASH_PAR_OUTSAMORDER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMORDER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMprimaryFlag) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMprimaryFlag. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMprimaryFlag=*) + [ -n "$VIASH_PAR_OUTSAMPRIMARYFLAG" ] && ViashError Bad arguments for option \'--outSAMprimaryFlag=*\': \'$VIASH_PAR_OUTSAMPRIMARYFLAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMPRIMARYFLAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMreadID) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMreadID. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMreadID=*) + [ -n "$VIASH_PAR_OUTSAMREADID" ] && ViashError Bad arguments for option \'--outSAMreadID=*\': \'$VIASH_PAR_OUTSAMREADID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMREADID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMmapqUnique) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmapqUnique. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmapqUnique=*) + [ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ] && ViashError Bad arguments for option \'--outSAMmapqUnique=*\': \'$VIASH_PAR_OUTSAMMAPQUNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMAPQUNIQUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagOR) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagOR. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagOR=*) + [ -n "$VIASH_PAR_OUTSAMFLAGOR" ] && ViashError Bad arguments for option \'--outSAMflagOR=*\': \'$VIASH_PAR_OUTSAMFLAGOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMflagAND) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMflagAND. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMflagAND=*) + [ -n "$VIASH_PAR_OUTSAMFLAGAND" ] && ViashError Bad arguments for option \'--outSAMflagAND=*\': \'$VIASH_PAR_OUTSAMFLAGAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMFLAGAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMattrRGline) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE="$2" + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMattrRGline. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMattrRGline=*) + if [ -z "$VIASH_PAR_OUTSAMATTRRGLINE" ]; then + VIASH_PAR_OUTSAMATTRRGLINE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMATTRRGLINE="$VIASH_PAR_OUTSAMATTRRGLINE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderHD) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD="$2" + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderHD. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderHD=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERHD" ]; then + VIASH_PAR_OUTSAMHEADERHD=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERHD="$VIASH_PAR_OUTSAMHEADERHD;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderPG) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG="$2" + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderPG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderPG=*) + if [ -z "$VIASH_PAR_OUTSAMHEADERPG" ]; then + VIASH_PAR_OUTSAMHEADERPG=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMHEADERPG="$VIASH_PAR_OUTSAMHEADERPG;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMheaderCommentFile) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMheaderCommentFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMheaderCommentFile=*) + [ -n "$VIASH_PAR_OUTSAMHEADERCOMMENTFILE" ] && ViashError Bad arguments for option \'--outSAMheaderCommentFile=*\': \'$VIASH_PAR_OUTSAMHEADERCOMMENTFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMHEADERCOMMENTFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMfilter) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER="$2" + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMfilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMfilter=*) + if [ -z "$VIASH_PAR_OUTSAMFILTER" ]; then + VIASH_PAR_OUTSAMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSAMFILTER="$VIASH_PAR_OUTSAMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSAMmultNmax) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMmultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMmultNmax=*) + [ -n "$VIASH_PAR_OUTSAMMULTNMAX" ] && ViashError Bad arguments for option \'--outSAMmultNmax=*\': \'$VIASH_PAR_OUTSAMMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSAMtlen) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSAMtlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSAMtlen=*) + [ -n "$VIASH_PAR_OUTSAMTLEN" ] && ViashError Bad arguments for option \'--outSAMtlen=*\': \'$VIASH_PAR_OUTSAMTLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSAMTLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMcompression) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMcompression=*) + [ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--outBAMcompression=*\': \'$VIASH_PAR_OUTBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingThreadN) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingThreadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingThreadN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ] && ViashError Bad arguments for option \'--outBAMsortingThreadN=*\': \'$VIASH_PAR_OUTBAMSORTINGTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGTHREADN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outBAMsortingBinsN) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outBAMsortingBinsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outBAMsortingBinsN=*) + [ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ] && ViashError Bad arguments for option \'--outBAMsortingBinsN=*\': \'$VIASH_PAR_OUTBAMSORTINGBINSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTBAMSORTINGBINSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesType) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesType=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESTYPE" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesType=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --bamRemoveDuplicatesMate2basesN) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --bamRemoveDuplicatesMate2basesN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --bamRemoveDuplicatesMate2basesN=*) + [ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ] && ViashError Bad arguments for option \'--bamRemoveDuplicatesMate2basesN=*\': \'$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigType) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE="$2" + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigType=*) + if [ -z "$VIASH_PAR_OUTWIGTYPE" ]; then + VIASH_PAR_OUTWIGTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTWIGTYPE="$VIASH_PAR_OUTWIGTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outWigStrand) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigStrand=*) + [ -n "$VIASH_PAR_OUTWIGSTRAND" ] && ViashError Bad arguments for option \'--outWigStrand=*\': \'$VIASH_PAR_OUTWIGSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigReferencesPrefix) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigReferencesPrefix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigReferencesPrefix=*) + [ -n "$VIASH_PAR_OUTWIGREFERENCESPREFIX" ] && ViashError Bad arguments for option \'--outWigReferencesPrefix=*\': \'$VIASH_PAR_OUTWIGREFERENCESPREFIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGREFERENCESPREFIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outWigNorm) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outWigNorm. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outWigNorm=*) + [ -n "$VIASH_PAR_OUTWIGNORM" ] && ViashError Bad arguments for option \'--outWigNorm=*\': \'$VIASH_PAR_OUTWIGNORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTWIGNORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterType) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterType=*) + [ -n "$VIASH_PAR_OUTFILTERTYPE" ] && ViashError Bad arguments for option \'--outFilterType=*\': \'$VIASH_PAR_OUTFILTERTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapScoreRange) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapScoreRange=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--outFilterMultimapScoreRange=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMultimapNmax) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMultimapNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--outFilterMultimapNmax=*\': \'$VIASH_PAR_OUTFILTERMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMismatchNoverReadLmax) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMismatchNoverReadLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMismatchNoverReadLmax=*) + [ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ] && ViashError Bad arguments for option \'--outFilterMismatchNoverReadLmax=*\': \'$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMin) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMin=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ] && ViashError Bad arguments for option \'--outFilterScoreMin=*\': \'$VIASH_PAR_OUTFILTERSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterScoreMinOverLread) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterScoreMinOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterScoreMinOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterScoreMinOverLread=*\': \'$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERSCOREMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNmin) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNmin=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ] && ViashError Bad arguments for option \'--outFilterMatchNmin=*\': \'$VIASH_PAR_OUTFILTERMATCHNMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterMatchNminOverLread) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterMatchNminOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterMatchNminOverLread=*) + [ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ] && ViashError Bad arguments for option \'--outFilterMatchNminOverLread=*\': \'$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronMotifs) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronMotifs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronMotifs=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONMOTIFS" ] && ViashError Bad arguments for option \'--outFilterIntronMotifs=*\': \'$VIASH_PAR_OUTFILTERINTRONMOTIFS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONMOTIFS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outFilterIntronStrands) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outFilterIntronStrands. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outFilterIntronStrands=*) + [ -n "$VIASH_PAR_OUTFILTERINTRONSTRANDS" ] && ViashError Bad arguments for option \'--outFilterIntronStrands=*\': \'$VIASH_PAR_OUTFILTERINTRONSTRANDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTFILTERINTRONSTRANDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJtype) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJtype=*) + [ -n "$VIASH_PAR_OUTSJTYPE" ] && ViashError Bad arguments for option \'--outSJtype=*\': \'$VIASH_PAR_OUTSJTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterReads) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterReads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterReads=*) + [ -n "$VIASH_PAR_OUTSJFILTERREADS" ] && ViashError Bad arguments for option \'--outSJfilterReads=*\': \'$VIASH_PAR_OUTSJFILTERREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTSJFILTERREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --outSJfilterOverhangMin) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$2" + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterOverhangMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + VIASH_PAR_OUTSJFILTEROVERHANGMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTEROVERHANGMIN="$VIASH_PAR_OUTSJFILTEROVERHANGMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountUniqueMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountUniqueMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountUniqueMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN="$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterCountTotalMin) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$2" + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterCountTotalMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterCountTotalMin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN="$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterDistToOtherSJmin) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$2" + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterDistToOtherSJmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterDistToOtherSJmin=*) + if [ -z "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN="$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --outSJfilterIntronMaxVsReadN) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$2" + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --outSJfilterIntronMaxVsReadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --outSJfilterIntronMaxVsReadN=*) + if [ -z "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN="$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --scoreGap) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGap. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGap=*) + [ -n "$VIASH_PAR_SCOREGAP" ] && ViashError Bad arguments for option \'--scoreGap=*\': \'$VIASH_PAR_SCOREGAP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapNoncan) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapNoncan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapNoncan=*) + [ -n "$VIASH_PAR_SCOREGAPNONCAN" ] && ViashError Bad arguments for option \'--scoreGapNoncan=*\': \'$VIASH_PAR_SCOREGAPNONCAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPNONCAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapGCAG) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapGCAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapGCAG=*) + [ -n "$VIASH_PAR_SCOREGAPGCAG" ] && ViashError Bad arguments for option \'--scoreGapGCAG=*\': \'$VIASH_PAR_SCOREGAPGCAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPGCAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGapATAC) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGapATAC. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGapATAC=*) + [ -n "$VIASH_PAR_SCOREGAPATAC" ] && ViashError Bad arguments for option \'--scoreGapATAC=*\': \'$VIASH_PAR_SCOREGAPATAC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGAPATAC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreGenomicLengthLog2scale) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreGenomicLengthLog2scale. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreGenomicLengthLog2scale=*) + [ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ] && ViashError Bad arguments for option \'--scoreGenomicLengthLog2scale=*\': \'$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelOpen) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelOpen=*) + [ -n "$VIASH_PAR_SCOREDELOPEN" ] && ViashError Bad arguments for option \'--scoreDelOpen=*\': \'$VIASH_PAR_SCOREDELOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreDelBase) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreDelBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreDelBase=*) + [ -n "$VIASH_PAR_SCOREDELBASE" ] && ViashError Bad arguments for option \'--scoreDelBase=*\': \'$VIASH_PAR_SCOREDELBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREDELBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsOpen) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsOpen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsOpen=*) + [ -n "$VIASH_PAR_SCOREINSOPEN" ] && ViashError Bad arguments for option \'--scoreInsOpen=*\': \'$VIASH_PAR_SCOREINSOPEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSOPEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreInsBase) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreInsBase. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreInsBase=*) + [ -n "$VIASH_PAR_SCOREINSBASE" ] && ViashError Bad arguments for option \'--scoreInsBase=*\': \'$VIASH_PAR_SCOREINSBASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCOREINSBASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scoreStitchSJshift) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scoreStitchSJshift. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scoreStitchSJshift=*) + [ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ] && ViashError Bad arguments for option \'--scoreStitchSJshift=*\': \'$VIASH_PAR_SCORESTITCHSJSHIFT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCORESTITCHSJSHIFT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmax) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ] && ViashError Bad arguments for option \'--seedSearchStartLmax=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchStartLmaxOverLread) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchStartLmaxOverLread. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchStartLmaxOverLread=*) + [ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ] && ViashError Bad arguments for option \'--seedSearchStartLmaxOverLread=*\': \'$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSearchLmax) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSearchLmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSearchLmax=*) + [ -n "$VIASH_PAR_SEEDSEARCHLMAX" ] && ViashError Bad arguments for option \'--seedSearchLmax=*\': \'$VIASH_PAR_SEEDSEARCHLMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSEARCHLMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMultimapNmax) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMultimapNmax=*) + [ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--seedMultimapNmax=*\': \'$VIASH_PAR_SEEDMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerReadNmax) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerReadNmax=*) + [ -n "$VIASH_PAR_SEEDPERREADNMAX" ] && ViashError Bad arguments for option \'--seedPerReadNmax=*\': \'$VIASH_PAR_SEEDPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedPerWindowNmax) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedPerWindowNmax=*) + [ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--seedPerWindowNmax=*\': \'$VIASH_PAR_SEEDPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedNoneLociPerWindow) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedNoneLociPerWindow. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedNoneLociPerWindow=*) + [ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ] && ViashError Bad arguments for option \'--seedNoneLociPerWindow=*\': \'$VIASH_PAR_SEEDNONELOCIPERWINDOW\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDNONELOCIPERWINDOW=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedSplitMin) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedSplitMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedSplitMin=*) + [ -n "$VIASH_PAR_SEEDSPLITMIN" ] && ViashError Bad arguments for option \'--seedSplitMin=*\': \'$VIASH_PAR_SEEDSPLITMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDSPLITMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seedMapMin) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seedMapMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seedMapMin=*) + [ -n "$VIASH_PAR_SEEDMAPMIN" ] && ViashError Bad arguments for option \'--seedMapMin=*\': \'$VIASH_PAR_SEEDMAPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEEDMAPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMin) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMin=*) + [ -n "$VIASH_PAR_ALIGNINTRONMIN" ] && ViashError Bad arguments for option \'--alignIntronMin=*\': \'$VIASH_PAR_ALIGNINTRONMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignIntronMax) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignIntronMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignIntronMax=*) + [ -n "$VIASH_PAR_ALIGNINTRONMAX" ] && ViashError Bad arguments for option \'--alignIntronMax=*\': \'$VIASH_PAR_ALIGNINTRONMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINTRONMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignMatesGapMax) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignMatesGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignMatesGapMax=*) + [ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ] && ViashError Bad arguments for option \'--alignMatesGapMax=*\': \'$VIASH_PAR_ALIGNMATESGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNMATESGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJoverhangMin=*\': \'$VIASH_PAR_ALIGNSJOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSJstitchMismatchNmax) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$2" + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJstitchMismatchNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJstitchMismatchNmax=*) + if [ -z "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX="$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --alignSJDBoverhangMin) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSJDBoverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSJDBoverhangMin=*) + [ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ] && ViashError Bad arguments for option \'--alignSJDBoverhangMin=*\': \'$VIASH_PAR_ALIGNSJDBOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSJDBOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLmin) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLmin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLmin=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLmin=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSplicedMateMapLminOverLmate) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSplicedMateMapLminOverLmate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSplicedMateMapLminOverLmate=*) + [ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ] && ViashError Bad arguments for option \'--alignSplicedMateMapLminOverLmate=*\': \'$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignWindowsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignWindowsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignWindowsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignWindowsPerReadNmax=*\': \'$VIASH_PAR_ALIGNWINDOWSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNWINDOWSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerWindowNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerWindowNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerWindowNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerWindowNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignTranscriptsPerReadNmax) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignTranscriptsPerReadNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignTranscriptsPerReadNmax=*) + [ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ] && ViashError Bad arguments for option \'--alignTranscriptsPerReadNmax=*\': \'$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsType) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsType=*) + [ -n "$VIASH_PAR_ALIGNENDSTYPE" ] && ViashError Bad arguments for option \'--alignEndsType=*\': \'$VIASH_PAR_ALIGNENDSTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignEndsProtrude) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignEndsProtrude. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignEndsProtrude=*) + [ -n "$VIASH_PAR_ALIGNENDSPROTRUDE" ] && ViashError Bad arguments for option \'--alignEndsProtrude=*\': \'$VIASH_PAR_ALIGNENDSPROTRUDE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNENDSPROTRUDE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignSoftClipAtReferenceEnds) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignSoftClipAtReferenceEnds. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignSoftClipAtReferenceEnds=*) + [ -n "$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS" ] && ViashError Bad arguments for option \'--alignSoftClipAtReferenceEnds=*\': \'$VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --alignInsertionFlush) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --alignInsertionFlush. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --alignInsertionFlush=*) + [ -n "$VIASH_PAR_ALIGNINSERTIONFLUSH" ] && ViashError Bad arguments for option \'--alignInsertionFlush=*\': \'$VIASH_PAR_ALIGNINSERTIONFLUSH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ALIGNINSERTIONFLUSH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapNbasesMin) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapNbasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapNbasesMin=*) + [ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ] && ViashError Bad arguments for option \'--peOverlapNbasesMin=*\': \'$VIASH_PAR_PEOVERLAPNBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPNBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --peOverlapMMp) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --peOverlapMMp. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --peOverlapMMp=*) + [ -n "$VIASH_PAR_PEOVERLAPMMP" ] && ViashError Bad arguments for option \'--peOverlapMMp=*\': \'$VIASH_PAR_PEOVERLAPMMP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PEOVERLAPMMP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorMultimapNmax) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorMultimapNmax=*) + [ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--winAnchorMultimapNmax=*\': \'$VIASH_PAR_WINANCHORMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winBinNbits) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winBinNbits. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winBinNbits=*) + [ -n "$VIASH_PAR_WINBINNBITS" ] && ViashError Bad arguments for option \'--winBinNbits=*\': \'$VIASH_PAR_WINBINNBITS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINBINNBITS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winAnchorDistNbins) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winAnchorDistNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winAnchorDistNbins=*) + [ -n "$VIASH_PAR_WINANCHORDISTNBINS" ] && ViashError Bad arguments for option \'--winAnchorDistNbins=*\': \'$VIASH_PAR_WINANCHORDISTNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINANCHORDISTNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winFlankNbins) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winFlankNbins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winFlankNbins=*) + [ -n "$VIASH_PAR_WINFLANKNBINS" ] && ViashError Bad arguments for option \'--winFlankNbins=*\': \'$VIASH_PAR_WINFLANKNBINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINFLANKNBINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageRelativeMin) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageRelativeMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageRelativeMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ] && ViashError Bad arguments for option \'--winReadCoverageRelativeMin=*\': \'$VIASH_PAR_WINREADCOVERAGERELATIVEMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGERELATIVEMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --winReadCoverageBasesMin) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --winReadCoverageBasesMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --winReadCoverageBasesMin=*) + [ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ] && ViashError Bad arguments for option \'--winReadCoverageBasesMin=*\': \'$VIASH_PAR_WINREADCOVERAGEBASESMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WINREADCOVERAGEBASESMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutType) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE="$2" + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutType=*) + if [ -z "$VIASH_PAR_CHIMOUTTYPE" ]; then + VIASH_PAR_CHIMOUTTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMOUTTYPE="$VIASH_PAR_CHIMOUTTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimSegmentMin) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentMin=*) + [ -n "$VIASH_PAR_CHIMSEGMENTMIN" ] && ViashError Bad arguments for option \'--chimSegmentMin=*\': \'$VIASH_PAR_CHIMSEGMENTMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreMin) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreMin=*) + [ -n "$VIASH_PAR_CHIMSCOREMIN" ] && ViashError Bad arguments for option \'--chimScoreMin=*\': \'$VIASH_PAR_CHIMSCOREMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreDropMax) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreDropMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreDropMax=*) + [ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ] && ViashError Bad arguments for option \'--chimScoreDropMax=*\': \'$VIASH_PAR_CHIMSCOREDROPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREDROPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreSeparation) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreSeparation. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreSeparation=*) + [ -n "$VIASH_PAR_CHIMSCORESEPARATION" ] && ViashError Bad arguments for option \'--chimScoreSeparation=*\': \'$VIASH_PAR_CHIMSCORESEPARATION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCORESEPARATION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimScoreJunctionNonGTAG) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimScoreJunctionNonGTAG. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimScoreJunctionNonGTAG=*) + [ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ] && ViashError Bad arguments for option \'--chimScoreJunctionNonGTAG=*\': \'$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimJunctionOverhangMin) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimJunctionOverhangMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimJunctionOverhangMin=*) + [ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ] && ViashError Bad arguments for option \'--chimJunctionOverhangMin=*\': \'$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMJUNCTIONOVERHANGMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimSegmentReadGapMax) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimSegmentReadGapMax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimSegmentReadGapMax=*) + [ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ] && ViashError Bad arguments for option \'--chimSegmentReadGapMax=*\': \'$VIASH_PAR_CHIMSEGMENTREADGAPMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMSEGMENTREADGAPMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimFilter) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER="$2" + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimFilter=*) + if [ -z "$VIASH_PAR_CHIMFILTER" ]; then + VIASH_PAR_CHIMFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CHIMFILTER="$VIASH_PAR_CHIMFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --chimMainSegmentMultNmax) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMainSegmentMultNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMainSegmentMultNmax=*) + [ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ] && ViashError Bad arguments for option \'--chimMainSegmentMultNmax=*\': \'$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMAINSEGMENTMULTNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapNmax) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapNmax=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ] && ViashError Bad arguments for option \'--chimMultimapNmax=*\': \'$VIASH_PAR_CHIMMULTIMAPNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimMultimapScoreRange) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimMultimapScoreRange. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimMultimapScoreRange=*) + [ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ] && ViashError Bad arguments for option \'--chimMultimapScoreRange=*\': \'$VIASH_PAR_CHIMMULTIMAPSCORERANGE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMMULTIMAPSCORERANGE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimNonchimScoreDropMin) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimNonchimScoreDropMin. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimNonchimScoreDropMin=*) + [ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ] && ViashError Bad arguments for option \'--chimNonchimScoreDropMin=*\': \'$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMNONCHIMSCOREDROPMIN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --chimOutJunctionFormat) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --chimOutJunctionFormat. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --chimOutJunctionFormat=*) + [ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ] && ViashError Bad arguments for option \'--chimOutJunctionFormat=*\': \'$VIASH_PAR_CHIMOUTJUNCTIONFORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CHIMOUTJUNCTIONFORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantMode) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE="$2" + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantMode=*) + if [ -z "$VIASH_PAR_QUANTMODE" ]; then + VIASH_PAR_QUANTMODE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_QUANTMODE="$VIASH_PAR_QUANTMODE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --quantTranscriptomeBAMcompression) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBAMcompression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBAMcompression=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ] && ViashError Bad arguments for option \'--quantTranscriptomeBAMcompression=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --quantTranscriptomeBan) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --quantTranscriptomeBan. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --quantTranscriptomeBan=*) + [ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAN" ] && ViashError Bad arguments for option \'--quantTranscriptomeBan=*\': \'$VIASH_PAR_QUANTTRANSCRIPTOMEBAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_QUANTTRANSCRIPTOMEBAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopassMode) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopassMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopassMode=*) + [ -n "$VIASH_PAR_TWOPASSMODE" ] && ViashError Bad arguments for option \'--twopassMode=*\': \'$VIASH_PAR_TWOPASSMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASSMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --twopass1readsN) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --twopass1readsN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --twopass1readsN=*) + [ -n "$VIASH_PAR_TWOPASS1READSN" ] && ViashError Bad arguments for option \'--twopass1readsN=*\': \'$VIASH_PAR_TWOPASS1READSN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TWOPASS1READSN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --waspOutputMode) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --waspOutputMode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --waspOutputMode=*) + [ -n "$VIASH_PAR_WASPOUTPUTMODE" ] && ViashError Bad arguments for option \'--waspOutputMode=*\': \'$VIASH_PAR_WASPOUTPUTMODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WASPOUTPUTMODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloType) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE="$2" + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloType. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloType=*) + if [ -z "$VIASH_PAR_SOLOTYPE" ]; then + VIASH_PAR_SOLOTYPE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOTYPE="$VIASH_PAR_SOLOTYPE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBwhitelist) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST="$2" + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBwhitelist. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBwhitelist=*) + if [ -z "$VIASH_PAR_SOLOCBWHITELIST" ]; then + VIASH_PAR_SOLOCBWHITELIST=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBWHITELIST="$VIASH_PAR_SOLOCBWHITELIST;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCBstart) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBstart=*) + [ -n "$VIASH_PAR_SOLOCBSTART" ] && ViashError Bad arguments for option \'--soloCBstart=*\': \'$VIASH_PAR_SOLOCBSTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBSTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBlen) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBlen=*) + [ -n "$VIASH_PAR_SOLOCBLEN" ] && ViashError Bad arguments for option \'--soloCBlen=*\': \'$VIASH_PAR_SOLOCBLEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBLEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIstart) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIstart. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIstart=*) + [ -n "$VIASH_PAR_SOLOUMISTART" ] && ViashError Bad arguments for option \'--soloUMIstart=*\': \'$VIASH_PAR_SOLOUMISTART\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMISTART=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloUMIlen) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIlen. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIlen=*) + [ -n "$VIASH_PAR_SOLOUMILEN" ] && ViashError Bad arguments for option \'--soloUMIlen=*\': \'$VIASH_PAR_SOLOUMILEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMILEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeReadLength) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeReadLength. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeReadLength=*) + [ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ] && ViashError Bad arguments for option \'--soloBarcodeReadLength=*\': \'$VIASH_PAR_SOLOBARCODEREADLENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEREADLENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloBarcodeMate) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloBarcodeMate. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloBarcodeMate=*) + [ -n "$VIASH_PAR_SOLOBARCODEMATE" ] && ViashError Bad arguments for option \'--soloBarcodeMate=*\': \'$VIASH_PAR_SOLOBARCODEMATE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOBARCODEMATE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBposition) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION="$2" + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBposition=*) + if [ -z "$VIASH_PAR_SOLOCBPOSITION" ]; then + VIASH_PAR_SOLOCBPOSITION=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCBPOSITION="$VIASH_PAR_SOLOCBPOSITION;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIposition) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIposition. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIposition=*) + [ -n "$VIASH_PAR_SOLOUMIPOSITION" ] && ViashError Bad arguments for option \'--soloUMIposition=*\': \'$VIASH_PAR_SOLOUMIPOSITION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOUMIPOSITION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterSequence) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterSequence. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterSequence=*) + [ -n "$VIASH_PAR_SOLOADAPTERSEQUENCE" ] && ViashError Bad arguments for option \'--soloAdapterSequence=*\': \'$VIASH_PAR_SOLOADAPTERSEQUENCE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERSEQUENCE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloAdapterMismatchesNmax) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloAdapterMismatchesNmax. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloAdapterMismatchesNmax=*) + [ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ] && ViashError Bad arguments for option \'--soloAdapterMismatchesNmax=*\': \'$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOADAPTERMISMATCHESNMAX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloCBmatchWLtype) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCBmatchWLtype. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCBmatchWLtype=*) + [ -n "$VIASH_PAR_SOLOCBMATCHWLTYPE" ] && ViashError Bad arguments for option \'--soloCBmatchWLtype=*\': \'$VIASH_PAR_SOLOCBMATCHWLTYPE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCBMATCHWLTYPE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloInputSAMattrBarcodeSeq) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeSeq. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeSeq=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ="$VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloInputSAMattrBarcodeQual) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$2" + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloInputSAMattrBarcodeQual. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloInputSAMattrBarcodeQual=*) + if [ -z "$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL" ]; then + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL="$VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloStrand) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloStrand. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloStrand=*) + [ -n "$VIASH_PAR_SOLOSTRAND" ] && ViashError Bad arguments for option \'--soloStrand=*\': \'$VIASH_PAR_SOLOSTRAND\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOSTRAND=$(ViashRemoveFlags "$1") + shift 1 + ;; + --soloFeatures) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES="$2" + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloFeatures. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloFeatures=*) + if [ -z "$VIASH_PAR_SOLOFEATURES" ]; then + VIASH_PAR_SOLOFEATURES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOFEATURES="$VIASH_PAR_SOLOFEATURES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloMultiMappers) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS="$2" + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloMultiMappers. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloMultiMappers=*) + if [ -z "$VIASH_PAR_SOLOMULTIMAPPERS" ]; then + VIASH_PAR_SOLOMULTIMAPPERS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOMULTIMAPPERS="$VIASH_PAR_SOLOMULTIMAPPERS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIdedup) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP="$2" + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIdedup. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIdedup=*) + if [ -z "$VIASH_PAR_SOLOUMIDEDUP" ]; then + VIASH_PAR_SOLOUMIDEDUP=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIDEDUP="$VIASH_PAR_SOLOUMIDEDUP;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloUMIfiltering) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING="$2" + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloUMIfiltering. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloUMIfiltering=*) + if [ -z "$VIASH_PAR_SOLOUMIFILTERING" ]; then + VIASH_PAR_SOLOUMIFILTERING=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOUMIFILTERING="$VIASH_PAR_SOLOUMIFILTERING;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFileNames) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES="$2" + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFileNames. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFileNames=*) + if [ -z "$VIASH_PAR_SOLOOUTFILENAMES" ]; then + VIASH_PAR_SOLOOUTFILENAMES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFILENAMES="$VIASH_PAR_SOLOOUTFILENAMES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellFilter) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER="$2" + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellFilter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellFilter=*) + if [ -z "$VIASH_PAR_SOLOCELLFILTER" ]; then + VIASH_PAR_SOLOCELLFILTER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOCELLFILTER="$VIASH_PAR_SOLOCELLFILTER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloOutFormatFeaturesGeneField3) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$2" + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloOutFormatFeaturesGeneField3. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloOutFormatFeaturesGeneField3=*) + if [ -z "$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3" ]; then + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3=$(ViashRemoveFlags "$1") + else + VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3="$VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --soloCellReadStats) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --soloCellReadStats. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --soloCellReadStats=*) + [ -n "$VIASH_PAR_SOLOCELLREADSTATS" ] && ViashError Bad arguments for option \'--soloCellReadStats=*\': \'$VIASH_PAR_SOLOCELLREADSTATS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SOLOCELLREADSTATS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/mapping/star_align_v273a:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE+x} ]; then + ViashError '--reference' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ] && [ ! -e "$VIASH_PAR_REFERENCE" ]; then + ViashError "Input file '$VIASH_PAR_REFERENCE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ] && [ ! -e "$VIASH_PAR_SJDBGTFFILE" ]; then + ViashError "Input file '$VIASH_PAR_SJDBGTFFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ] && [ ! -e "$VIASH_PAR_READFILESMANIFEST" ]; then + ViashError "Input file '$VIASH_PAR_READFILESMANIFEST' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_RUNRNGSEED" ]]; then + if ! [[ "$VIASH_PAR_RUNRNGSEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--runRNGseed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_GENOMEFILESIZES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_GENOMEFILESIZES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genomeFileSizes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_SJDBOVERHANG" ]]; then + if ! [[ "$VIASH_PAR_SJDBOVERHANG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbOverhang' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SJDBSCORE" ]]; then + if ! [[ "$VIASH_PAR_SJDBSCORE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--sjdbScore' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READMAPNUMBER" ]]; then + if ! [[ "$VIASH_PAR_READMAPNUMBER" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readMapNumber' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_READQUALITYSCOREBASE" ]]; then + if ! [[ "$VIASH_PAR_READQUALITYSCOREBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--readQualityScoreBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_CLIP3PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PADAPTERMMP" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PADAPTERMMP; do + if ! [[ "${val}" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--clip3pAdapterMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP3PAFTERADAPTERNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP3PAFTERADAPTERNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip3pAfterAdapterNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_CLIP5PNBASES" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_CLIP5PNBASES; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--clip5pNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITGENOMEGENERATERAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITGENOMEGENERATERAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitGenomeGenerateRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_LIMITIOBUFFERSIZE" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_LIMITIOBUFFERSIZE; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitIObufferSize' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSAMONEREADBYTES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSAMoneReadBytes' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJONEREAD" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJONEREAD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJoneRead' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITOUTSJCOLLAPSED" ]]; then + if ! [[ "$VIASH_PAR_LIMITOUTSJCOLLAPSED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitOutSJcollapsed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITBAMSORTRAM" ]]; then + if ! [[ "$VIASH_PAR_LIMITBAMSORTRAM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitBAMsortRAM' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITSJDBINSERTNSJ" ]]; then + if ! [[ "$VIASH_PAR_LIMITSJDBINSERTNSJ" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitSjdbInsertNsj' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LIMITNREADSSOFT" ]]; then + if ! [[ "$VIASH_PAR_LIMITNREADSSOFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--limitNreadsSoft' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTQSCONVERSIONADD" ]]; then + if ! [[ "$VIASH_PAR_OUTQSCONVERSIONADD" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outQSconversionAdd' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMATTRIHSTART" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMATTRIHSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMattrIHstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMAPQUNIQUE" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMAPQUNIQUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmapqUnique' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGOR" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagOR' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMFLAGAND" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMFLAGAND" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMflagAND' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMmultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTSAMTLEN" ]]; then + if ! [[ "$VIASH_PAR_OUTSAMTLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSAMtlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGTHREADN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGTHREADN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingThreadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTBAMSORTINGBINSN" ]]; then + if ! [[ "$VIASH_PAR_OUTBAMSORTINGBINSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outBAMsortingBinsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" ]]; then + if ! [[ "$VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--bamRemoveDuplicatesMate2basesN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMismatchNoverReadLmax' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERSCOREMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterScoreMinOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMIN" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outFilterMatchNmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--outFilterMatchNminOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_OUTSJFILTEROVERHANGMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTEROVERHANGMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountUniqueMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterCountTotalMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterDistToOtherSJmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [ -n "$VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--outSJfilterIntronMaxVsReadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_SCOREGAP" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAP" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGap' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPNONCAN" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPNONCAN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapNoncan' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPGCAG" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPGCAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapGCAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGAPATAC" ]]; then + if ! [[ "$VIASH_PAR_SCOREGAPATAC" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGapATAC' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" ]]; then + if ! [[ "$VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreGenomicLengthLog2scale' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREDELBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREDELBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreDelBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSOPEN" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSOPEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsOpen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCOREINSBASE" ]]; then + if ! [[ "$VIASH_PAR_SCOREINSBASE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreInsBase' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SCORESTITCHSJSHIFT" ]]; then + if ! [[ "$VIASH_PAR_SCORESTITCHSJSHIFT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scoreStitchSJshift' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchStartLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--seedSearchStartLmaxOverLread' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSEARCHLMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDSEARCHLMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSearchLmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_SEEDPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDNONELOCIPERWINDOW" ]]; then + if ! [[ "$VIASH_PAR_SEEDNONELOCIPERWINDOW" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedNoneLociPerWindow' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDSPLITMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDSPLITMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedSplitMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEEDMAPMIN" ]]; then + if ! [[ "$VIASH_PAR_SEEDMAPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seedMapMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNINTRONMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNINTRONMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignIntronMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNMATESGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNMATESGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignMatesGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSJOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJstitchMismatchNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSJDBOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSJDBoverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignSplicedMateMapLmin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" ]]; then + if ! [[ "$VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--alignSplicedMateMapLminOverLmate' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNWINDOWSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignWindowsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerWindowNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" ]]; then + if ! [[ "$VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--alignTranscriptsPerReadNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPNBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPNBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--peOverlapNbasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_PEOVERLAPMMP" ]]; then + if ! [[ "$VIASH_PAR_PEOVERLAPMMP" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--peOverlapMMp' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINBINNBITS" ]]; then + if ! [[ "$VIASH_PAR_WINBINNBITS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winBinNbits' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINANCHORDISTNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINANCHORDISTNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winAnchorDistNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINFLANKNBINS" ]]; then + if ! [[ "$VIASH_PAR_WINFLANKNBINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winFlankNbins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGERELATIVEMIN" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--winReadCoverageRelativeMin' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WINREADCOVERAGEBASESMIN" ]]; then + if ! [[ "$VIASH_PAR_WINREADCOVERAGEBASESMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--winReadCoverageBasesMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREDROPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREDROPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreDropMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCORESEPARATION" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCORESEPARATION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreSeparation' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" ]]; then + if ! [[ "$VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimScoreJunctionNonGTAG' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMJUNCTIONOVERHANGMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimJunctionOverhangMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMSEGMENTREADGAPMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimSegmentReadGapMax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMAINSEGMENTMULTNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMainSegmentMultNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPNMAX" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" ]]; then + if ! [[ "$VIASH_PAR_CHIMMULTIMAPSCORERANGE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimMultimapScoreRange' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" ]]; then + if ! [[ "$VIASH_PAR_CHIMNONCHIMSCOREDROPMIN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimNonchimScoreDropMin' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" ]]; then + if ! [[ "$VIASH_PAR_CHIMOUTJUNCTIONFORMAT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--chimOutJunctionFormat' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" ]]; then + if ! [[ "$VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--quantTranscriptomeBAMcompression' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TWOPASS1READSN" ]]; then + if ! [[ "$VIASH_PAR_TWOPASS1READSN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--twopass1readsN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBSTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBSTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOCBLEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOCBLEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloCBlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMISTART" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMISTART" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIstart' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOUMILEN" ]]; then + if ! [[ "$VIASH_PAR_SOLOUMILEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloUMIlen' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEREADLENGTH" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEREADLENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeReadLength' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOBARCODEMATE" ]]; then + if ! [[ "$VIASH_PAR_SOLOBARCODEMATE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloBarcodeMate' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" ]]; then + if ! [[ "$VIASH_PAR_SOLOADAPTERMISMATCHESNMAX" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--soloAdapterMismatchesNmax' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE")" ) + VIASH_PAR_REFERENCE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES=() + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENOMEFASTAFILES+=( "$var" ) + done + VIASH_PAR_GENOMEFASTAFILES=$(IFS=';' ; echo "${VIASH_TEST_GENOMEFASTAFILES[*]}") +fi +if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_SJDBGTFFILE")" ) + VIASH_PAR_SJDBGTFFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_SJDBGTFFILE") +fi +if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_READFILESMANIFEST")" ) + VIASH_PAR_READFILESMANIFEST=$(ViashDockerAutodetectMount "$VIASH_PAR_READFILESMANIFEST") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-star_align_v273a-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import re +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'genomeLoad': $( if [ ! -z ${VIASH_PAR_GENOMELOAD+x} ]; then echo "r'${VIASH_PAR_GENOMELOAD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'genomeFileSizes': $( if [ ! -z ${VIASH_PAR_GENOMEFILESIZES+x} ]; then echo "list(map(int, r'${VIASH_PAR_GENOMEFILESIZES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'genomeTransformOutput': $( if [ ! -z ${VIASH_PAR_GENOMETRANSFORMOUTPUT+x} ]; then echo "r'${VIASH_PAR_GENOMETRANSFORMOUTPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'genomeChrSetMitochondrial': $( if [ ! -z ${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL+x} ]; then echo "r'${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMtype': $( if [ ! -z ${VIASH_PAR_OUTSAMTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSAMTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + +# regex for matching R[12] fastq(gz) files +# examples: +# - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz +# - tinygex_S1_L001_I1_001.fastq.gz +fastqgz_regex = r"(.+)_(R\\d+)(_\\d+)?\\.fastq(\\.gz)?" + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +# look for fastq files in a directory +def search_fastqs(path: Path) -> list[Path]: + if path.is_dir(): + print( + f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", + flush=True, + ) + value_paths = [ + file for file in path.iterdir() if re.match(fastqgz_regex, file.name) + ] + return value_paths + else: + return [path] + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \`processPar()\` generator needs to be adapted +to_rename = { + "input": "readFilesIn", + "reference": "genomeDir", + "output": "outFileNamePrefix", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \`to_rename\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory( + prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True +) as temp_dir: + print(">> Check whether input files are directories", flush=True) + new_read_files_in = [] + for path in par["readFilesIn"]: + new_read_files_in.extend(search_fastqs(path)) + par["readFilesIn"] = new_read_files_in + print("", flush=True) + + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeDir", "readFilesIn"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print("Grouping R1/R2 input files into pairs", flush=True) + input_grouped = {} + for path in par["readFilesIn"]: + key = re.search(fastqgz_regex, path.name).group(2) + if key not in input_grouped: + input_grouped[key] = [] + input_grouped[key].append(str(path)) + par["readFilesIn"] = [",".join(val) for val in input_grouped.values()] + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "alignReads" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + # make sure there is a trailing / + par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/" + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_REFERENCE" ]; then + VIASH_PAR_REFERENCE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_GENOMEFASTAFILES" ]; then + unset VIASH_TEST_GENOMEFASTAFILES + IFS=';' + for var in $VIASH_PAR_GENOMEFASTAFILES; do + unset IFS + if [ -z "$VIASH_TEST_GENOMEFASTAFILES" ]; then + VIASH_TEST_GENOMEFASTAFILES="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENOMEFASTAFILES="$VIASH_TEST_GENOMEFASTAFILES" + fi + if [ ! -z "$VIASH_PAR_SJDBGTFFILE" ]; then + VIASH_PAR_SJDBGTFFILE=$(ViashDockerStripAutomount "$VIASH_PAR_SJDBGTFFILE") + fi + if [ ! -z "$VIASH_PAR_READFILESMANIFEST" ]; then + VIASH_PAR_READFILESMANIFEST=$(ViashDockerStripAutomount "$VIASH_PAR_READFILESMANIFEST") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/add_id/.config.vsh.yaml b/target/executable/metadata/add_id/.config.vsh.yaml new file mode 100644 index 00000000..5de8e38e --- /dev/null +++ b/target/executable/metadata/add_id/.config.vsh.yaml @@ -0,0 +1,273 @@ +name: "add_id" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "The input id." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output" + description: "Name of the .obs column where to store the id." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Name of output MuData file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--make_observation_keys_unique" + description: "Join the id to the .obs index (.obs_names)." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add id of .obs. Also allows to make .obs_names (the .obs index) unique\ + \ \nby prefixing the values with an unique id per .h5mu file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/add_id/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/metadata/add_id" + executable: "target/executable/metadata/add_id/add_id" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/metadata/add_id/add_id b/target/executable/metadata/add_id/add_id new file mode 100755 index 00000000..465768cf --- /dev/null +++ b/target/executable/metadata/add_id/add_id @@ -0,0 +1,1288 @@ +#!/usr/bin/env bash + +# add_id main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="add_id" +VIASH_META_FUNCTIONALITY_NAME="add_id" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component metadata add_id" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "add_id main" + echo "" + echo "Add id of .obs. Also allows to make .obs_names (the .obs index) unique" + echo "by prefixing the values with an unique id per .h5mu file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: sample_path" + echo " Path to the input .h5mu." + echo "" + echo " --input_id" + echo " type: string, required parameter" + echo " The input id." + echo "" + echo " --obs_output" + echo " type: string" + echo " default: sample_id" + echo " Name of the .obs column where to store the id." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Name of output MuData file." + echo "" + echo " --make_observation_keys_unique" + echo " type: boolean_true" + echo " Join the id to the .obs index (.obs_names)." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "add_id main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id) + [ -n "$VIASH_PAR_INPUT_ID" ] && ViashError Bad arguments for option \'--input_id\': \'$VIASH_PAR_INPUT_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_id=*) + [ -n "$VIASH_PAR_INPUT_ID" ] && ViashError Bad arguments for option \'--input_id=*\': \'$VIASH_PAR_INPUT_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_output) + [ -n "$VIASH_PAR_OBS_OUTPUT" ] && ViashError Bad arguments for option \'--obs_output\': \'$VIASH_PAR_OBS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_output=*) + [ -n "$VIASH_PAR_OBS_OUTPUT" ] && ViashError Bad arguments for option \'--obs_output=*\': \'$VIASH_PAR_OBS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --make_observation_keys_unique) + [ -n "$VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE" ] && ViashError Bad arguments for option \'--make_observation_keys_unique\': \'$VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/metadata/add_id:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_ID+x} ]; then + ViashError '--input_id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OBS_OUTPUT+x} ]; then + VIASH_PAR_OBS_OUTPUT="sample_id" +fi +if [ -z ${VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE+x} ]; then + VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE" ]]; then + if ! [[ "$VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--make_observation_keys_unique' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-add_id-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from __future__ import annotations +import sys +from mudata import read_h5mu, MuData + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_output': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'make_observation_keys_unique': $( if [ ! -z ${VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE+x} ]; then echo "r'${VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def make_observation_keys_unique(sample_id: str, sample: MuData) -> None: + """ + Make the observation keys unique across all samples. At input, + the observation keys are unique within a sample. By adding the sample name + (unique for a sample) to each observation key, the observation key is made + unique across all samples as well. + """ + logger.info( + "Making observation keys unique across all " + "samples by appending prefix '%s' to the observation names.", + sample_id, + ) + sample.obs.index = f"{sample_id}_" + sample.obs.index + make_observation_keys_unique_per_mod(sample_id, sample) + logger.info("Done making observation keys unique.") + + +def make_observation_keys_unique_per_mod(sample_id: str, sample: MuData) -> None: + """ + Updating MuData.obs_names is not allowed (it is read-only). + So the observation keys for each modality has to be updated manually. + """ + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + mod.obs_names = f"{sample_id}_" + mod.obs_names + + +def main(): + logger.info("Reading input file '%s'.", par["input"]) + input_data = read_h5mu(par["input"]) + logger.info( + "Adding column '%s' to global .obs dataframe, populated with ID '%s'", + par["obs_output"], + par["input_id"], + ) + input_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding column to global .obs") + for mod_name, mod_data in input_data.mod.items(): + logger.info( + "Adding column '%s' to .obs dataframe for modality '%s', " + "populated with ID '%s'", + par["obs_output"], + mod_name, + par["input_id"], + ) + mod_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding per-modality columns.") + if par["make_observation_keys_unique"]: + make_observation_keys_unique(par["input_id"], input_data) + logger.info( + "Writing out data to '%s' with compression '%s'.", + par["output"], + par["output_compression"], + ) + input_data.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/add_id/nextflow_labels.config b/target/executable/metadata/add_id/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/metadata/add_id/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/metadata/add_id/setup_logger.py b/target/executable/metadata/add_id/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/metadata/add_id/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/metadata/grep_annotation_column/.config.vsh.yaml b/target/executable/metadata/grep_annotation_column/.config.vsh.yaml new file mode 100644 index 00000000..e014f64b --- /dev/null +++ b/target/executable/metadata/grep_annotation_column/.config.vsh.yaml @@ -0,0 +1,329 @@ +name: "grep_annotation_column" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input dataset." + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_column" + description: "Column to query. If not specified, use .var_names or .obs_names,\ + \ depending on the value of --matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input data to use when calculating fraction of observations that\ + \ match with the query. \nOnly used when --output_fraction_column is provided.\ + \ If not specified, .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to get the annotation matrix from.\n" + info: null + example: + - "rna" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--matrix" + description: "Matrix to fetch the column from that will be searched." + info: null + example: + - "var" + required: false + choices: + - "var" + - "obs" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Arguments related to how the output will be written." + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Location of the output MuData file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_match_column" + description: "Name of the column to write the result to." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_fraction_column" + description: "For the opposite axis, name of the column to write the fraction\ + \ of \nobservations that matches to the pattern.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Query options" + description: "Options related to the query" + arguments: + - type: "string" + name: "--regex_pattern" + description: "Regex to use to match with the input column." + info: null + example: + - "^[mM][tT]-" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform a regex lookup on a column from the annotation matrices .obs\ + \ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\ + \ (global .var or .obs).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/grep_annotation_column/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/metadata/grep_annotation_column" + executable: "target/executable/metadata/grep_annotation_column/grep_annotation_column" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/metadata/grep_annotation_column/compress_h5mu.py b/target/executable/metadata/grep_annotation_column/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/metadata/grep_annotation_column/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/metadata/grep_annotation_column/grep_annotation_column b/target/executable/metadata/grep_annotation_column/grep_annotation_column new file mode 100755 index 00000000..8e5c763a --- /dev/null +++ b/target/executable/metadata/grep_annotation_column/grep_annotation_column @@ -0,0 +1,1450 @@ +#!/usr/bin/env bash + +# grep_annotation_column main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="grep_annotation_column" +VIASH_META_FUNCTIONALITY_NAME="grep_annotation_column" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component metadata grep_annotation_column" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "grep_annotation_column main" + echo "" + echo "Perform a regex lookup on a column from the annotation matrices .obs or .var." + echo "The annotation matrix can originate from either a modality, or all modalities" + echo "(global .var or .obs)." + echo "" + echo "Inputs:" + echo " Arguments related to the input dataset." + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: sample_path" + echo " Path to the input .h5mu." + echo "" + echo " --input_column" + echo " type: string" + echo " Column to query. If not specified, use .var_names or .obs_names," + echo " depending on the value of --matrix" + echo "" + echo " --input_layer" + echo " type: string" + echo " Input data to use when calculating fraction of observations that match" + echo " with the query." + echo " Only used when --output_fraction_column is provided. If not specified," + echo " .X is used." + echo "" + echo " --modality" + echo " type: string, required parameter" + echo " example: rna" + echo " Which modality to get the annotation matrix from." + echo "" + echo " --matrix" + echo " type: string" + echo " example: var" + echo " choices: [ var, obs ]" + echo " Matrix to fetch the column from that will be searched." + echo "" + echo "Outputs:" + echo " Arguments related to how the output will be written." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Location of the output MuData file." + echo "" + echo " --output_match_column" + echo " type: string, required parameter" + echo " Name of the column to write the result to." + echo "" + echo " --output_fraction_column" + echo " type: string" + echo " For the opposite axis, name of the column to write the fraction of" + echo " observations that matches to the pattern." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Query options:" + echo " Options related to the query" + echo "" + echo " --regex_pattern" + echo " type: string, required parameter" + echo " example: ^[mM][tT]-" + echo " Regex to use to match with the input column." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "grep_annotation_column main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_column) + [ -n "$VIASH_PAR_INPUT_COLUMN" ] && ViashError Bad arguments for option \'--input_column\': \'$VIASH_PAR_INPUT_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_COLUMN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_column. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_column=*) + [ -n "$VIASH_PAR_INPUT_COLUMN" ] && ViashError Bad arguments for option \'--input_column=*\': \'$VIASH_PAR_INPUT_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_COLUMN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --matrix) + [ -n "$VIASH_PAR_MATRIX" ] && ViashError Bad arguments for option \'--matrix\': \'$VIASH_PAR_MATRIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MATRIX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --matrix. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --matrix=*) + [ -n "$VIASH_PAR_MATRIX" ] && ViashError Bad arguments for option \'--matrix=*\': \'$VIASH_PAR_MATRIX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MATRIX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_match_column) + [ -n "$VIASH_PAR_OUTPUT_MATCH_COLUMN" ] && ViashError Bad arguments for option \'--output_match_column\': \'$VIASH_PAR_OUTPUT_MATCH_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MATCH_COLUMN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_match_column. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_match_column=*) + [ -n "$VIASH_PAR_OUTPUT_MATCH_COLUMN" ] && ViashError Bad arguments for option \'--output_match_column=*\': \'$VIASH_PAR_OUTPUT_MATCH_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MATCH_COLUMN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_fraction_column) + [ -n "$VIASH_PAR_OUTPUT_FRACTION_COLUMN" ] && ViashError Bad arguments for option \'--output_fraction_column\': \'$VIASH_PAR_OUTPUT_FRACTION_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FRACTION_COLUMN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_fraction_column. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_fraction_column=*) + [ -n "$VIASH_PAR_OUTPUT_FRACTION_COLUMN" ] && ViashError Bad arguments for option \'--output_fraction_column=*\': \'$VIASH_PAR_OUTPUT_FRACTION_COLUMN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FRACTION_COLUMN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --regex_pattern) + [ -n "$VIASH_PAR_REGEX_PATTERN" ] && ViashError Bad arguments for option \'--regex_pattern\': \'$VIASH_PAR_REGEX_PATTERN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGEX_PATTERN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --regex_pattern. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --regex_pattern=*) + [ -n "$VIASH_PAR_REGEX_PATTERN" ] && ViashError Bad arguments for option \'--regex_pattern=*\': \'$VIASH_PAR_REGEX_PATTERN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REGEX_PATTERN=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/metadata/grep_annotation_column:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + ViashError '--modality' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_MATCH_COLUMN+x} ]; then + ViashError '--output_match_column' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REGEX_PATTERN+x} ]; then + ViashError '--regex_pattern' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_MATRIX" ]; then + VIASH_PAR_MATRIX_CHOICES=("var;obs") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MATRIX_CHOICES[*]};" =~ ";$VIASH_PAR_MATRIX;" ]]; then + ViashError '--matrix' specified value of \'$VIASH_PAR_MATRIX\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-grep_annotation_column-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +from pathlib import Path +from operator import attrgetter +from pandas import Series +import scipy as sc +import re +import numpy as np + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_column': $( if [ ! -z ${VIASH_PAR_INPUT_COLUMN+x} ]; then echo "r'${VIASH_PAR_INPUT_COLUMN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'matrix': $( if [ ! -z ${VIASH_PAR_MATRIX+x} ]; then echo "r'${VIASH_PAR_MATRIX//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_match_column': $( if [ ! -z ${VIASH_PAR_OUTPUT_MATCH_COLUMN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MATCH_COLUMN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_fraction_column': $( if [ ! -z ${VIASH_PAR_OUTPUT_FRACTION_COLUMN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FRACTION_COLUMN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'regex_pattern': $( if [ ! -z ${VIASH_PAR_REGEX_PATTERN+x} ]; then echo "r'${VIASH_PAR_REGEX_PATTERN//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def describe_array(arr, msg): + # Note: sc.stats returns a DescribeResult NamedTuple. For NamedTuples, + # the _asdict method is public facing even though it starts with an underscore. + description = sc.stats.describe(arr)._asdict() + logger.info( + "%s:\\nshape: %s\\nmean: %s\\nnobs: %s\\n" + "variance: %s\\nmin: %s\\nmax: %s\\ncontains na: %s\\ndtype: %s\\ncontains 0: %s", + msg, + arr.shape, + description["mean"], + description["nobs"], + description["variance"], + description["minmax"][0], + description["minmax"][1], + np.isnan(arr).any(), + arr.dtype, + (arr == 0).any(), + ) + + +def main(par): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + logger.info(f"Compiling regular expression '{par['regex_pattern']}'.") + try: + compiled_regex = re.compile(par["regex_pattern"]) + except (TypeError, re.error) as e: + raise ValueError( + f"{par['regex_pattern']} is not a valid regular expression pattern." + ) from e + else: + if compiled_regex.groups: + raise NotImplementedError( + "Using match groups is not supported by this component." + ) + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + + modality_data = mu.read_h5ad(input_file, mod=mod_name) + logger.info("Reading input file done.") + logger.info("Using annotation dataframe '%s'.", par["matrix"]) + annotation_matrix = getattr(modality_data, par["matrix"]) + default_column = {"var": attrgetter("var_names"), "obs": attrgetter("obs_names")} + if par["input_column"]: + logger.info("Input column '%s' was specified.", par["input_column"]) + try: + annotation_column = annotation_matrix[par["input_column"]] + except KeyError as e: + raise ValueError( + f"Column {par['input_column']} could not be found for modality " + f"{par['modality']}. Available columns:" + f" {','.join(annotation_matrix.columns.to_list())}" + ) from e + else: + logger.info(f"No input column specified, using '.{par['matrix']}_names'") + annotation_column = default_column[par["matrix"]](modality_data).to_series() + logger.info("Applying regex search.") + grep_result = annotation_column.str.contains(par["regex_pattern"], regex=True) + logger.info("Search results: %s", grep_result.value_counts()) + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + grep_result = grep_result.to_numpy(dtype="bool", na_value=False) + + other_axis_attribute = {"var": "obs", "obs": "var"} + if par["output_fraction_column"]: + logger.info( + "Enabled writing the fraction of values that matches to the pattern." + ) + input_layer = ( + modality_data.X + if not par["input_layer"] + else modality_data.layers[par["input_layer"]] + ) + totals = np.ravel(input_layer.sum(axis=1)) + describe_array(totals, "Summary of total counts for layer") + counts_for_matches = np.ravel(input_layer[:, grep_result].sum(axis=1)) + describe_array(counts_for_matches, "Summary of counts matching grep") + with np.errstate(all="raise"): + pct_matching = np.divide( + counts_for_matches, + totals, + out=np.zeros_like(totals, dtype=np.float64), + where=(~np.isclose(totals, np.zeros_like(totals))), + ) + logger.info("Testing wether or not fractions data contains NA.") + assert ~np.isnan(pct_matching).any(), "Fractions should not contain NA." + logger.info("Fraction statistics: \\n%s", Series(pct_matching).describe()) + pct_matching = np.where(np.isclose(pct_matching, 0, atol=1e-6), 0, pct_matching) + pct_matching = np.where(np.isclose(pct_matching, 1, atol=1e-6), 1, pct_matching) + assert (np.logical_and(pct_matching >= 0, pct_matching <= 1)).all(), ( + "Fractions are not within bounds, please report this as a bug" + ) + output_matrix = other_axis_attribute[par["matrix"]] + logger.info( + "Writing fractions to matrix '%s', column '%s'", + output_matrix, + par["output_fraction_column"], + ) + getattr(modality_data, output_matrix)[par["output_fraction_column"]] = ( + pct_matching + ) + logger.info( + "Adding values that matched the pattern to '%s', column '%s'", + par["matrix"], + par["output_match_column"], + ) + getattr(modality_data, par["matrix"])[par["output_match_column"]] = grep_result + logger.info( + "Writing out data to '%s' with compression '%s'.", + output_file, + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + output_file, par["input"], mod_name, modality_data, par["output_compression"] + ) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/grep_annotation_column/nextflow_labels.config b/target/executable/metadata/grep_annotation_column/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/metadata/grep_annotation_column/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/metadata/grep_annotation_column/setup_logger.py b/target/executable/metadata/grep_annotation_column/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/metadata/grep_annotation_column/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/metadata/join_csv/.config.vsh.yaml b/target/executable/metadata/join_csv/.config.vsh.yaml new file mode 100644 index 00000000..184c63cf --- /dev/null +++ b/target/executable/metadata/join_csv/.config.vsh.yaml @@ -0,0 +1,304 @@ +name: "join_csv" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "MuData Input" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_key" + description: "Obs column name where the sample id can be found for each observation\ + \ to join on.\nUseful when adding metadata to concatenated samples.\nMutually\ + \ exclusive with `--var_key`.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_key" + description: "Var column name where the sample id can be found for each variable\ + \ to join on.\nMutually exclusive with `--obs_key`.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "MuData Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metadata Input" + arguments: + - type: "file" + name: "--input_csv" + description: ".csv file containing metadata" + info: null + example: + - "metadata.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--csv_key" + description: "column of the the csv that corresponds to the sample id." + info: null + default: + - "id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Join a csv containing metadata to the .obs or .var field of a mudata\ + \ file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/join_csv/config.vsh.yml" + runner: "executable" + engine: "docker|native" + output: "target/executable/metadata/join_csv" + executable: "target/executable/metadata/join_csv/join_csv" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/metadata/join_csv/compress_h5mu.py b/target/executable/metadata/join_csv/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/metadata/join_csv/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/metadata/join_csv/join_csv b/target/executable/metadata/join_csv/join_csv new file mode 100755 index 00000000..dc323a75 --- /dev/null +++ b/target/executable/metadata/join_csv/join_csv @@ -0,0 +1,1317 @@ +#!/usr/bin/env bash + +# join_csv main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="join_csv" +VIASH_META_FUNCTIONALITY_NAME="join_csv" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component metadata join_csv" +LABEL org.opencontainers.image.created="2025-08-22T07:23:18Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "join_csv main" + echo "" + echo "Join a csv containing metadata to the .obs or .var field of a mudata file." + echo "" + echo "MuData Input:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obs_key" + echo " type: string" + echo " Obs column name where the sample id can be found for each observation to" + echo " join on." + echo " Useful when adding metadata to concatenated samples." + echo " Mutually exclusive with \`--var_key\`.\"" + echo "" + echo " --var_key" + echo " type: string" + echo " Var column name where the sample id can be found for each variable to" + echo " join on." + echo " Mutually exclusive with \`--obs_key\`.\"" + echo "" + echo "MuData Output:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " The compression format to be used on the output h5mu object." + echo "" + echo "Metadata Input:" + echo " --input_csv" + echo " type: file, required parameter, file must exist" + echo " example: metadata.csv" + echo " .csv file containing metadata" + echo "" + echo " --csv_key" + echo " type: string" + echo " default: id" + echo " column of the the csv that corresponds to the sample id." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "join_csv main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_key) + [ -n "$VIASH_PAR_OBS_KEY" ] && ViashError Bad arguments for option \'--obs_key\': \'$VIASH_PAR_OBS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_key=*) + [ -n "$VIASH_PAR_OBS_KEY" ] && ViashError Bad arguments for option \'--obs_key=*\': \'$VIASH_PAR_OBS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_key) + [ -n "$VIASH_PAR_VAR_KEY" ] && ViashError Bad arguments for option \'--var_key\': \'$VIASH_PAR_VAR_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_key=*) + [ -n "$VIASH_PAR_VAR_KEY" ] && ViashError Bad arguments for option \'--var_key=*\': \'$VIASH_PAR_VAR_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_csv) + [ -n "$VIASH_PAR_INPUT_CSV" ] && ViashError Bad arguments for option \'--input_csv\': \'$VIASH_PAR_INPUT_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_CSV="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_csv. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_csv=*) + [ -n "$VIASH_PAR_INPUT_CSV" ] && ViashError Bad arguments for option \'--input_csv=*\': \'$VIASH_PAR_INPUT_CSV\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_CSV=$(ViashRemoveFlags "$1") + shift 1 + ;; + --csv_key) + [ -n "$VIASH_PAR_CSV_KEY" ] && ViashError Bad arguments for option \'--csv_key\': \'$VIASH_PAR_CSV_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CSV_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --csv_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --csv_key=*) + [ -n "$VIASH_PAR_CSV_KEY" ] && ViashError Bad arguments for option \'--csv_key=*\': \'$VIASH_PAR_CSV_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CSV_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/metadata/join_csv:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_CSV+x} ]; then + ViashError '--input_csv' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_CSV_KEY+x} ]; then + VIASH_PAR_CSV_KEY="id" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT_CSV" ] && [ ! -e "$VIASH_PAR_INPUT_CSV" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_CSV' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_INPUT_CSV" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_CSV")" ) + VIASH_PAR_INPUT_CSV=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_CSV") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-join_csv-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_key': $( if [ ! -z ${VIASH_PAR_OBS_KEY+x} ]; then echo "r'${VIASH_PAR_OBS_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_key': $( if [ ! -z ${VIASH_PAR_VAR_KEY+x} ]; then echo "r'${VIASH_PAR_VAR_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_csv': $( if [ ! -z ${VIASH_PAR_INPUT_CSV+x} ]; then echo "r'${VIASH_PAR_INPUT_CSV//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'csv_key': $( if [ ! -z ${VIASH_PAR_CSV_KEY+x} ]; then echo "r'${VIASH_PAR_CSV_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +if par["obs_key"] and par["var_key"]: + raise ValueError("--obs_key can not be used in conjuction with --var_key.") +if not (par["obs_key"] or par["var_key"]): + raise ValueError("Must define either --obs_key or --var_key") + +logger.info("Read metadata csv from file") +metadata = pd.read_csv(par["input_csv"], sep=",", header=0, index_col=par["csv_key"]) +metadata.fillna("", inplace=True) + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining csv to mudata") +matrix = "var" if par["var_key"] else "obs" +matrix_sample_column_name = par["var_key"] if par["var_key"] else par["obs_key"] +original_matrix = getattr(mod_data, matrix) +sample_ids = original_matrix[matrix_sample_column_name] + +try: + new_columns = metadata.loc[sample_ids.tolist()] +except KeyError as e: + raise KeyError( + f"Not all sample IDs selected from {matrix} " + "(using the column selected with --var_key or --obs_key) were found in " + "the csv file." + ) from e +new_matrix = pd.concat( + [original_matrix.reset_index(drop=True), new_columns.reset_index(drop=True)], axis=1 +).set_axis(original_matrix.index) +setattr(mod_data, matrix, new_matrix) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_INPUT_CSV" ]; then + VIASH_PAR_INPUT_CSV=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_CSV") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/join_csv/nextflow_labels.config b/target/executable/metadata/join_csv/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/metadata/join_csv/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/metadata/join_csv/setup_logger.py b/target/executable/metadata/join_csv/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/metadata/join_csv/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/metadata/join_uns_to_obs/.config.vsh.yaml b/target/executable/metadata/join_uns_to_obs/.config.vsh.yaml new file mode 100644 index 00000000..8183bf8a --- /dev/null +++ b/target/executable/metadata/join_uns_to_obs/.config.vsh.yaml @@ -0,0 +1,252 @@ +name: "join_uns_to_obs" +namespace: "metadata" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_key" + description: "Lookup key from .uns pointing to the input data.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Join a data frame of length 1 (1 row index value) in .uns containing\ + \ metadata to the .obs of a mudata file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/join_uns_to_obs/config.vsh.yml" + runner: "executable" + engine: "docker|native" + output: "target/executable/metadata/join_uns_to_obs" + executable: "target/executable/metadata/join_uns_to_obs/join_uns_to_obs" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/metadata/join_uns_to_obs/compress_h5mu.py b/target/executable/metadata/join_uns_to_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/metadata/join_uns_to_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/metadata/join_uns_to_obs/join_uns_to_obs b/target/executable/metadata/join_uns_to_obs/join_uns_to_obs new file mode 100755 index 00000000..2bf8d978 --- /dev/null +++ b/target/executable/metadata/join_uns_to_obs/join_uns_to_obs @@ -0,0 +1,1230 @@ +#!/usr/bin/env bash + +# join_uns_to_obs main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="join_uns_to_obs" +VIASH_META_FUNCTIONALITY_NAME="join_uns_to_obs" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.description="Companion container for running component metadata join_uns_to_obs" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "join_uns_to_obs main" + echo "" + echo "Join a data frame of length 1 (1 row index value) in .uns containing metadata to" + echo "the .obs of a mudata file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --uns_key" + echo " type: string, required parameter" + echo " Lookup key from .uns pointing to the input data." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " The compression format to be used on the output h5mu object." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "join_uns_to_obs main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --uns_key) + [ -n "$VIASH_PAR_UNS_KEY" ] && ViashError Bad arguments for option \'--uns_key\': \'$VIASH_PAR_UNS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_key=*) + [ -n "$VIASH_PAR_UNS_KEY" ] && ViashError Bad arguments for option \'--uns_key=*\': \'$VIASH_PAR_UNS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/metadata/join_uns_to_obs:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_UNS_KEY+x} ]; then + ViashError '--uns_key' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-join_uns_to_obs-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_key': $( if [ ! -z ${VIASH_PAR_UNS_KEY+x} ]; then echo "r'${VIASH_PAR_UNS_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining uns to obs") +# get data frame +uns_df = mod_data.uns[par["uns_key"]] + +# check for overlapping colnames +intersect_keys = uns_df.keys().intersection(mod_data.obs.keys()) +obs_drop = mod_data.obs.drop(intersect_keys, axis=1) + +# create data frame to join +uns_df_rep = uns_df.loc[uns_df.index.repeat(mod_data.n_obs)] +uns_df_rep.index = mod_data.obs_names + +# create new obs +mod_data.obs = pd.concat([obs_drop, uns_df_rep], axis=1) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/join_uns_to_obs/nextflow_labels.config b/target/executable/metadata/join_uns_to_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/metadata/join_uns_to_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/metadata/join_uns_to_obs/setup_logger.py b/target/executable/metadata/join_uns_to_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/metadata/join_uns_to_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/metadata/move_obsm_to_obs/.config.vsh.yaml b/target/executable/metadata/move_obsm_to_obs/.config.vsh.yaml new file mode 100644 index 00000000..fba998f9 --- /dev/null +++ b/target/executable/metadata/move_obsm_to_obs/.config.vsh.yaml @@ -0,0 +1,271 @@ +name: "move_obsm_to_obs" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "MuData Input" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_key" + description: "Key of a data structure to move from `.obsm` to `.obs`." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "MuData Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Move a matrix from .obsm to .obs. Newly created columns in .obs will\ + \ \nbe created from the .obsm key suffixed with an underscore and the name of the\ + \ columns\nof the specified .obsm matrix.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/move_obsm_to_obs/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/metadata/move_obsm_to_obs" + executable: "target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/metadata/move_obsm_to_obs/compress_h5mu.py b/target/executable/metadata/move_obsm_to_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/metadata/move_obsm_to_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs b/target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs new file mode 100755 index 00000000..269bd2d2 --- /dev/null +++ b/target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs @@ -0,0 +1,1267 @@ +#!/usr/bin/env bash + +# move_obsm_to_obs main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="move_obsm_to_obs" +VIASH_META_FUNCTIONALITY_NAME="move_obsm_to_obs" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component metadata move_obsm_to_obs" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "move_obsm_to_obs main" + echo "" + echo "Move a matrix from .obsm to .obs. Newly created columns in .obs will" + echo "be created from the .obsm key suffixed with an underscore and the name of the" + echo "columns" + echo "of the specified .obsm matrix." + echo "" + echo "MuData Input:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obsm_key" + echo " type: string, required parameter" + echo " Key of a data structure to move from \`.obsm\` to \`.obs\`." + echo "" + echo "MuData Output:" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "move_obsm_to_obs main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_key) + [ -n "$VIASH_PAR_OBSM_KEY" ] && ViashError Bad arguments for option \'--obsm_key\': \'$VIASH_PAR_OBSM_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_key=*) + [ -n "$VIASH_PAR_OBSM_KEY" ] && ViashError Bad arguments for option \'--obsm_key=*\': \'$VIASH_PAR_OBSM_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/metadata/move_obsm_to_obs:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBSM_KEY+x} ]; then + ViashError '--obsm_key' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-move_obsm_to_obs-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from functools import partial +from pandas.errors import MergeError +from mudata import read_h5ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_key': $( if [ ! -z ${VIASH_PAR_OBSM_KEY+x} ]; then echo "r'${VIASH_PAR_OBSM_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from file %s", par["modality"], par["input"]) +try: + mod_data = read_h5ad(par["input"], mod=par["modality"]) +except KeyError: + raise ValueError(f"Modality {par['modality']} does not exist.") + +logger.info("Moving .obm key %s", par["obsm_key"]) +try: + obsm_matrix = mod_data.obsm[par["obsm_key"]].copy() +except KeyError: + raise ValueError( + f".obsm key {par['obsm_key']} was not found in " + f".obsm slot for modality {par['modality']}." + ) + + +obsm_matrix.rename( + partial("{key}_{}".format, key=par["obsm_key"]), + axis="columns", + copy=False, + inplace=True, +) + +original_n_obs = len(mod_data.obs) +try: + logger.info(f".obs names: {mod_data.obs_names}") + logger.info(f".obsm index: {obsm_matrix.index}") + new_obs = mod_data.obs.drop(obsm_matrix.columns, axis=1, errors="ignore") + new_obs = new_obs.merge( + obsm_matrix, + how="left", + validate="one_to_one", + left_index=True, + right_index=True, + ) + mod_data.obs = new_obs +except MergeError: + raise ValueError( + f"Could not join .obsm matrix at {par['obsm_key']} to .obs because there " + "are some observation that are not overlapping between the two matrices " + "(indexes should overlap). This is either a bug or your mudata file is corrupt." + ) +del mod_data.obsm[par["obsm_key"]] + +logger.info( + "Write output to %s with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/metadata/move_obsm_to_obs/nextflow_labels.config b/target/executable/metadata/move_obsm_to_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/metadata/move_obsm_to_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/metadata/move_obsm_to_obs/setup_logger.py b/target/executable/metadata/move_obsm_to_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/metadata/move_obsm_to_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/neighbors/bbknn/.config.vsh.yaml b/target/executable/neighbors/bbknn/.config.vsh.yaml new file mode 100644 index 00000000..591b1e3e --- /dev/null +++ b/target/executable/neighbors/bbknn/.config.vsh.yaml @@ -0,0 +1,357 @@ +name: "bbknn" +namespace: "neighbors" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "The dimensionality reduction in `.obsm` to use for neighbour detection.\ + \ Defaults to X_pca." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: ".obs column name discriminating between your batches." + info: null + default: + - "batch" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output .h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "Mandatory .uns slot to store various neighbor output objects." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors_within_batch" + description: "How many top neighbours to report for each batch; total number of\ + \ neighbours in the initial k-nearest-neighbours computation will be this number\ + \ times the number of batches." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_pcs" + description: "How many dimensions (in case of PCA, principal components) to use\ + \ in the analysis." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_trim" + description: "Trim the neighbours of each cell to these many top connectivities.\ + \ May help with population independence and improve the tidiness of clustering.\ + \ The lower the value the more independent the individual populations, at the\ + \ cost of more conserved batch effect. If `None` (default), sets the parameter\ + \ value automatically to 10 times `neighbors_within_batch` times the number\ + \ of batches. Set to 0 to skip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "BBKNN network generation\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "bbknn" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/neighbors/bbknn/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/neighbors/bbknn" + executable: "target/executable/neighbors/bbknn/bbknn" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/neighbors/bbknn/bbknn b/target/executable/neighbors/bbknn/bbknn new file mode 100755 index 00000000..816e89ed --- /dev/null +++ b/target/executable/neighbors/bbknn/bbknn @@ -0,0 +1,1398 @@ +#!/usr/bin/env bash + +# bbknn main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (author) +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bbknn" +VIASH_META_FUNCTIONALITY_NAME="bbknn" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps build-essential && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "bbknn" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component neighbors bbknn" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bbknn main" + echo "" + echo "BBKNN network generation" + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obsm_input" + echo " type: string" + echo " default: X_pca" + echo " The dimensionality reduction in \`.obsm\` to use for neighbour detection." + echo " Defaults to X_pca." + echo "" + echo " --obs_batch" + echo " type: string" + echo " default: batch" + echo " .obs column name discriminating between your batches." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output .h5mu file." + echo "" + echo " --uns_output" + echo " type: string" + echo " default: neighbors" + echo " Mandatory .uns slot to store various neighbor output objects." + echo "" + echo " --obsp_distances" + echo " type: string" + echo " default: distances" + echo " In which .obsp slot to store the distance matrix between the resulting" + echo " neighbors." + echo "" + echo " --obsp_connectivities" + echo " type: string" + echo " default: connectivities" + echo " In which .obsp slot to store the connectivities matrix between the" + echo " resulting neighbors." + echo "" + echo " --n_neighbors_within_batch" + echo " type: integer" + echo " default: 3" + echo " How many top neighbours to report for each batch; total number of" + echo " neighbours in the initial k-nearest-neighbours computation will be this" + echo " number times the number of batches." + echo "" + echo " --n_pcs" + echo " type: integer" + echo " default: 50" + echo " How many dimensions (in case of PCA, principal components) to use in the" + echo " analysis." + echo "" + echo " --n_trim" + echo " type: integer" + echo " Trim the neighbours of each cell to these many top connectivities. May" + echo " help with population independence and improve the tidiness of" + echo " clustering. The lower the value the more independent the individual" + echo " populations, at the cost of more conserved batch effect. If \`None\`" + echo " (default), sets the parameter value automatically to 10 times" + echo " \`neighbors_within_batch\` times the number of batches. Set to 0 to skip." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bbknn main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_input) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_input=*) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input=*\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch=*) + [ -n "$VIASH_PAR_OBS_BATCH" ] && ViashError Bad arguments for option \'--obs_batch=*\': \'$VIASH_PAR_OBS_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output=*) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output=*\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsp_distances) + [ -n "$VIASH_PAR_OBSP_DISTANCES" ] && ViashError Bad arguments for option \'--obsp_distances\': \'$VIASH_PAR_OBSP_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_DISTANCES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsp_distances. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsp_distances=*) + [ -n "$VIASH_PAR_OBSP_DISTANCES" ] && ViashError Bad arguments for option \'--obsp_distances=*\': \'$VIASH_PAR_OBSP_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_DISTANCES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsp_connectivities) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsp_connectivities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsp_connectivities=*) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities=*\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_neighbors_within_batch) + [ -n "$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH" ] && ViashError Bad arguments for option \'--n_neighbors_within_batch\': \'$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_neighbors_within_batch. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_neighbors_within_batch=*) + [ -n "$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH" ] && ViashError Bad arguments for option \'--n_neighbors_within_batch=*\': \'$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_pcs) + [ -n "$VIASH_PAR_N_PCS" ] && ViashError Bad arguments for option \'--n_pcs\': \'$VIASH_PAR_N_PCS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PCS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_pcs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_pcs=*) + [ -n "$VIASH_PAR_N_PCS" ] && ViashError Bad arguments for option \'--n_pcs=*\': \'$VIASH_PAR_N_PCS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PCS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_trim) + [ -n "$VIASH_PAR_N_TRIM" ] && ViashError Bad arguments for option \'--n_trim\': \'$VIASH_PAR_N_TRIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TRIM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_trim. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_trim=*) + [ -n "$VIASH_PAR_N_TRIM" ] && ViashError Bad arguments for option \'--n_trim=*\': \'$VIASH_PAR_N_TRIM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TRIM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/neighbors/bbknn:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_INPUT+x} ]; then + VIASH_PAR_OBSM_INPUT="X_pca" +fi +if [ -z ${VIASH_PAR_OBS_BATCH+x} ]; then + VIASH_PAR_OBS_BATCH="batch" +fi +if [ -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then + VIASH_PAR_UNS_OUTPUT="neighbors" +fi +if [ -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then + VIASH_PAR_OBSP_DISTANCES="distances" +fi +if [ -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then + VIASH_PAR_OBSP_CONNECTIVITIES="connectivities" +fi +if [ -z ${VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH+x} ]; then + VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH="3" +fi +if [ -z ${VIASH_PAR_N_PCS+x} ]; then + VIASH_PAR_N_PCS="50" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH" ]]; then + if ! [[ "$VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_neighbors_within_batch' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_PCS" ]]; then + if ! [[ "$VIASH_PAR_N_PCS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_pcs' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_TRIM" ]]; then + if ! [[ "$VIASH_PAR_N_TRIM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_trim' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bbknn-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import bbknn +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsp_distances': $( if [ ! -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then echo "r'${VIASH_PAR_OBSP_DISTANCES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_neighbors_within_batch': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_pcs': $( if [ ! -z ${VIASH_PAR_N_PCS+x} ]; then echo "int(r'${VIASH_PAR_N_PCS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_trim': $( if [ ! -z ${VIASH_PAR_N_TRIM+x} ]; then echo "int(r'${VIASH_PAR_N_TRIM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +adata = read_h5ad(par["input"], mod=par["modality"]) + +# copy data +tmp_adata = adata.copy() +bbknn.bbknn( + tmp_adata, + use_rep=par["obsm_input"], + batch_key=par["obs_batch"], + neighbors_within_batch=par["n_neighbors_within_batch"], + n_pcs=par["n_pcs"], + trim=par["n_trim"], +) + +# store output +adata.obsp[par["obsp_connectivities"]] = tmp_adata.obsp["connectivities"] +adata.obsp[par["obsp_distances"]] = tmp_adata.obsp["distances"] +adata.uns[par["uns_output"]] = tmp_adata.uns["neighbors"] +adata.uns[par["uns_output"]]["distances_key"] = par["obsp_distances"] +adata.uns[par["uns_output"]]["connectivities_key"] = par["obsp_connectivities"] + +# write to file +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/neighbors/bbknn/compress_h5mu.py b/target/executable/neighbors/bbknn/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/neighbors/bbknn/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/neighbors/bbknn/nextflow_labels.config b/target/executable/neighbors/bbknn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/neighbors/bbknn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/neighbors/find_neighbors/.config.vsh.yaml b/target/executable/neighbors/find_neighbors/.config.vsh.yaml new file mode 100644 index 00000000..def59929 --- /dev/null +++ b/target/executable/neighbors/find_neighbors/.config.vsh.yaml @@ -0,0 +1,385 @@ +name: "find_neighbors" +namespace: "neighbors" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Which .obsm slot to use as a starting PCA embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file containing the found neighbors." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "Mandatory .uns slot to store various neighbor output objects." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--metric" + description: "The distance metric to be used in the generation of the nearest\ + \ neighborhood network." + info: null + default: + - "euclidean" + required: false + choices: + - "cityblock" + - "cosine" + - "euclidean" + - "l1" + - "l2" + - "manhattan" + - "braycurtis" + - "canberra" + - "chebyshev" + - "correlation" + - "dice" + - "hamming" + - "jaccard" + - "kulsinski" + - "mahalanobis" + - "minkowski" + - "rogerstanimoto" + - "russellrao" + - "seuclidean" + - "sokalmichener" + - "sokalsneath" + - "sqeuclidean" + - "yule" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_neighbors" + description: "The size of local neighborhood (in terms of number of neighboring\ + \ data points) used for manifold approximation. Larger values result in more\ + \ global views of the manifold, while smaller values result in more local data\ + \ being preserved. In general values should be in the range 2 to 100. If knn\ + \ is True, number of nearest neighbors to be searched. If knn is False, a Gaussian\ + \ kernel width is set to the distance of the n_neighbors neighbor." + info: null + default: + - 15 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "A random seed." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Compute a neighborhood graph of observations [McInnes18].\n\nThe neighbor\ + \ search efficiency of this heavily relies on UMAP [McInnes18], which also provides\ + \ a method for estimating connectivities of data points - the connectivity of the\ + \ manifold (method=='umap'). If method=='gauss', connectivities are computed according\ + \ to [Coifman05], in the adaption of [Haghverdi16].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/neighbors/find_neighbors/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/neighbors/find_neighbors" + executable: "target/executable/neighbors/find_neighbors/find_neighbors" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/neighbors/find_neighbors/compress_h5mu.py b/target/executable/neighbors/find_neighbors/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/neighbors/find_neighbors/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/neighbors/find_neighbors/find_neighbors b/target/executable/neighbors/find_neighbors/find_neighbors new file mode 100755 index 00000000..6f3d290f --- /dev/null +++ b/target/executable/neighbors/find_neighbors/find_neighbors @@ -0,0 +1,1412 @@ +#!/usr/bin/env bash + +# find_neighbors main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) +# * Robrecht Cannoodt (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="find_neighbors" +VIASH_META_FUNCTIONALITY_NAME="find_neighbors" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component neighbors find_neighbors" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "find_neighbors main" + echo "" + echo "Compute a neighborhood graph of observations [McInnes18]." + echo "" + echo "The neighbor search efficiency of this heavily relies on UMAP [McInnes18], which" + echo "also provides a method for estimating connectivities of data points - the" + echo "connectivity of the manifold (method=='umap'). If method=='gauss'," + echo "connectivities are computed according to [Coifman05], in the adaption of" + echo "[Haghverdi16]." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obsm_input" + echo " type: string" + echo " default: X_pca" + echo " Which .obsm slot to use as a starting PCA embedding." + echo "" + echo " -o, --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file containing the found neighbors." + echo "" + echo " --uns_output" + echo " type: string" + echo " default: neighbors" + echo " Mandatory .uns slot to store various neighbor output objects." + echo "" + echo " --obsp_distances" + echo " type: string" + echo " default: distances" + echo " In which .obsp slot to store the distance matrix between the resulting" + echo " neighbors." + echo "" + echo " --obsp_connectivities" + echo " type: string" + echo " default: connectivities" + echo " In which .obsp slot to store the connectivities matrix between the" + echo " resulting neighbors." + echo "" + echo " --metric" + echo " type: string" + echo " default: euclidean" + echo " choices: [ cityblock, cosine, euclidean, l1, l2, manhattan, braycurtis," + echo "canberra, chebyshev, correlation, dice, hamming, jaccard, kulsinski," + echo "mahalanobis, minkowski, rogerstanimoto, russellrao, seuclidean, sokalmichener," + echo "sokalsneath, sqeuclidean, yule ]" + echo " The distance metric to be used in the generation of the nearest" + echo " neighborhood network." + echo "" + echo " --num_neighbors" + echo " type: integer" + echo " default: 15" + echo " The size of local neighborhood (in terms of number of neighboring data" + echo " points) used for manifold approximation. Larger values result in more" + echo " global views of the manifold, while smaller values result in more local" + echo " data being preserved. In general values should be in the range 2 to 100." + echo " If knn is True, number of nearest neighbors to be searched. If knn is" + echo " False, a Gaussian kernel width is set to the distance of the n_neighbors" + echo " neighbor." + echo "" + echo " --seed" + echo " type: integer" + echo " default: 0" + echo " A random seed." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "find_neighbors main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_input) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_input=*) + [ -n "$VIASH_PAR_OBSM_INPUT" ] && ViashError Bad arguments for option \'--obsm_input=*\': \'$VIASH_PAR_OBSM_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --uns_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --uns_output=*) + [ -n "$VIASH_PAR_UNS_OUTPUT" ] && ViashError Bad arguments for option \'--uns_output=*\': \'$VIASH_PAR_UNS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UNS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsp_distances) + [ -n "$VIASH_PAR_OBSP_DISTANCES" ] && ViashError Bad arguments for option \'--obsp_distances\': \'$VIASH_PAR_OBSP_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_DISTANCES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsp_distances. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsp_distances=*) + [ -n "$VIASH_PAR_OBSP_DISTANCES" ] && ViashError Bad arguments for option \'--obsp_distances=*\': \'$VIASH_PAR_OBSP_DISTANCES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_DISTANCES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsp_connectivities) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsp_connectivities. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsp_connectivities=*) + [ -n "$VIASH_PAR_OBSP_CONNECTIVITIES" ] && ViashError Bad arguments for option \'--obsp_connectivities=*\': \'$VIASH_PAR_OBSP_CONNECTIVITIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSP_CONNECTIVITIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --metric) + [ -n "$VIASH_PAR_METRIC" ] && ViashError Bad arguments for option \'--metric\': \'$VIASH_PAR_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRIC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --metric. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --metric=*) + [ -n "$VIASH_PAR_METRIC" ] && ViashError Bad arguments for option \'--metric=*\': \'$VIASH_PAR_METRIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_METRIC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --num_neighbors) + [ -n "$VIASH_PAR_NUM_NEIGHBORS" ] && ViashError Bad arguments for option \'--num_neighbors\': \'$VIASH_PAR_NUM_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --num_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --num_neighbors=*) + [ -n "$VIASH_PAR_NUM_NEIGHBORS" ] && ViashError Bad arguments for option \'--num_neighbors=*\': \'$VIASH_PAR_NUM_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUM_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seed) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seed=*) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed=*\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/neighbors/find_neighbors:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_INPUT+x} ]; then + VIASH_PAR_OBSM_INPUT="X_pca" +fi +if [ -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then + VIASH_PAR_UNS_OUTPUT="neighbors" +fi +if [ -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then + VIASH_PAR_OBSP_DISTANCES="distances" +fi +if [ -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then + VIASH_PAR_OBSP_CONNECTIVITIES="connectivities" +fi +if [ -z ${VIASH_PAR_METRIC+x} ]; then + VIASH_PAR_METRIC="euclidean" +fi +if [ -z ${VIASH_PAR_NUM_NEIGHBORS+x} ]; then + VIASH_PAR_NUM_NEIGHBORS="15" +fi +if [ -z ${VIASH_PAR_SEED+x} ]; then + VIASH_PAR_SEED="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_NUM_NEIGHBORS" ]]; then + if ! [[ "$VIASH_PAR_NUM_NEIGHBORS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--num_neighbors' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEED" ]]; then + if ! [[ "$VIASH_PAR_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_METRIC" ]; then + VIASH_PAR_METRIC_CHOICES=("cityblock;cosine;euclidean;l1;l2;manhattan;braycurtis;canberra;chebyshev;correlation;dice;hamming;jaccard;kulsinski;mahalanobis;minkowski;rogerstanimoto;russellrao;seuclidean;sokalmichener;sokalsneath;sqeuclidean;yule") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_METRIC_CHOICES[*]};" =~ ";$VIASH_PAR_METRIC;" ]]; then + ViashError '--metric' specified value of \'$VIASH_PAR_METRIC\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-find_neighbors-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.float_ = np.float64 +sys.modules["numpy"] = numpy_module + +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsp_distances': $( if [ ! -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then echo "r'${VIASH_PAR_OBSP_DISTANCES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'metric': $( if [ ! -z ${VIASH_PAR_METRIC+x} ]; then echo "r'${VIASH_PAR_METRIC//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'num_neighbors': $( if [ ! -z ${VIASH_PAR_NUM_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_NUM_NEIGHBORS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing a neighborhood graph.") +neighbors = sc.Neighbors(adata) +neighbors.compute_neighbors( + n_neighbors=par["num_neighbors"], + use_rep=par["obsm_input"], + metric=par["metric"], + random_state=par["seed"], + method="umap", +) + +adata.uns[par["uns_output"]] = { + "connectivities_key": par["obsp_connectivities"], + "distances_key": par["obsp_distances"], + "params": { + "n_neighbors": neighbors.n_neighbors, + "method": "umap", + "random_state": par["seed"], + "metric": par["metric"], + "use_rep": par["obsm_input"], + }, +} + +adata.obsp[par["obsp_distances"]] = neighbors.distances +adata.obsp[par["obsp_connectivities"]] = neighbors.connectivities + + +logger.info("Writing to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/neighbors/find_neighbors/nextflow_labels.config b/target/executable/neighbors/find_neighbors/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/neighbors/find_neighbors/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/neighbors/find_neighbors/setup_logger.py b/target/executable/neighbors/find_neighbors/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/neighbors/find_neighbors/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/process_10xh5/filter_10xh5/.config.vsh.yaml b/target/executable/process_10xh5/filter_10xh5/.config.vsh.yaml new file mode 100644 index 00000000..c9f16550 --- /dev/null +++ b/target/executable/process_10xh5/filter_10xh5/.config.vsh.yaml @@ -0,0 +1,273 @@ +name: "filter_10xh5" +namespace: "process_10xh5" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "An h5 file from the 10x genomics website." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5 file." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_library_size" + description: "Minimum library size." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_gene" + description: "Minimum number of cells per gene." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--keep_feature_types" + description: "Specify which feature types will never be filtered out" + info: null + example: + - "Antibody Capture" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--verbose" + description: "Increase verbosity" + info: null + direction: "input" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter a 10x h5 dataset.\n" +usage: "filter_10xh5 \\\n --input pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \\\n\ + \ --output pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 \\\n --min_library_size\ + \ 1000 --min_cells_per_gene 300\n" +test_resources: +- type: "r_script" + path: "run_test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "eddelbuettel/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "PIP_BREAK_SYSTEM_PACKAGES=1" + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "python" + user: true + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "r" + cran: + - "testthat" + - "anndata" + - "hdf5r" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/process_10xh5/filter_10xh5/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/process_10xh5/filter_10xh5" + executable: "target/executable/process_10xh5/filter_10xh5/filter_10xh5" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/process_10xh5/filter_10xh5/filter_10xh5 b/target/executable/process_10xh5/filter_10xh5/filter_10xh5 new file mode 100755 index 00000000..440d6b8b --- /dev/null +++ b/target/executable/process_10xh5/filter_10xh5/filter_10xh5 @@ -0,0 +1,1312 @@ +#!/usr/bin/env bash + +# filter_10xh5 main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="filter_10xh5" +VIASH_META_FUNCTIONALITY_NAME="filter_10xh5" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM eddelbuettel/r2u:24.04 +ENTRYPOINT [] +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +ENV RETICULATE_PYTHON=/usr/bin/python +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev python3-pip python3-dev python-is-python3 && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --user --upgrade pip && \ + pip install --user --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("testthat", "anndata", "hdf5r"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component process_10xh5 filter_10xh5" +LABEL org.opencontainers.image.created="2025-08-22T07:23:03Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "filter_10xh5 main" + echo "" + echo "Filter a 10x h5 dataset." + echo "" + echo "Usage:" + echo "filter_10xh5 \\" + echo " --input pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \\" + echo " --output pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 \\" + echo " --min_library_size 1000 --min_cells_per_gene 300" + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + echo " An h5 file from the 10x genomics website." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5" + echo " Output h5 file." + echo "" + echo " --min_library_size" + echo " type: integer" + echo " default: 0" + echo " Minimum library size." + echo "" + echo " --min_cells_per_gene" + echo " type: integer" + echo " default: 0" + echo " Minimum number of cells per gene." + echo "" + echo " --keep_feature_types" + echo " type: string, multiple values allowed" + echo " example: Antibody Capture" + echo " Specify which feature types will never be filtered out" + echo "" + echo " --verbose" + echo " type: boolean_true" + echo " Increase verbosity" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "filter_10xh5 main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_library_size) + [ -n "$VIASH_PAR_MIN_LIBRARY_SIZE" ] && ViashError Bad arguments for option \'--min_library_size\': \'$VIASH_PAR_MIN_LIBRARY_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_LIBRARY_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_library_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_library_size=*) + [ -n "$VIASH_PAR_MIN_LIBRARY_SIZE" ] && ViashError Bad arguments for option \'--min_library_size=*\': \'$VIASH_PAR_MIN_LIBRARY_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_LIBRARY_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells_per_gene) + [ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ] && ViashError Bad arguments for option \'--min_cells_per_gene\': \'$VIASH_PAR_MIN_CELLS_PER_GENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_PER_GENE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells_per_gene. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells_per_gene=*) + [ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ] && ViashError Bad arguments for option \'--min_cells_per_gene=*\': \'$VIASH_PAR_MIN_CELLS_PER_GENE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_PER_GENE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --keep_feature_types) + if [ -z "$VIASH_PAR_KEEP_FEATURE_TYPES" ]; then + VIASH_PAR_KEEP_FEATURE_TYPES="$2" + else + VIASH_PAR_KEEP_FEATURE_TYPES="$VIASH_PAR_KEEP_FEATURE_TYPES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --keep_feature_types. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --keep_feature_types=*) + if [ -z "$VIASH_PAR_KEEP_FEATURE_TYPES" ]; then + VIASH_PAR_KEEP_FEATURE_TYPES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_KEEP_FEATURE_TYPES="$VIASH_PAR_KEEP_FEATURE_TYPES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --verbose) + [ -n "$VIASH_PAR_VERBOSE" ] && ViashError Bad arguments for option \'--verbose\': \'$VIASH_PAR_VERBOSE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VERBOSE=true + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/process_10xh5/filter_10xh5:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MIN_LIBRARY_SIZE+x} ]; then + VIASH_PAR_MIN_LIBRARY_SIZE="0" +fi +if [ -z ${VIASH_PAR_MIN_CELLS_PER_GENE+x} ]; then + VIASH_PAR_MIN_CELLS_PER_GENE="0" +fi +if [ -z ${VIASH_PAR_VERBOSE+x} ]; then + VIASH_PAR_VERBOSE="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_LIBRARY_SIZE" ]]; then + if ! [[ "$VIASH_PAR_MIN_LIBRARY_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_library_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS_PER_GENE" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS_PER_GENE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells_per_gene' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_VERBOSE" ]]; then + if ! [[ "$VIASH_PAR_VERBOSE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--verbose' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-filter_10xh5-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_library_size" = $( if [ ! -z ${VIASH_PAR_MIN_LIBRARY_SIZE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_LIBRARY_SIZE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_cells_per_gene" = $( if [ ! -z ${VIASH_PAR_MIN_CELLS_PER_GENE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_CELLS_PER_GENE" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "keep_feature_types" = $( if [ ! -z ${VIASH_PAR_KEEP_FEATURE_TYPES+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_KEEP_FEATURE_TYPES" | sed "s#['\\]#\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "verbose" = $( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_VERBOSE" | sed "s#['\\]#\\\\&#g"; echo "'))"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (par\$verbose) cat("Loading dependencies\\n") +requireNamespace("hdf5r", quietly = TRUE) + +if (par\$verbose) cat("Opening h5 file\\n") +h5 <- hdf5r::H5File\$new(par\$input, mode = "r") + +if (par\$verbose) cat("Reading data in memory\\n") +features__all_tag_keys <- h5[["matrix/features/_all_tag_keys"]][] + +features <- data.frame( + feature_type = h5[["matrix/features/feature_type"]][], + genome = h5[["matrix/features/genome"]][], + id = h5[["matrix/features/id"]][], + name = h5[["matrix/features/name"]][] +) + +mat <- Matrix::sparseMatrix( + i = h5[["matrix/indices"]][], + p = h5[["matrix/indptr"]][], + x = h5[["matrix/data"]][], + dims = h5[["matrix/shape"]][], + index1 = FALSE, + dimnames = list( + features\$id, + h5[["matrix/barcodes"]][] + ) +) + +if (par\$verbose) { + cat("Filtering out cells with library size < ", + par\$min_library_size, "\\n", + sep = "" + ) +} +library_size <- Matrix::colSums(mat) +mat2 <- mat[, library_size >= par\$min_library_size, drop = FALSE] + +if (par\$verbose) { + cat("Filtering genes with num cells < ", + par\$min_cells_per_gene, "\\n", + sep = "" + ) +} +num_cells <- Matrix::rowSums(mat2 > 0) +mat3 <- mat2[ + num_cells >= par\$min_cells_per_gene | + features\$feature_type %in% par\$keep_feature_types, , + drop = FALSE +] +features2 <- features[match(rownames(mat3), features\$id), , drop = FALSE] + +# helper fun +set_with_type <- function(path, value) { + orig_dtype <- h5[[path]]\$get_type() + orig_chunk <- h5[[path]]\$chunk_dims + if (is.na(orig_chunk)) orig_chunk <- "auto" + h5new\$create_dataset(path, value, dtype = orig_dtype, chunk_dims = orig_chunk) +} + +# create new file +if (par\$verbose) cat("Saving h5 file at '", par\$output, "'\\n", sep = "") +h5new <- hdf5r::H5File\$new(par\$output, mode = "w") +zz <- h5new\$create_group("matrix") +zz <- h5new\$create_group("matrix/features") + +set_with_type("matrix/features/feature_type", features2\$feature_type) +set_with_type("matrix/features/genome", features2\$genome) +set_with_type("matrix/features/id", features2\$id) +set_with_type("matrix/features/name", features2\$name) +set_with_type("matrix/features/_all_tag_keys", features__all_tag_keys) +set_with_type("matrix/indices", mat3@i) +set_with_type("matrix/indptr", mat3@p) +set_with_type("matrix/data", as.integer(mat3@x)) +set_with_type("matrix/shape", dim(mat3)) +set_with_type("matrix/barcodes", colnames(mat3)) + +for (attname in hdf5r::h5attr_names(h5)) { + h5new\$create_attr(attname, hdf5r::h5attr(h5, attname)) +} +h5new\$close_all() +h5\$close_all() +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/process_10xh5/filter_10xh5/nextflow_labels.config b/target/executable/process_10xh5/filter_10xh5/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/process_10xh5/filter_10xh5/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/qc/calculate_atac_qc_metrics/.config.vsh.yaml b/target/executable/qc/calculate_atac_qc_metrics/.config.vsh.yaml new file mode 100644 index 00000000..365ed3fd --- /dev/null +++ b/target/executable/qc/calculate_atac_qc_metrics/.config.vsh.yaml @@ -0,0 +1,332 @@ +name: "calculate_atac_qc_metrics" +namespace: "qc" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fragments_path" + description: "Path to the fragments file. If not provided and not present in the\ + \ input h5mu file,\nthe nucleosome signal and TSS enrichment score will not\ + \ be calculated.\n" + info: null + example: + - "fragments.tsv.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer at `.layers` to use for the calculation. If not provided,\ + \ `.X` is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_fragments_for_nucleosome_signal" + description: "Number of fragments to use per cell for nucleosome signal calculation.\n\ + Takes very long to calculate, for a test run lower value (e.g. 10e3) is recommended.\n\ + See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal\n\ + for more information\n" + info: null + default: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--nuc_signal_threshold" + description: "Threshold for nucleosome signal. Cells with nucleosome signal above\ + \ this threshold\nwill be marked as low quality (\"NS_FAIL\"), otherwise they\ + \ will be marked \"NS_PASS\".\n" + info: null + default: + - 2.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_tss" + description: "Number of the transcription start sites to calculate TSS enrichment\ + \ score.\nSee https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment\n\ + for more information\n" + info: null + default: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--tss_threshold" + description: "Threshold for TSS enrichment score. Cells with TSS enrichment score\ + \ below this threshold\nwill be marked as low quality (\"TSS_FAIL\") otherwise\ + \ they will be marked as \"TSS_PASS\".\n" + info: null + default: + - 1.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add basic ATAC quality control metrics to an .h5mu file.\n\nThe metrics\ + \ are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\ + \ slightly different names:\n\nObs metrics (name in this component -> name in scanpy):\n\ + \ - n_features_per_cell -> n_genes_by_counts\n - total_fragment_counts -> total_counts\n\ + \ \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "counts" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "muon~=0.1.5" + - "pysam~=0.22.0" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/calculate_atac_qc_metrics/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/qc/calculate_atac_qc_metrics" + executable: "target/executable/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics b/target/executable/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics new file mode 100755 index 00000000..cff07900 --- /dev/null +++ b/target/executable/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics @@ -0,0 +1,1431 @@ +#!/usr/bin/env bash + +# calculate_atac_qc_metrics main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="calculate_atac_qc_metrics" +VIASH_META_FUNCTIONALITY_NAME="calculate_atac_qc_metrics" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps pkg-config libhdf5-dev gcc && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scanpy~=1.10.4" "muon~=0.1.5" "pysam~=0.22.0" + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component qc calculate_atac_qc_metrics" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "calculate_atac_qc_metrics main" + echo "" + echo "Add basic ATAC quality control metrics to an .h5mu file." + echo "" + echo "The metrics are comparable to what scanpy.pp.calculate_qc_metrics output," + echo "although they have slightly different names:" + echo "" + echo "Obs metrics (name in this component -> name in scanpy):" + echo " - n_features_per_cell -> n_genes_by_counts" + echo " - total_fragment_counts -> total_counts" + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --fragments_path" + echo " type: file, file must exist" + echo " example: fragments.tsv.gz" + echo " Path to the fragments file. If not provided and not present in the input" + echo " h5mu file," + echo " the nucleosome signal and TSS enrichment score will not be calculated." + echo "" + echo " --modality" + echo " type: string" + echo " default: atac" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " example: raw_counts" + echo " Layer at \`.layers\` to use for the calculation. If not provided, \`.X\` is" + echo " used." + echo "" + echo " --n_fragments_for_nucleosome_signal" + echo " type: integer" + echo " default: 100000" + echo " Number of fragments to use per cell for nucleosome signal calculation." + echo " Takes very long to calculate, for a test run lower value (e.g. 10e3) is" + echo " recommended." + echo " See" + echo " " + echo "https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal" + echo " for more information" + echo "" + echo " --nuc_signal_threshold" + echo " type: double" + echo " default: 2.0" + echo " Threshold for nucleosome signal. Cells with nucleosome signal above this" + echo " threshold" + echo " will be marked as low quality (\"NS_FAIL\"), otherwise they will be marked" + echo " \"NS_PASS\"." + echo "" + echo " --n_tss" + echo " type: integer" + echo " default: 3000" + echo " Number of the transcription start sites to calculate TSS enrichment" + echo " score." + echo " See" + echo " " + echo "https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment" + echo " for more information" + echo "" + echo " --tss_threshold" + echo " type: double" + echo " default: 1.5" + echo " Threshold for TSS enrichment score. Cells with TSS enrichment score" + echo " below this threshold" + echo " will be marked as low quality (\"TSS_FAIL\") otherwise they will be marked" + echo " as \"TSS_PASS\"." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "calculate_atac_qc_metrics main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fragments_path) + [ -n "$VIASH_PAR_FRAGMENTS_PATH" ] && ViashError Bad arguments for option \'--fragments_path\': \'$VIASH_PAR_FRAGMENTS_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRAGMENTS_PATH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fragments_path. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fragments_path=*) + [ -n "$VIASH_PAR_FRAGMENTS_PATH" ] && ViashError Bad arguments for option \'--fragments_path=*\': \'$VIASH_PAR_FRAGMENTS_PATH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FRAGMENTS_PATH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_fragments_for_nucleosome_signal) + [ -n "$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL" ] && ViashError Bad arguments for option \'--n_fragments_for_nucleosome_signal\': \'$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_fragments_for_nucleosome_signal. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_fragments_for_nucleosome_signal=*) + [ -n "$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL" ] && ViashError Bad arguments for option \'--n_fragments_for_nucleosome_signal=*\': \'$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nuc_signal_threshold) + [ -n "$VIASH_PAR_NUC_SIGNAL_THRESHOLD" ] && ViashError Bad arguments for option \'--nuc_signal_threshold\': \'$VIASH_PAR_NUC_SIGNAL_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUC_SIGNAL_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nuc_signal_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nuc_signal_threshold=*) + [ -n "$VIASH_PAR_NUC_SIGNAL_THRESHOLD" ] && ViashError Bad arguments for option \'--nuc_signal_threshold=*\': \'$VIASH_PAR_NUC_SIGNAL_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NUC_SIGNAL_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_tss) + [ -n "$VIASH_PAR_N_TSS" ] && ViashError Bad arguments for option \'--n_tss\': \'$VIASH_PAR_N_TSS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TSS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_tss. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_tss=*) + [ -n "$VIASH_PAR_N_TSS" ] && ViashError Bad arguments for option \'--n_tss=*\': \'$VIASH_PAR_N_TSS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TSS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --tss_threshold) + [ -n "$VIASH_PAR_TSS_THRESHOLD" ] && ViashError Bad arguments for option \'--tss_threshold\': \'$VIASH_PAR_TSS_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TSS_THRESHOLD="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --tss_threshold. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --tss_threshold=*) + [ -n "$VIASH_PAR_TSS_THRESHOLD" ] && ViashError Bad arguments for option \'--tss_threshold=*\': \'$VIASH_PAR_TSS_THRESHOLD\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TSS_THRESHOLD=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/qc/calculate_atac_qc_metrics:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="atac" +fi +if [ -z ${VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL+x} ]; then + VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL="100000" +fi +if [ -z ${VIASH_PAR_NUC_SIGNAL_THRESHOLD+x} ]; then + VIASH_PAR_NUC_SIGNAL_THRESHOLD="2.0" +fi +if [ -z ${VIASH_PAR_N_TSS+x} ]; then + VIASH_PAR_N_TSS="3000" +fi +if [ -z ${VIASH_PAR_TSS_THRESHOLD+x} ]; then + VIASH_PAR_TSS_THRESHOLD="1.5" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_FRAGMENTS_PATH" ] && [ ! -e "$VIASH_PAR_FRAGMENTS_PATH" ]; then + ViashError "Input file '$VIASH_PAR_FRAGMENTS_PATH' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL" ]]; then + if ! [[ "$VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_fragments_for_nucleosome_signal' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_NUC_SIGNAL_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_NUC_SIGNAL_THRESHOLD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--nuc_signal_threshold' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_TSS" ]]; then + if ! [[ "$VIASH_PAR_N_TSS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_tss' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_TSS_THRESHOLD" ]]; then + if ! [[ "$VIASH_PAR_TSS_THRESHOLD" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--tss_threshold' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_FRAGMENTS_PATH" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FRAGMENTS_PATH")" ) + VIASH_PAR_FRAGMENTS_PATH=$(ViashDockerAutodetectMount "$VIASH_PAR_FRAGMENTS_PATH") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-calculate_atac_qc_metrics-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys + +import scanpy as sc +import muon as mu +from muon import atac as ac # the module containing function for scATAC data processing +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'fragments_path': $( if [ ! -z ${VIASH_PAR_FRAGMENTS_PATH+x} ]; then echo "r'${VIASH_PAR_FRAGMENTS_PATH//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_fragments_for_nucleosome_signal': $( if [ ! -z ${VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL+x} ]; then echo "int(r'${VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'nuc_signal_threshold': $( if [ ! -z ${VIASH_PAR_NUC_SIGNAL_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_NUC_SIGNAL_THRESHOLD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_tss': $( if [ ! -z ${VIASH_PAR_N_TSS+x} ]; then echo "int(r'${VIASH_PAR_N_TSS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'tss_threshold': $( if [ ! -z ${VIASH_PAR_TSS_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_TSS_THRESHOLD//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + atac = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info("Checking if QC columns are already calculated") + for col in ("n_features_per_cell", "total_fragment_counts"): + if col in atac.obs: + logger.warning(f"{col} is already in atac.obs, dropping") + atac.obs.drop(col, axis=1, inplace=True) + + logger.info("Calculating QC metrics") + sc.pp.calculate_qc_metrics( + atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"] + ) + + logger.debug("Putting QC columns to ATAC adata") + atac.obs.rename( + columns={ + "n_genes_by_counts": "n_features_per_cell", + "total_counts": "total_fragment_counts", + }, + inplace=True, + ) + + logger.debug("Adding log-transformed total fragment counts") + # log-transform total counts and add as column + atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"]) + + if par["fragments_path"] is not None: + logger.info("Trying to locate frafments") + ac.tl.locate_fragments(atac, fragments=par["fragments_path"]) + else: + logger.info("Skipping fragment location: \`fragments_path\` is not set") + + # Calculate the nucleosome signal across cells + if "files" in atac.uns and "fragments" in atac.uns["files"]: + logger.info("Trying to calculate nucleosome signal") + ac.tl.nucleosome_signal( + atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs + ) + atac.obs["nuc_signal_filter"] = [ + "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS" + for ns in atac.obs["nucleosome_signal"] + ] + else: + logger.info( + "Skipping nucleosome signal calculation: fragments information is not found" + ) + + # If interval information is available, calculate TSS enrichment + if "peak_annotation" in atac.uns.keys(): + tss = ac.tl.tss_enrichment( + atac, + features=atac.uns["peak_annotation"], + n_tss=par["n_tss"], + random_state=666, + ) + + tss.obs["tss_filter"] = [ + "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS" + for score in atac.obs["tss_score"] + ] + else: + logger.info( + "Skipping TSS enrichment calculation: genes intervals are not found" + ) + + logger.info("Writing output") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], atac, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_FRAGMENTS_PATH" ]; then + VIASH_PAR_FRAGMENTS_PATH=$(ViashDockerStripAutomount "$VIASH_PAR_FRAGMENTS_PATH") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/qc/calculate_atac_qc_metrics/compress_h5mu.py b/target/executable/qc/calculate_atac_qc_metrics/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/qc/calculate_atac_qc_metrics/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/qc/calculate_atac_qc_metrics/nextflow_labels.config b/target/executable/qc/calculate_atac_qc_metrics/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/qc/calculate_atac_qc_metrics/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/qc/calculate_atac_qc_metrics/setup_logger.py b/target/executable/qc/calculate_atac_qc_metrics/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/qc/calculate_atac_qc_metrics/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/qc/calculate_qc_metrics/.config.vsh.yaml b/target/executable/qc/calculate_qc_metrics/.config.vsh.yaml new file mode 100644 index 00000000..31d95171 --- /dev/null +++ b/target/executable/qc/calculate_qc_metrics/.config.vsh.yaml @@ -0,0 +1,389 @@ +name: "calculate_qc_metrics" +namespace: "qc" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process. \n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer from modality to use as input data. If not provided the .X\ + \ attribute is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metrics added to .obs" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes.\n" + info: null + example: + - "ercc,highly_variable,mitochondrial" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--var_qc_metrics_fill_na_value" + description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\ + \ to 'True' or 'False'.\nas False.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_num_nonzero_vars" + description: "Name of column in .obs describing, for each observation, the number\ + \ of stored values\n(including explicit zeroes). In other words, the name of\ + \ the column that counts\nfor each row the number of columns that contain data.\n" + info: null + default: + - "num_nonzero_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_total_counts_vars" + description: "Name of the column for .obs describing, for each observation (row),\n\ + the sum of the stored values in the columns.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metrics added to .var" + arguments: + - type: "string" + name: "--output_var_num_nonzero_obs" + description: "Name of column describing, for each feature, the number of stored\ + \ values\n(including explicit zeroes). In other words, the name of the column\ + \ that counts\nfor each column the number of rows that contain data.\n" + info: null + default: + - "num_nonzero_obs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_total_counts_obs" + description: "Name of the column in .var describing, for each feature (column),\n\ + the sum of the stored values in the rows.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_obs_mean" + description: "Name of the column in .obs providing the mean of the values in each\ + \ row.\n" + info: null + default: + - "obs_mean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_pct_dropout" + description: "Name of the column in .obs providing for each feature the percentage\ + \ of\nobservations the feature does not appear on (i.e. is missing). Same as\ + \ `--num_nonzero_obs`\nbut percentage based.\n" + info: null + default: + - "pct_dropout" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\ + \ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\ + \ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\ + \ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\ + \ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\ + \ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\ + \ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\ + \ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\ + \ - total_counts -> total_{expr_type}\n \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scipy" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "scanpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/calculate_qc_metrics/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/qc/calculate_qc_metrics" + executable: "target/executable/qc/calculate_qc_metrics/calculate_qc_metrics" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/qc/calculate_qc_metrics/calculate_qc_metrics b/target/executable/qc/calculate_qc_metrics/calculate_qc_metrics new file mode 100755 index 00000000..a003c911 --- /dev/null +++ b/target/executable/qc/calculate_qc_metrics/calculate_qc_metrics @@ -0,0 +1,1602 @@ +#!/usr/bin/env bash + +# calculate_qc_metrics main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="calculate_qc_metrics" +VIASH_META_FUNCTIONALITY_NAME="calculate_qc_metrics" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scipy" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component qc calculate_qc_metrics" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "calculate_qc_metrics main" + echo "" + echo "Add basic quality control metrics to an .h5mu file." + echo "" + echo "The metrics are comparable to what scanpy.pp.calculate_qc_metrics output," + echo "although they have slightly different names:" + echo "" + echo "Var metrics (name in this component -> name in scanpy):" + echo " - pct_dropout -> pct_dropout_by_{expr_type}" + echo " - num_nonzero_obs -> n_cells_by_{expr_type}" + echo " - obs_mean -> mean_{expr_type}" + echo " - total_counts -> total_{expr_type}" + echo "" + echo " Obs metrics:" + echo " - num_nonzero_vars -> n_genes_by_{expr_type}" + echo " - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}" + echo " - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}" + echo " - pct_of_counts_in_top_{top_n_vars}_vars ->" + echo "pct_{expr_type}_in_top_{n}_{var_type}" + echo " - total_counts -> total_{expr_type}" + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string" + echo " example: raw_counts" + echo " Layer from modality to use as input data. If not provided the .X" + echo " attribute is used." + echo "" + echo "Metrics added to .obs:" + echo " --var_qc_metrics" + echo " type: string, multiple values allowed" + echo " example: ercc,highly_variable,mitochondrial" + echo " Keys to select a boolean (containing only True or False) column from" + echo " .var." + echo " For each cell, calculate the proportion of total values for genes which" + echo " are labeled 'True'," + echo " compared to the total sum of the values for all genes." + echo "" + echo " --var_qc_metrics_fill_na_value" + echo " type: boolean" + echo " Fill any 'NA' values found in the columns specified with" + echo " --var_qc_metrics to 'True' or 'False'." + echo " as False." + echo "" + echo " --top_n_vars" + echo " type: integer, multiple values allowed" + echo " Number of top vars to be used to calculate cumulative proportions." + echo " If not specified, proportions are not calculated. \`--top_n_vars 20;50\`" + echo " finds" + echo " cumulative proportion to the 20th and 50th most expressed vars." + echo "" + echo " --output_obs_num_nonzero_vars" + echo " type: string" + echo " default: num_nonzero_vars" + echo " Name of column in .obs describing, for each observation, the number of" + echo " stored values" + echo " (including explicit zeroes). In other words, the name of the column that" + echo " counts" + echo " for each row the number of columns that contain data." + echo "" + echo " --output_obs_total_counts_vars" + echo " type: string" + echo " default: total_counts" + echo " Name of the column for .obs describing, for each observation (row)," + echo " the sum of the stored values in the columns." + echo "" + echo "Metrics added to .var:" + echo " --output_var_num_nonzero_obs" + echo " type: string" + echo " default: num_nonzero_obs" + echo " Name of column describing, for each feature, the number of stored values" + echo " (including explicit zeroes). In other words, the name of the column that" + echo " counts" + echo " for each column the number of rows that contain data." + echo "" + echo " --output_var_total_counts_obs" + echo " type: string" + echo " default: total_counts" + echo " Name of the column in .var describing, for each feature (column)," + echo " the sum of the stored values in the rows." + echo "" + echo " --output_var_obs_mean" + echo " type: string" + echo " default: obs_mean" + echo " Name of the column in .obs providing the mean of the values in each row." + echo "" + echo " --output_var_pct_dropout" + echo " type: string" + echo " default: pct_dropout" + echo " Name of the column in .obs providing for each feature the percentage of" + echo " observations the feature does not appear on (i.e. is missing). Same as" + echo " \`--num_nonzero_obs\`" + echo " but percentage based." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "calculate_qc_metrics main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + [ -n "$VIASH_PAR_LAYER" ] && ViashError Bad arguments for option \'--layer=*\': \'$VIASH_PAR_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_qc_metrics) + if [ -z "$VIASH_PAR_VAR_QC_METRICS" ]; then + VIASH_PAR_VAR_QC_METRICS="$2" + else + VIASH_PAR_VAR_QC_METRICS="$VIASH_PAR_VAR_QC_METRICS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_qc_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_qc_metrics=*) + if [ -z "$VIASH_PAR_VAR_QC_METRICS" ]; then + VIASH_PAR_VAR_QC_METRICS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_VAR_QC_METRICS="$VIASH_PAR_VAR_QC_METRICS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --var_qc_metrics_fill_na_value) + [ -n "$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE" ] && ViashError Bad arguments for option \'--var_qc_metrics_fill_na_value\': \'$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_qc_metrics_fill_na_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_qc_metrics_fill_na_value=*) + [ -n "$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE" ] && ViashError Bad arguments for option \'--var_qc_metrics_fill_na_value=*\': \'$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --top_n_vars) + if [ -z "$VIASH_PAR_TOP_N_VARS" ]; then + VIASH_PAR_TOP_N_VARS="$2" + else + VIASH_PAR_TOP_N_VARS="$VIASH_PAR_TOP_N_VARS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --top_n_vars. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --top_n_vars=*) + if [ -z "$VIASH_PAR_TOP_N_VARS" ]; then + VIASH_PAR_TOP_N_VARS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_TOP_N_VARS="$VIASH_PAR_TOP_N_VARS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_obs_num_nonzero_vars) + [ -n "$VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS" ] && ViashError Bad arguments for option \'--output_obs_num_nonzero_vars\': \'$VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_num_nonzero_vars. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_num_nonzero_vars=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS" ] && ViashError Bad arguments for option \'--output_obs_num_nonzero_vars=*\': \'$VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_total_counts_vars) + [ -n "$VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS" ] && ViashError Bad arguments for option \'--output_obs_total_counts_vars\': \'$VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_total_counts_vars. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_total_counts_vars=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS" ] && ViashError Bad arguments for option \'--output_obs_total_counts_vars=*\': \'$VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_num_nonzero_obs) + [ -n "$VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS" ] && ViashError Bad arguments for option \'--output_var_num_nonzero_obs\': \'$VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_num_nonzero_obs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_num_nonzero_obs=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS" ] && ViashError Bad arguments for option \'--output_var_num_nonzero_obs=*\': \'$VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_total_counts_obs) + [ -n "$VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS" ] && ViashError Bad arguments for option \'--output_var_total_counts_obs\': \'$VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_total_counts_obs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_total_counts_obs=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS" ] && ViashError Bad arguments for option \'--output_var_total_counts_obs=*\': \'$VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_obs_mean) + [ -n "$VIASH_PAR_OUTPUT_VAR_OBS_MEAN" ] && ViashError Bad arguments for option \'--output_var_obs_mean\': \'$VIASH_PAR_OUTPUT_VAR_OBS_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_OBS_MEAN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_obs_mean. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_obs_mean=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_OBS_MEAN" ] && ViashError Bad arguments for option \'--output_var_obs_mean=*\': \'$VIASH_PAR_OUTPUT_VAR_OBS_MEAN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_OBS_MEAN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_pct_dropout) + [ -n "$VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT" ] && ViashError Bad arguments for option \'--output_var_pct_dropout\': \'$VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_pct_dropout. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_pct_dropout=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT" ] && ViashError Bad arguments for option \'--output_var_pct_dropout=*\': \'$VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/qc/calculate_qc_metrics:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS+x} ]; then + VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS="num_nonzero_vars" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS+x} ]; then + VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS="total_counts" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS+x} ]; then + VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS="num_nonzero_obs" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS+x} ]; then + VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS="total_counts" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_OBS_MEAN+x} ]; then + VIASH_PAR_OUTPUT_VAR_OBS_MEAN="obs_mean" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT+x} ]; then + VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT="pct_dropout" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE" ]]; then + if ! [[ "$VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--var_qc_metrics_fill_na_value' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [ -n "$VIASH_PAR_TOP_N_VARS" ]; then + IFS=';' + set -f + for val in $VIASH_PAR_TOP_N_VARS; do + if ! [[ "${val}" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--top_n_vars' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + done + set +f + unset IFS +fi + +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-calculate_qc_metrics-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from mudata import read_h5ad +from scipy.sparse import issparse, csr_array +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_qc_metrics': $( if [ ! -z ${VIASH_PAR_VAR_QC_METRICS+x} ]; then echo "r'${VIASH_PAR_VAR_QC_METRICS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'var_qc_metrics_fill_na_value': $( if [ ! -z ${VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE+x} ]; then echo "r'${VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'top_n_vars': $( if [ ! -z ${VIASH_PAR_TOP_N_VARS+x} ]; then echo "list(map(int, r'${VIASH_PAR_TOP_N_VARS//\'/\'\"\'\"r\'}'.split(';')))"; else echo None; fi ), + 'output_obs_num_nonzero_vars': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_total_counts_vars': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_num_nonzero_obs': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_total_counts_obs': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_obs_mean': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_OBS_MEAN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_OBS_MEAN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_pct_dropout': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def count_nonzero(layer, axis): + """ + This method is the functional equivalent of the old .getnnz function from scirpy, + but that function was deprecated. So we use the nonzero function to mimic the old + behavior. + """ + axis ^= 1 + nonzero_counts = dict(zip(*np.unique(layer.nonzero()[axis], return_counts=True))) + nonzero_per_axis_item = { + row_index: nonzero_counts.get(row_index, 0) + for row_index in range(layer.shape[axis]) + } + return np.array(list(nonzero_per_axis_item.values()), dtype="int64") + + +def main(): + modality_data = read_h5ad(par["input"], mod=par["modality"]) + var = modality_data.var + layer = modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] + if not issparse(layer): + raise NotImplementedError("Expected layer to be in sparse format.") + layer = csr_array(layer) + layer.eliminate_zeros() + + var_columns_to_add = {} + + # var statistics + if par["output_var_obs_mean"]: + obs_mean = layer.mean(axis=0) + var_columns_to_add[par["output_var_obs_mean"]] = obs_mean + if par["output_var_total_counts_obs"]: + # from the np.sum documentation: + # Especially when summing a large number of lower precision floating point numbers, + # such as float32, numerical errors can become significant. In such cases it can + # be advisable to use dtype="float64" to use a higher precision for the output. + layer_with_type = layer + if np.issubdtype(layer.dtype, np.floating) and np.can_cast( + layer.dtype, np.float64, casting="safe" + ): + # 'safe' casting makes sure not to cast np.float128 or anything else to a lower precision dtype + layer_with_type = layer.astype(np.float64) + total_counts_obs = np.ravel(layer_with_type.sum(axis=0)) + var_columns_to_add[par["output_var_total_counts_obs"]] = total_counts_obs + + num_nonzero_obs = count_nonzero(layer, axis=0) + if par["output_var_num_nonzero_obs"]: + var_columns_to_add[par["output_var_num_nonzero_obs"]] = num_nonzero_obs + if par["output_var_pct_dropout"]: + var_columns_to_add[par["output_var_pct_dropout"]] = ( + 1 - num_nonzero_obs / layer.shape[0] + ) * 100 + + modality_data.var = modality_data.var.assign(**var_columns_to_add) + + # obs statistics + obs_columns_to_add = {} + total_counts_var = np.ravel(layer.sum(axis=1)) + + if par["output_obs_num_nonzero_vars"]: + num_nonzero_vars = count_nonzero(layer, axis=1) + obs_columns_to_add[par["output_obs_num_nonzero_vars"]] = num_nonzero_vars + + if par["output_obs_total_counts_vars"]: + obs_columns_to_add[par["output_obs_total_counts_vars"]] = total_counts_var + + top_metrics = {} + if par["top_n_vars"]: + par["top_n_vars"] = sorted(par["top_n_vars"]) + distributions = get_top_from_csr_matrix(layer, par["top_n_vars"]) + top_metrics = { + distribution_size: distribution * 100 + for distribution_size, distribution in zip( + par["top_n_vars"], distributions.T + ) + } + obs_columns_to_add |= { + f"pct_of_counts_in_top_{n_top}_vars": col + for n_top, col in top_metrics.items() + } + + if par["var_qc_metrics"]: + print(f"qc_metrics: {par['var_qc_metrics']}") + for qc_metric in par["var_qc_metrics"]: + if qc_metric not in var: + raise ValueError( + f"Value for --var_qc_metrics, '{qc_metric}' " + f"not found in .var for modality {par['modality']}" + ) + qc_column = var[qc_metric] + if qc_column.isna().any(): + if par["var_qc_metrics_fill_na_value"] is None: + raise ValueError( + f"The .var column '{qc_metric}', selected by '--var_qc_metrics', contains NA values. " + "It is ambiguous whether or not to include these values in the static calulation. " + "You can explicitly map the NA values to 'False' or 'True using '--var_qc_metrics_fill_na_value'" + ) + else: + qc_column = qc_column.fillna( + par["var_qc_metrics_fill_na_value"], inplace=False + ) + qc_column = qc_column.to_list() + if set(np.unique(qc_column)) - {True, False}: + raise ValueError( + f"Column {qc_metric} in .var for modality {par['modality']} " + f"must only contain boolean values" + ) + total_counts_qc_metric = np.ravel(layer[:, qc_column].sum(axis=1)) + obs_columns_to_add |= { + f"total_counts_{qc_metric}": total_counts_qc_metric, + f"pct_{qc_metric}": total_counts_qc_metric / total_counts_var * 100, + } + + modality_data.obs = modality_data.obs.assign(**obs_columns_to_add) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], + ) + + +def get_top_from_csr_matrix(array, top_n_genes): + # csr matrices stores a 3D matrix in a format such that data for individual cells + # are stored in 1 array. Another array (indptr) here stores the ranges of indices + # to select from the data-array (.e.g. data[indptr[0]:indptr[1]] for row 0) for each row. + # Another array 'indices' maps each element of data to a column + # (data and indices arrays have the same length) + top_n_genes = np.array(top_n_genes).astype(np.int64) + assert np.all(top_n_genes[:-1] <= top_n_genes[1:]), "top_n_genes must be sorted" + row_indices, data = array.indptr, array.data + number_of_rows, max_genes_to_parse = row_indices.size - 1, top_n_genes[-1] + top_data = np.zeros((number_of_rows, max_genes_to_parse), dtype=data.dtype) + # Loop over each row to create a dense matrix without the 0 counts, + # but not for the whole matrix, only store the genes up until + # the largest number of top n genes. + for row_number in range(number_of_rows): + row_start_index, row_end_index = ( + row_indices[row_number], + row_indices[row_number + 1], + ) + row_data = data[row_start_index:row_end_index] # all non-zero counts for an row + try: + # There are less genes with counts in the row than the + # maximum number of genes we would like to select + # all these genes are in the top genes, just store them + top_data[row_number, : row_end_index - row_start_index] = row_data + except ValueError: + # Store the counts for the top genes + top_data[row_number, :] = np.partition(row_data, -max_genes_to_parse)[ + -max_genes_to_parse: + ] + + # Partition works from smallest to largest, but we want largest + # so do smallest to largest first (but with reversed indices) + top_data = np.partition(top_data, max_genes_to_parse - top_n_genes) + # And then switch the order around + top_data = np.flip(top_data, axis=1) + + cumulative = top_data.cumsum(axis=1, dtype=np.float64)[:, top_n_genes - 1] + return cumulative / np.expand_dims(array.sum(axis=1), 1) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/qc/calculate_qc_metrics/compress_h5mu.py b/target/executable/qc/calculate_qc_metrics/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/qc/calculate_qc_metrics/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/qc/calculate_qc_metrics/nextflow_labels.config b/target/executable/qc/calculate_qc_metrics/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/qc/calculate_qc_metrics/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/qc/calculate_qc_metrics/setup_logger.py b/target/executable/qc/calculate_qc_metrics/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/qc/calculate_qc_metrics/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/qc/fastqc/.config.vsh.yaml b/target/executable/qc/fastqc/.config.vsh.yaml new file mode 100644 index 00000000..59102dc2 --- /dev/null +++ b/target/executable/qc/fastqc/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "fastqc" +namespace: "qc" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "The mode in which the component works. Can be either files or dir." + info: null + default: + - "files" + required: false + choices: + - "files" + - "dir" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Directory containing input fastq files." + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory to write reports to." + info: null + example: + - "qc" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--threads" + alternatives: + - "-t" + description: "Specifies the number of files which can be processed simultaneously.\ + \ Each thread will be allocated 250MB of\nmemory.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.\ + \ This component can take one or more files (by means of shell globbing) or a complete\ + \ directory.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "fastqc" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/fastqc/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/qc/fastqc" + executable: "target/executable/qc/fastqc/fastqc" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/qc/fastqc/fastqc b/target/executable/qc/fastqc/fastqc new file mode 100755 index 00000000..280a0162 --- /dev/null +++ b/target/executable/qc/fastqc/fastqc @@ -0,0 +1,1197 @@ +#!/usr/bin/env bash + +# fastqc main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="fastqc" +VIASH_META_FUNCTIONALITY_NAME="fastqc" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:22.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y fastqc && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.description="Companion container for running component qc fastqc" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "fastqc main" + echo "" + echo "Fastqc component, please see" + echo "https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. This component can" + echo "take one or more files (by means of shell globbing) or a complete directory." + echo "" + echo "Arguments:" + echo " -m, --mode" + echo " type: string" + echo " default: files" + echo " choices: [ files, dir ]" + echo " The mode in which the component works. Can be either files or dir." + echo "" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: fastq_dir" + echo " Directory containing input fastq files." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: qc" + echo " Output directory to write reports to." + echo "" + echo " -t, --threads" + echo " type: integer" + echo " Specifies the number of files which can be processed simultaneously." + echo " Each thread will be allocated 250MB of" + echo " memory." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "fastqc main" + exit + ;; + --mode) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mode=*) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'--mode=*\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -m) + [ -n "$VIASH_PAR_MODE" ] && ViashError Bad arguments for option \'-m\': \'$VIASH_PAR_MODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -m. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --threads) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --threads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --threads=*) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads=*\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + -t) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'-t\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/qc/fastqc:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODE+x} ]; then + VIASH_PAR_MODE="files" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_THREADS" ]]; then + if ! [[ "$VIASH_PAR_THREADS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--threads' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_MODE" ]; then + VIASH_PAR_MODE_CHOICES=("files;dir") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_MODE_CHOICES[*]};" =~ ";$VIASH_PAR_MODE;" ]]; then + ViashError '--mode' specified value of \'$VIASH_PAR_MODE\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-fastqc-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "${VIASH_PAR_MODE}" | sed "s#'#'\"'\"'#g;s#.*#par_mode='&'#" ; else echo "# par_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "${VIASH_PAR_THREADS}" | sed "s#'#'\"'\"'#g;s#.*#par_threads='&'#" ; else echo "# par_threads="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +mkdir -p "\$par_output" + +if [ "\$par_mode" == "dir" ]; then + par_input="\$par_input/*.fastq.gz" +fi + +eval fastqc \${par_threads:+--threads \$par_threads} -o "\$par_output" "\$par_input" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/qc/fastqc/nextflow_labels.config b/target/executable/qc/fastqc/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/qc/fastqc/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/qc/multiqc/.config.vsh.yaml b/target/executable/qc/multiqc/.config.vsh.yaml new file mode 100644 index 00000000..d242225e --- /dev/null +++ b/target/executable/qc/multiqc/.config.vsh.yaml @@ -0,0 +1,211 @@ +name: "multiqc" +namespace: "qc" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Inputs for MultiQC." + info: null + example: + - "input.txt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Create report in the specified output directory." + info: null + example: + - "report" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "MultiQC aggregates results from bioinformatics analyses across many\ + \ samples into a single report.\nIt searches a given directory for analysis logs\ + \ and compiles a HTML report. It's a general use tool, perfect for summarising the\ + \ output from numerous bioinformatics tools.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "fastqc" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "multiqc" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/multiqc/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/qc/multiqc" + executable: "target/executable/qc/multiqc/multiqc" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/qc/multiqc/multiqc b/target/executable/qc/multiqc/multiqc new file mode 100755 index 00000000..553da8f5 --- /dev/null +++ b/target/executable/qc/multiqc/multiqc @@ -0,0 +1,1165 @@ +#!/usr/bin/env bash + +# multiqc main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="multiqc" +VIASH_META_FUNCTIONALITY_NAME="multiqc" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "multiqc" + +LABEL org.opencontainers.image.description="Companion container for running component qc multiqc" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "multiqc main" + echo "" + echo "MultiQC aggregates results from bioinformatics analyses across many samples into" + echo "a single report." + echo "It searches a given directory for analysis logs and compiles a HTML report. It's" + echo "a general use tool, perfect for summarising the output from numerous" + echo "bioinformatics tools." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: input.txt" + echo " Inputs for MultiQC." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: report" + echo " Create report in the specified output directory." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "multiqc main" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + -i) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/qc/multiqc:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-multiqc-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import subprocess + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# Run MultiQC +subprocess.run(["multiqc", "-o", par["output"]] + par["input"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/qc/multiqc/nextflow_labels.config b/target/executable/qc/multiqc/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/qc/multiqc/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/query/cellxgene_census/.config.vsh.yaml b/target/executable/query/cellxgene_census/.config.vsh.yaml new file mode 100644 index 00000000..72ac1b67 --- /dev/null +++ b/target/executable/query/cellxgene_census/.config.vsh.yaml @@ -0,0 +1,586 @@ +name: "cellxgene_census" +namespace: "query" +version: "main" +authors: +- name: "Matthias Beyens" + roles: + - "maintainer" + - "author" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Kai Waldrant" + roles: + - "contributor" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Input database" + description: "Open CellxGene Census by version or URI." + arguments: + - type: "string" + name: "--input_uri" + description: "If specified, a URI containing the Census SOMA objects. If specified,\ + \ will take precedence over the `--census_version` argument." + info: null + example: + - "s3://bucket/path" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--census_version" + description: "Which release of CellxGene census to use. Possible values are \"\ + latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"\ + ). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + info: null + example: + - "stable" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--add_dataset_metadata" + description: "If true, the experiment metadata will be added to the cell metadata.\ + \ More specifically: `collection_id`, `collection_name`, `collection_doi`, `dataset_title`." + info: null + direction: "input" +- name: "Cell query" + description: "Arguments related to the query." + arguments: + - type: "string" + name: "--species" + description: "The organism to query, usually one of `Homo sapiens` or `Mus musculus`." + info: null + example: + - "homo_sapiens" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_value_filter" + description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a\ + \ filter query written in the SOMA `value_filter` syntax." + info: null + example: + - "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311',\ + \ 'CL:0002616'] and suspension_type == 'cell'" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Filter cells by grouping" + description: "Filter groups with fewer than X number of cells." + arguments: + - type: "string" + name: "--cell_filter_grouping" + description: "A subset of 'obs' columns by which to group the cells for filtering.\n\ + Only groups surpassing or equal to the `--cell_filter_minimum_count`\nthreshold\ + \ will be retained. Take care not to introduce a selection\nbias against cells\ + \ with more fine-grained ontology annotations.\n" + info: null + example: + - "dataset_id" + - "tissue" + - "assay" + - "disease" + - "cell_type" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--cell_filter_minimum_count" + description: "A minimum number of cells per group to retain. If `--cell_filter_grouping`\n\ + is defined, this parameter should also be provided and vice versa.\n" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Count filtering" + description: "Arguments related to filtering cells and genes by counts." + arguments: + - type: "integer" + name: "--cell_filter_min_genes" + description: "Remove cells with less than this number of genes." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cell_filter_min_counts" + description: "Remove cells with less than this number of counts." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gene_filter_min_cells" + description: "Remove genes expressed in less than this number of cells." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gene_filter_min_counts" + description: "Remove genes with less than this number of counts." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: + label: "CellxGene dataset" + summary: "A dataset queried from the CellxGene Census platform" + description: "The format of this file is derived from the [CELLxGENE schema\ + \ v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).\n" + slots: + mod: + - name: "rna" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + obs: + - type: "string" + name: "dataset_id" + description: "Identifier for the dataset from which the cell data is derived,\ + \ useful for tracking and referencing purposes." + required: false + - type: "string" + name: "assay" + description: "Type of assay used to generate the cell data, indicating\ + \ the methodology or technique employed." + required: true + - type: "string" + name: "assay_ontology_term_id" + description: "Experimental Factor Ontology (`EFO:`) term identifier for\ + \ the assay, providing a standardized reference to the assay type." + required: true + - type: "string" + name: "cell_type" + description: "Classification of the cell type based on its characteristics\ + \ and function within the tissue or organism." + required: true + - type: "string" + name: "cell_type_ontology_term_id" + description: "Cell Ontology (`CL:`) term identifier for the cell type,\ + \ offering a standardized reference to the specific cell classification." + required: true + - type: "string" + name: "development_stage" + description: "Stage of development of the organism or tissue from which\ + \ the cell is derived, indicating its maturity or developmental phase." + required: true + - type: "string" + name: "development_stage_ontology_term_id" + description: "Ontology term identifier for the developmental stage, providing\ + \ a standardized reference to the organism's developmental phase.\n\n\ + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`),\ + \ then the Human Developmental Stages (`HsapDv:`) ontology is used.\ + \ \nIf the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`),\ + \ then the Mouse Developmental Stages (`MmusDv:`) ontology is used.\n\ + Otherwise, the Uberon (`UBERON:`) ontology is used.\n" + required: true + - type: "string" + name: "disease" + description: "Information on any disease or pathological condition associated\ + \ with the cell or donor." + required: true + - type: "string" + name: "disease_ontology_term_id" + description: "Ontology term identifier for the disease, enabling standardized\ + \ disease classification and referencing.\n\nMust be a term from the\ + \ Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461`\ + \ from the Phenotype And Trait Ontology (`PATO:`).\n" + required: true + - type: "string" + name: "donor_id" + description: "Identifier for the donor from whom the cell sample is obtained." + required: true + - type: "boolean" + name: "is_primary_data" + description: "Indicates whether the data is primary (directly obtained\ + \ from experiments) or has been computationally derived from other primary\ + \ data." + required: true + - type: "string" + name: "organism" + description: "Organism from which the cell sample is obtained." + required: true + - type: "string" + name: "organism_ontology_term_id" + description: "Ontology term identifier for the organism, providing a standardized\ + \ reference for the organism.\n\nMust be a term from the NCBI Taxonomy\ + \ Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`.\n" + required: true + - type: "string" + name: "self_reported_ethnicity" + description: "Ethnicity of the donor as self-reported, relevant for studies\ + \ considering genetic diversity and population-specific traits." + required: true + - type: "string" + name: "self_reported_ethnicity_ontology_term_id" + description: "Ontology term identifier for the self-reported ethnicity,\ + \ providing a standardized reference for ethnic classifications.\n\n\ + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`),\ + \ then the Human Ancestry Ontology (`HANCESTRO:`) is used.\n" + required: true + - type: "string" + name: "sex" + description: "Biological sex of the donor or source organism, crucial\ + \ for studies involving sex-specific traits or conditions." + required: true + - type: "string" + name: "sex_ontology_term_id" + description: "Ontology term identifier for the biological sex, ensuring\ + \ standardized classification of sex. Only `PATO:0000383`, `PATO:0000384`\ + \ and `PATO:0001340` are allowed." + required: true + - type: "string" + name: "suspension_type" + description: "Type of suspension or medium in which the cells were stored\ + \ or processed, important for understanding cell handling and conditions." + required: true + - type: "string" + name: "tissue" + description: "Specific tissue from which the cells were derived, key for\ + \ context and specificity in cell studies." + required: true + - type: "string" + name: "tissue_ontology_term_id" + description: "Ontology term identifier for the tissue, providing a standardized\ + \ reference for the tissue type.\n\nFor organoid or tissue samples,\ + \ the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be\ + \ a child term of `UBERON:0001062` (anatomical entity).\nFor cell cultures,\ + \ the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`,\ + \ `CL:0000257` or `CL:0000548`.\n" + required: true + - type: "string" + name: "tissue_general" + description: "General category or classification of the tissue, useful\ + \ for broader grouping and comparison of cell data." + required: true + - type: "string" + name: "tissue_general_ontology_term_id" + description: "Ontology term identifier for the general tissue category,\ + \ aiding in standardizing and grouping tissue types.\n\nFor organoid\ + \ or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used.\ + \ The term ids must be a child term of `UBERON:0001062` (anatomical\ + \ entity).\nFor cell cultures, the Cell Ontology (`CL:`) is used. The\ + \ term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`.\n" + required: true + - type: "integer" + name: "soma_joinid" + description: "If the dataset was retrieved from CELLxGENE census, this\ + \ is a unique identifier for the cell." + required: true + var: + - type: "string" + name: "feature_id" + description: "Unique identifier for the feature, usually a ENSEMBL gene\ + \ id." + required: true + - type: "string" + name: "feature_name" + description: "A human-readable name for the feature, usually a gene symbol." + required: true + - type: "integer" + name: "soma_joinid" + description: "If the dataset was retrieved from CELLxGENE census, this\ + \ is a unique identifier for the feature." + required: true + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_modality" + description: "Which modality to store the output in." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer_counts" + description: "Which layer to store the raw counts in. If not provided, the .X\ + \ layer will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Query cells from a CellxGene Census or custom TileDBSoma object.\nAside\ + \ from fetching the cells' RNA counts (`.X`), cell metadata\n(`.obs`) and gene metadata\ + \ (`.var`), this component also fetches\nthe dataset metadata and joins it into\ + \ the cell metadata.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "cellxgene-census" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/query/cellxgene_census/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/query/cellxgene_census" + executable: "target/executable/query/cellxgene_census/cellxgene_census" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/query/cellxgene_census/cellxgene_census b/target/executable/query/cellxgene_census/cellxgene_census new file mode 100755 index 00000000..67f1da62 --- /dev/null +++ b/target/executable/query/cellxgene_census/cellxgene_census @@ -0,0 +1,1604 @@ +#!/usr/bin/env bash + +# cellxgene_census main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Matthias Beyens (maintainer, author) +# * Dries De Maeyer (author) +# * Robrecht Cannoodt (author) +# * Kai Waldrant (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellxgene_census" +VIASH_META_FUNCTIONALITY_NAME="cellxgene_census" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "cellxgene-census" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Matthias Beyens, Dries De Maeyer, Robrecht Cannoodt, Kai Waldrant" +LABEL org.opencontainers.image.description="Companion container for running component query cellxgene_census" +LABEL org.opencontainers.image.created="2025-08-22T07:23:09Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellxgene_census main" + echo "" + echo "Query cells from a CellxGene Census or custom TileDBSoma object." + echo "Aside from fetching the cells' RNA counts (\`.X\`), cell metadata" + echo "(\`.obs\`) and gene metadata (\`.var\`), this component also fetches" + echo "the dataset metadata and joins it into the cell metadata." + echo "" + echo "Input database:" + echo " Open CellxGene Census by version or URI." + echo "" + echo " --input_uri" + echo " type: string" + echo " example: s3://bucket/path" + echo " If specified, a URI containing the Census SOMA objects. If specified," + echo " will take precedence over the \`--census_version\` argument." + echo "" + echo " --census_version" + echo " type: string" + echo " example: stable" + echo " Which release of CellxGene census to use. Possible values are \"latest\"," + echo " \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For" + echo " more information, check the documentation on [Census data" + echo " " + echo "releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + echo "" + echo " --add_dataset_metadata" + echo " type: boolean_true" + echo " If true, the experiment metadata will be added to the cell metadata." + echo " More specifically: \`collection_id\`, \`collection_name\`, \`collection_doi\`," + echo " \`dataset_title\`." + echo "" + echo "Cell query:" + echo " Arguments related to the query." + echo "" + echo " --species" + echo " type: string, required parameter" + echo " example: homo_sapiens" + echo " The organism to query, usually one of \`Homo sapiens\` or \`Mus musculus\`." + echo "" + echo " --obs_value_filter" + echo " type: string, required parameter" + echo " example: is_primary_data == True and cell_type_ontology_term_id in" + echo "['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" + echo " Filter for selecting the \`obs\` metadata (i.e. cells). Value is a filter" + echo " query written in the SOMA \`value_filter\` syntax." + echo "" + echo "Filter cells by grouping:" + echo " Filter groups with fewer than X number of cells." + echo "" + echo " --cell_filter_grouping" + echo " type: string, multiple values allowed" + echo " example: dataset_id;tissue;assay;disease;cell_type" + echo " A subset of 'obs' columns by which to group the cells for filtering." + echo " Only groups surpassing or equal to the \`--cell_filter_minimum_count\`" + echo " threshold will be retained. Take care not to introduce a selection" + echo " bias against cells with more fine-grained ontology annotations." + echo "" + echo " --cell_filter_minimum_count" + echo " type: integer" + echo " example: 100" + echo " A minimum number of cells per group to retain. If" + echo " \`--cell_filter_grouping\`" + echo " is defined, this parameter should also be provided and vice versa." + echo "" + echo "Count filtering:" + echo " Arguments related to filtering cells and genes by counts." + echo "" + echo " --cell_filter_min_genes" + echo " type: integer" + echo " default: 50" + echo " Remove cells with less than this number of genes." + echo "" + echo " --cell_filter_min_counts" + echo " type: integer" + echo " default: 0" + echo " Remove cells with less than this number of counts." + echo "" + echo " --gene_filter_min_cells" + echo " type: integer" + echo " default: 5" + echo " Remove genes expressed in less than this number of cells." + echo "" + echo " --gene_filter_min_counts" + echo " type: integer" + echo " default: 0" + echo " Remove genes with less than this number of counts." + echo "" + echo "Outputs:" + echo " Output arguments." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_modality" + echo " type: string" + echo " default: rna" + echo " Which modality to store the output in." + echo "" + echo " --output_layer_counts" + echo " type: string" + echo " Which layer to store the raw counts in. If not provided, the .X layer" + echo " will be used." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellxgene_census main" + exit + ;; + --input_uri) + [ -n "$VIASH_PAR_INPUT_URI" ] && ViashError Bad arguments for option \'--input_uri\': \'$VIASH_PAR_INPUT_URI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_URI="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_uri. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_uri=*) + [ -n "$VIASH_PAR_INPUT_URI" ] && ViashError Bad arguments for option \'--input_uri=*\': \'$VIASH_PAR_INPUT_URI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_URI=$(ViashRemoveFlags "$1") + shift 1 + ;; + --census_version) + [ -n "$VIASH_PAR_CENSUS_VERSION" ] && ViashError Bad arguments for option \'--census_version\': \'$VIASH_PAR_CENSUS_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CENSUS_VERSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --census_version. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --census_version=*) + [ -n "$VIASH_PAR_CENSUS_VERSION" ] && ViashError Bad arguments for option \'--census_version=*\': \'$VIASH_PAR_CENSUS_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CENSUS_VERSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --add_dataset_metadata) + [ -n "$VIASH_PAR_ADD_DATASET_METADATA" ] && ViashError Bad arguments for option \'--add_dataset_metadata\': \'$VIASH_PAR_ADD_DATASET_METADATA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ADD_DATASET_METADATA=true + shift 1 + ;; + --species) + [ -n "$VIASH_PAR_SPECIES" ] && ViashError Bad arguments for option \'--species\': \'$VIASH_PAR_SPECIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPECIES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --species. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --species=*) + [ -n "$VIASH_PAR_SPECIES" ] && ViashError Bad arguments for option \'--species=*\': \'$VIASH_PAR_SPECIES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SPECIES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_value_filter) + [ -n "$VIASH_PAR_OBS_VALUE_FILTER" ] && ViashError Bad arguments for option \'--obs_value_filter\': \'$VIASH_PAR_OBS_VALUE_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_VALUE_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_value_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_value_filter=*) + [ -n "$VIASH_PAR_OBS_VALUE_FILTER" ] && ViashError Bad arguments for option \'--obs_value_filter=*\': \'$VIASH_PAR_OBS_VALUE_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_VALUE_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_filter_grouping) + if [ -z "$VIASH_PAR_CELL_FILTER_GROUPING" ]; then + VIASH_PAR_CELL_FILTER_GROUPING="$2" + else + VIASH_PAR_CELL_FILTER_GROUPING="$VIASH_PAR_CELL_FILTER_GROUPING;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_filter_grouping. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_filter_grouping=*) + if [ -z "$VIASH_PAR_CELL_FILTER_GROUPING" ]; then + VIASH_PAR_CELL_FILTER_GROUPING=$(ViashRemoveFlags "$1") + else + VIASH_PAR_CELL_FILTER_GROUPING="$VIASH_PAR_CELL_FILTER_GROUPING;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --cell_filter_minimum_count) + [ -n "$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT" ] && ViashError Bad arguments for option \'--cell_filter_minimum_count\': \'$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MINIMUM_COUNT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_filter_minimum_count. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_filter_minimum_count=*) + [ -n "$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT" ] && ViashError Bad arguments for option \'--cell_filter_minimum_count=*\': \'$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MINIMUM_COUNT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_filter_min_genes) + [ -n "$VIASH_PAR_CELL_FILTER_MIN_GENES" ] && ViashError Bad arguments for option \'--cell_filter_min_genes\': \'$VIASH_PAR_CELL_FILTER_MIN_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MIN_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_filter_min_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_filter_min_genes=*) + [ -n "$VIASH_PAR_CELL_FILTER_MIN_GENES" ] && ViashError Bad arguments for option \'--cell_filter_min_genes=*\': \'$VIASH_PAR_CELL_FILTER_MIN_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MIN_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --cell_filter_min_counts) + [ -n "$VIASH_PAR_CELL_FILTER_MIN_COUNTS" ] && ViashError Bad arguments for option \'--cell_filter_min_counts\': \'$VIASH_PAR_CELL_FILTER_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --cell_filter_min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --cell_filter_min_counts=*) + [ -n "$VIASH_PAR_CELL_FILTER_MIN_COUNTS" ] && ViashError Bad arguments for option \'--cell_filter_min_counts=*\': \'$VIASH_PAR_CELL_FILTER_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_CELL_FILTER_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_filter_min_cells) + [ -n "$VIASH_PAR_GENE_FILTER_MIN_CELLS" ] && ViashError Bad arguments for option \'--gene_filter_min_cells\': \'$VIASH_PAR_GENE_FILTER_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_FILTER_MIN_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_filter_min_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_filter_min_cells=*) + [ -n "$VIASH_PAR_GENE_FILTER_MIN_CELLS" ] && ViashError Bad arguments for option \'--gene_filter_min_cells=*\': \'$VIASH_PAR_GENE_FILTER_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_FILTER_MIN_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --gene_filter_min_counts) + [ -n "$VIASH_PAR_GENE_FILTER_MIN_COUNTS" ] && ViashError Bad arguments for option \'--gene_filter_min_counts\': \'$VIASH_PAR_GENE_FILTER_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_FILTER_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_filter_min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_filter_min_counts=*) + [ -n "$VIASH_PAR_GENE_FILTER_MIN_COUNTS" ] && ViashError Bad arguments for option \'--gene_filter_min_counts=*\': \'$VIASH_PAR_GENE_FILTER_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENE_FILTER_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_modality) + [ -n "$VIASH_PAR_OUTPUT_MODALITY" ] && ViashError Bad arguments for option \'--output_modality\': \'$VIASH_PAR_OUTPUT_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_modality=*) + [ -n "$VIASH_PAR_OUTPUT_MODALITY" ] && ViashError Bad arguments for option \'--output_modality=*\': \'$VIASH_PAR_OUTPUT_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer_counts) + [ -n "$VIASH_PAR_OUTPUT_LAYER_COUNTS" ] && ViashError Bad arguments for option \'--output_layer_counts\': \'$VIASH_PAR_OUTPUT_LAYER_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer_counts=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER_COUNTS" ] && ViashError Bad arguments for option \'--output_layer_counts=*\': \'$VIASH_PAR_OUTPUT_LAYER_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/query/cellxgene_census:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_SPECIES+x} ]; then + ViashError '--species' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OBS_VALUE_FILTER+x} ]; then + ViashError '--obs_value_filter' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_ADD_DATASET_METADATA+x} ]; then + VIASH_PAR_ADD_DATASET_METADATA="false" +fi +if [ -z ${VIASH_PAR_CELL_FILTER_MIN_GENES+x} ]; then + VIASH_PAR_CELL_FILTER_MIN_GENES="50" +fi +if [ -z ${VIASH_PAR_CELL_FILTER_MIN_COUNTS+x} ]; then + VIASH_PAR_CELL_FILTER_MIN_COUNTS="0" +fi +if [ -z ${VIASH_PAR_GENE_FILTER_MIN_CELLS+x} ]; then + VIASH_PAR_GENE_FILTER_MIN_CELLS="5" +fi +if [ -z ${VIASH_PAR_GENE_FILTER_MIN_COUNTS+x} ]; then + VIASH_PAR_GENE_FILTER_MIN_COUNTS="0" +fi +if [ -z ${VIASH_PAR_OUTPUT_MODALITY+x} ]; then + VIASH_PAR_OUTPUT_MODALITY="rna" +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_ADD_DATASET_METADATA" ]]; then + if ! [[ "$VIASH_PAR_ADD_DATASET_METADATA" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--add_dataset_metadata' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT" ]]; then + if ! [[ "$VIASH_PAR_CELL_FILTER_MINIMUM_COUNT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cell_filter_minimum_count' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CELL_FILTER_MIN_GENES" ]]; then + if ! [[ "$VIASH_PAR_CELL_FILTER_MIN_GENES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cell_filter_min_genes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_CELL_FILTER_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_CELL_FILTER_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--cell_filter_min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENE_FILTER_MIN_CELLS" ]]; then + if ! [[ "$VIASH_PAR_GENE_FILTER_MIN_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gene_filter_min_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_GENE_FILTER_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_GENE_FILTER_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--gene_filter_min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellxgene_census-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import cellxgene_census +import scanpy as sc +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_uri': $( if [ ! -z ${VIASH_PAR_INPUT_URI+x} ]; then echo "r'${VIASH_PAR_INPUT_URI//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'census_version': $( if [ ! -z ${VIASH_PAR_CENSUS_VERSION+x} ]; then echo "r'${VIASH_PAR_CENSUS_VERSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'add_dataset_metadata': $( if [ ! -z ${VIASH_PAR_ADD_DATASET_METADATA+x} ]; then echo "r'${VIASH_PAR_ADD_DATASET_METADATA//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'species': $( if [ ! -z ${VIASH_PAR_SPECIES+x} ]; then echo "r'${VIASH_PAR_SPECIES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_value_filter': $( if [ ! -z ${VIASH_PAR_OBS_VALUE_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_VALUE_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cell_filter_grouping': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_GROUPING+x} ]; then echo "r'${VIASH_PAR_CELL_FILTER_GROUPING//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'cell_filter_minimum_count': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MINIMUM_COUNT+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MINIMUM_COUNT//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cell_filter_min_genes': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MIN_GENES+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MIN_GENES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'cell_filter_min_counts': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gene_filter_min_cells': $( if [ ! -z ${VIASH_PAR_GENE_FILTER_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GENE_FILTER_MIN_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'gene_filter_min_counts': $( if [ ! -z ${VIASH_PAR_GENE_FILTER_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_GENE_FILTER_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_modality': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODALITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer_counts': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER_COUNTS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER_COUNTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +from setup_logger import setup_logger + +logger = setup_logger() + + +def connect_census(uri, census_version): + """ + Connect to CellxGene Census or user-provided TileDBSoma object + """ + ver = census_version or "stable" + logger.info( + "Connecting to CellxGene Census at %s", + f"'{uri}'" if uri else f"version '{ver}'", + ) + return cellxgene_census.open_soma(uri=uri, census_version=ver) + + +def get_anndata(census_connection, par): + logger.info( + "Getting gene expression data based on \`%s\` query.", par["obs_value_filter"] + ) + return cellxgene_census.get_anndata( + census=census_connection, + obs_value_filter=par["obs_value_filter"], + organism=par["species"], + ) + + +def add_cellcensus_metadata_obs(census_connection, adata): + logger.info("Adding additional metadata to gene expression data.") + census_datasets = ( + census_connection["census_info"]["datasets"].read().concat().to_pandas() + ) + + adata.obs.dataset_id = adata.obs.dataset_id.astype("category") + + dataset_info = ( + census_datasets[ + census_datasets.dataset_id.isin(adata.obs.dataset_id.cat.categories) + ][ + [ + "collection_id", + "collection_name", + "collection_doi", + "dataset_id", + "dataset_title", + ] + ] + .reset_index(drop=True) + .astype("category") + ) + + adata.obs = adata.obs.merge(dataset_info, on="dataset_id", how="left") + + +def filter_min_cells_per_group(adata, par): + n_cells_before, _ = adata.shape + cell_count = adata.obs.groupby(par["cell_filter_grouping"])[ + "soma_joinid" + ].transform("count") + adata = adata[cell_count >= par["cell_filter_minimum_count"]] + n_cells_after, _ = adata.shape + logger.info( + "Removed %s cells based on %s cell_filter_minimum_count of %s cell_filter_grouping." + % ( + (n_cells_before - n_cells_after), + par["cell_filter_minimum_count"], + par["cell_filter_grouping"], + ) + ) + return adata + + +def filter_by_counts(adata, par): + logger.info("Remove cells with few counts and genes with few counts.") + n_cells_before, n_genes_before = adata.shape + # remove cells with few counts and genes with few counts + scanpy_proc = { + par["cell_filter_min_counts"]: (sc.pp.filter_cells, "min_counts"), + par["cell_filter_min_genes"]: (sc.pp.filter_cells, "min_genes"), + par["gene_filter_min_counts"]: (sc.pp.filter_genes, "min_counts"), + par["gene_filter_min_cells"]: (sc.pp.filter_genes, "min_cells"), + } + for threshold, (func, arg) in scanpy_proc.items(): + if threshold: + func(adata, **{arg: threshold}) + n_cells_after, n_genes_after = adata.shape + logger.info( + "Removed %s cells and %s genes.", + (n_cells_before - n_cells_after), + (n_genes_before - n_genes_after), + ) + + +def move_x_to_layers(adata, layer_name): + logger.info(f"Move .X to .layers['{layer_name}']") + adata.layers[layer_name] = adata.X + adata.X = None + + +def print_unique(adata, column): + unique_values = adata.obs[column].unique().astype(str) + formatted = "', '".join(unique_values[:50]) + if len(unique_values) > 50: + formatted += ", ..." + logger.info(f"Unique {column}: ['{formatted}']") + + +def print_summary(adata): + logger.info(f"Resulting dataset: {adata}") + + logger.info("Summary of dataset:") + for field in adata.obs.columns: + print_unique(adata, field) + + +def write_anndata(adata, par): + logger.info("Writing MuData object to '%s'", par["output"]) + + mdata = mu.MuData({par["output_modality"]: adata}) + + mdata.write_h5mu(par["output"], compression=par["output_compression"]) + + +def main(par, meta): + # check arguments + if (par["cell_filter_grouping"] is None) != ( + par["cell_filter_minimum_count"] is None + ): + raise NotImplementedError( + "You need to specify either both or none of the following parameters: cell_filter_grouping, cell_filter_minimum_count" + ) + + with connect_census( + uri=par["input_uri"], census_version=par["census_version"] + ) as conn: + adata = get_anndata(conn, par) + + if par["add_dataset_metadata"]: + add_cellcensus_metadata_obs(conn, adata) + + print(f"AnnData: {adata}", flush=True) + + if par["cell_filter_grouping"] is not None: + adata = filter_min_cells_per_group(adata, par) + + # remove cells with few counts and genes with few counts + filter_by_counts(adata, par) + + # logger.log(f"Filtered AnnData: {adata}") + print(f"Filtered AnnData: {adata}", flush=True) + + # use feature_id as var_names + adata.var_names = adata.var["feature_id"] + + # move .X to .layers["counts"] + if par["output_layer_counts"]: + move_x_to_layers(adata, par["output_layer_counts"]) + + # print summary + print_summary(adata) + + # write output to file + write_anndata(adata, par) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/query/cellxgene_census/nextflow_labels.config b/target/executable/query/cellxgene_census/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/query/cellxgene_census/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/query/cellxgene_census/setup_logger.py b/target/executable/query/cellxgene_census/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/query/cellxgene_census/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/reference/build_bdrhap_reference/.config.vsh.yaml b/target/executable/reference/build_bdrhap_reference/.config.vsh.yaml new file mode 100644 index 00000000..4a669af7 --- /dev/null +++ b/target/executable/reference/build_bdrhap_reference/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "build_bdrhap_reference" +namespace: "reference" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody\ + \ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse." + info: + config_key: "Genome_fasta" + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gtf" + description: "File path to the transcript annotation files in GTF or GTF.GZ format.\ + \ The Sequence Analysis Pipeline requires the 'gene_name' or \n'gene_id' attribute\ + \ to be set on each gene and exon feature. Gene and exon feature lines must\ + \ have the same attribute, and exons\nmust have a corresponding gene with the\ + \ same value. For TCR/BCR assays, the TCR or BCR gene segments must have the\ + \ 'gene_type' or\n'gene_biotype' attribute set, and the value should begin with\ + \ 'TR' or 'IG', respectively.\n" + info: + config_key: "Gtf" + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--extra_sequences" + description: "File path to additional sequences in FASTA format to use when building\ + \ the STAR index. (e.g. transgenes or CRISPR guide barcodes).\nGTF lines for\ + \ these sequences will be automatically generated and combined with the main\ + \ GTF.\n" + info: + config_key: "Extra_sequences" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--reference_archive" + description: "A Compressed archive containing the Reference Genome Index and annotation\ + \ GTF files. This archive is meant to be used as an\ninput in the BD Rhapsody\ + \ Sequencing Analysis Pipeline.\n" + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--mitochondrial_contigs" + description: "Names of the Mitochondrial contigs in the provided Reference Genome.\ + \ Fragments originating from contigs other than these are\nidentified as 'nuclear\ + \ fragments' in the ATACseq analysis pipeline.\n" + info: + config_key: "Mitochondrial_contigs" + default: + - "chrM" + - "chrMT" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--filtering_off" + description: "By default the input Transcript Annotation files are filtered based\ + \ on the gene_type/gene_biotype attribute. Only features \nhaving the following\ + \ attribute values are kept:\n\n - protein_coding\n - lncRNA \n - IG_LV_gene\n\ + \ - IG_V_gene\n - IG_V_pseudogene\n - IG_D_gene\n - IG_J_gene\n - IG_J_pseudogene\n\ + \ - IG_C_gene\n - IG_C_pseudogene\n - TR_V_gene\n - TR_V_pseudogene\n -\ + \ TR_D_gene\n - TR_J_gene\n - TR_J_pseudogene\n - TR_C_gene\n\n If you have\ + \ already pre-filtered the input Annotation files and/or wish to turn-off the\ + \ filtering, please set this option to True.\n" + info: + config_key: "Filtering_off" + direction: "input" + - type: "boolean_true" + name: "--wta_only_index" + description: "Build a WTA only index, otherwise builds a WTA + ATAC index." + info: + config_key: "Wta_Only" + direction: "input" + - type: "string" + name: "--extra_star_params" + description: "Additional parameters to pass to STAR when building the genome index.\ + \ Specify exactly like how you would on the command line." + info: + config_key: "Extra_STAR_params" + example: + - "--limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "make_rhap_reference_2.2.1_nodocker.cwl" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "The Reference Files Generator creates an archive containing Genome Index\n\ + and Transcriptome annotation files needed for the BD Rhapsody Sequencing\nAnalysis\ + \ Pipeline. The app takes as input one or more FASTA and GTF files\nand produces\ + \ a compressed archive in the form of a tar.gz file. The \narchive contains:\n \ + \ \n- STAR index\n- Filtered GTF file\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference.fa.gz" +- type: "file" + path: "reference.gtf.gz" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bdgenomics/rhapsody:2.2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "seqkit" + interactive: false + - type: "python" + user: false + packages: + - "cwlref-runner" + - "cwl-runner" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_bdrhap_reference/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/build_bdrhap_reference" + executable: "target/executable/reference/build_bdrhap_reference/build_bdrhap_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/build_bdrhap_reference/build_bdrhap_reference b/target/executable/reference/build_bdrhap_reference/build_bdrhap_reference new file mode 100755 index 00000000..c43daf2c --- /dev/null +++ b/target/executable/reference/build_bdrhap_reference/build_bdrhap_reference @@ -0,0 +1,1552 @@ +#!/usr/bin/env bash + +# build_bdrhap_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (author, maintainer) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="build_bdrhap_reference" +VIASH_META_FUNCTIONALITY_NAME="build_bdrhap_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM bdgenomics/rhapsody:2.2.1 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps seqkit && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "cwlref-runner" "cwl-runner" + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component reference build_bdrhap_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "build_bdrhap_reference main" + echo "" + echo "The Reference Files Generator creates an archive containing Genome Index" + echo "and Transcriptome annotation files needed for the BD Rhapsody Sequencing" + echo "Analysis Pipeline. The app takes as input one or more FASTA and GTF files" + echo "and produces a compressed archive in the form of a tar.gz file. The" + echo "archive contains:" + echo "- STAR index" + echo "- Filtered GTF file" + echo "" + echo "Inputs:" + echo " --genome_fasta" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: genome_sequence.fa.gz" + echo " Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody" + echo " Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse." + echo "" + echo " --gtf" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: transcriptome_annotation.gtf.gz" + echo " File path to the transcript annotation files in GTF or GTF.GZ format." + echo " The Sequence Analysis Pipeline requires the 'gene_name' or" + echo " 'gene_id' attribute to be set on each gene and exon feature. Gene and" + echo " exon feature lines must have the same attribute, and exons" + echo " must have a corresponding gene with the same value. For TCR/BCR assays," + echo " the TCR or BCR gene segments must have the 'gene_type' or" + echo " 'gene_biotype' attribute set, and the value should begin with 'TR' or" + echo " 'IG', respectively." + echo "" + echo " --extra_sequences" + echo " type: file, multiple values allowed, file must exist" + echo " File path to additional sequences in FASTA format to use when building" + echo " the STAR index. (e.g. transgenes or CRISPR guide barcodes)." + echo " GTF lines for these sequences will be automatically generated and" + echo " combined with the main GTF." + echo "" + echo "Outputs:" + echo " --reference_archive" + echo " type: file, required parameter, output, file must exist" + echo " example: reference.tar.gz" + echo " A Compressed archive containing the Reference Genome Index and" + echo " annotation GTF files. This archive is meant to be used as an" + echo " input in the BD Rhapsody Sequencing Analysis Pipeline." + echo "" + echo "Arguments:" + echo " --mitochondrial_contigs" + echo " type: string, multiple values allowed" + echo " default: chrM;chrMT;M;MT" + echo " Names of the Mitochondrial contigs in the provided Reference Genome." + echo " Fragments originating from contigs other than these are" + echo " identified as 'nuclear fragments' in the ATACseq analysis pipeline." + echo "" + echo " --filtering_off" + echo " type: boolean_true" + echo " By default the input Transcript Annotation files are filtered based on" + echo " the gene_type/gene_biotype attribute. Only features" + echo " having the following attribute values are kept:" + echo " - protein_coding" + echo " - lncRNA" + echo " - IG_LV_gene" + echo " - IG_V_gene" + echo " - IG_V_pseudogene" + echo " - IG_D_gene" + echo " - IG_J_gene" + echo " - IG_J_pseudogene" + echo " - IG_C_gene" + echo " - IG_C_pseudogene" + echo " - TR_V_gene" + echo " - TR_V_pseudogene" + echo " - TR_D_gene" + echo " - TR_J_gene" + echo " - TR_J_pseudogene" + echo " - TR_C_gene" + echo " If you have already pre-filtered the input Annotation files and/or" + echo " wish to turn-off the filtering, please set this option to True." + echo "" + echo " --wta_only_index" + echo " type: boolean_true" + echo " Build a WTA only index, otherwise builds a WTA + ATAC index." + echo "" + echo " --extra_star_params" + echo " type: string" + echo " example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + echo " Additional parameters to pass to STAR when building the genome index." + echo " Specify exactly like how you would on the command line." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "build_bdrhap_reference main" + exit + ;; + --genome_fasta) + if [ -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA="$2" + else + VIASH_PAR_GENOME_FASTA="$VIASH_PAR_GENOME_FASTA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome_fasta=*) + if [ -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOME_FASTA="$VIASH_PAR_GENOME_FASTA;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gtf) + if [ -z "$VIASH_PAR_GTF" ]; then + VIASH_PAR_GTF="$2" + else + VIASH_PAR_GTF="$VIASH_PAR_GTF;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gtf=*) + if [ -z "$VIASH_PAR_GTF" ]; then + VIASH_PAR_GTF=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GTF="$VIASH_PAR_GTF;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --extra_sequences) + if [ -z "$VIASH_PAR_EXTRA_SEQUENCES" ]; then + VIASH_PAR_EXTRA_SEQUENCES="$2" + else + VIASH_PAR_EXTRA_SEQUENCES="$VIASH_PAR_EXTRA_SEQUENCES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --extra_sequences. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --extra_sequences=*) + if [ -z "$VIASH_PAR_EXTRA_SEQUENCES" ]; then + VIASH_PAR_EXTRA_SEQUENCES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_EXTRA_SEQUENCES="$VIASH_PAR_EXTRA_SEQUENCES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reference_archive) + [ -n "$VIASH_PAR_REFERENCE_ARCHIVE" ] && ViashError Bad arguments for option \'--reference_archive\': \'$VIASH_PAR_REFERENCE_ARCHIVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ARCHIVE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_archive. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_archive=*) + [ -n "$VIASH_PAR_REFERENCE_ARCHIVE" ] && ViashError Bad arguments for option \'--reference_archive=*\': \'$VIASH_PAR_REFERENCE_ARCHIVE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --mitochondrial_contigs) + if [ -z "$VIASH_PAR_MITOCHONDRIAL_CONTIGS" ]; then + VIASH_PAR_MITOCHONDRIAL_CONTIGS="$2" + else + VIASH_PAR_MITOCHONDRIAL_CONTIGS="$VIASH_PAR_MITOCHONDRIAL_CONTIGS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mitochondrial_contigs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mitochondrial_contigs=*) + if [ -z "$VIASH_PAR_MITOCHONDRIAL_CONTIGS" ]; then + VIASH_PAR_MITOCHONDRIAL_CONTIGS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MITOCHONDRIAL_CONTIGS="$VIASH_PAR_MITOCHONDRIAL_CONTIGS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --filtering_off) + [ -n "$VIASH_PAR_FILTERING_OFF" ] && ViashError Bad arguments for option \'--filtering_off\': \'$VIASH_PAR_FILTERING_OFF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FILTERING_OFF=true + shift 1 + ;; + --wta_only_index) + [ -n "$VIASH_PAR_WTA_ONLY_INDEX" ] && ViashError Bad arguments for option \'--wta_only_index\': \'$VIASH_PAR_WTA_ONLY_INDEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WTA_ONLY_INDEX=true + shift 1 + ;; + --extra_star_params) + [ -n "$VIASH_PAR_EXTRA_STAR_PARAMS" ] && ViashError Bad arguments for option \'--extra_star_params\': \'$VIASH_PAR_EXTRA_STAR_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_STAR_PARAMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --extra_star_params. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --extra_star_params=*) + [ -n "$VIASH_PAR_EXTRA_STAR_PARAMS" ] && ViashError Bad arguments for option \'--extra_star_params=*\': \'$VIASH_PAR_EXTRA_STAR_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXTRA_STAR_PARAMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/build_bdrhap_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GENOME_FASTA+x} ]; then + ViashError '--genome_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_GTF+x} ]; then + ViashError '--gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_REFERENCE_ARCHIVE+x} ]; then + ViashError '--reference_archive' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MITOCHONDRIAL_CONTIGS+x} ]; then + VIASH_PAR_MITOCHONDRIAL_CONTIGS="chrM;chrMT;M;MT" +fi +if [ -z ${VIASH_PAR_FILTERING_OFF+x} ]; then + VIASH_PAR_FILTERING_OFF="false" +fi +if [ -z ${VIASH_PAR_WTA_ONLY_INDEX+x} ]; then + VIASH_PAR_WTA_ONLY_INDEX="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENOME_FASTA; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_GTF" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GTF; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_EXTRA_SEQUENCES" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_EXTRA_SEQUENCES; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_FILTERING_OFF" ]]; then + if ! [[ "$VIASH_PAR_FILTERING_OFF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--filtering_off' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_WTA_ONLY_INDEX" ]]; then + if ! [[ "$VIASH_PAR_WTA_ONLY_INDEX" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--wta_only_index' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ] && [ ! -d "$(dirname "$VIASH_PAR_REFERENCE_ARCHIVE")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_REFERENCE_ARCHIVE")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_TEST_GENOME_FASTA=() + IFS=';' + for var in $VIASH_PAR_GENOME_FASTA; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENOME_FASTA+=( "$var" ) + done + VIASH_PAR_GENOME_FASTA=$(IFS=';' ; echo "${VIASH_TEST_GENOME_FASTA[*]}") +fi +if [ ! -z "$VIASH_PAR_GTF" ]; then + VIASH_TEST_GTF=() + IFS=';' + for var in $VIASH_PAR_GTF; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GTF+=( "$var" ) + done + VIASH_PAR_GTF=$(IFS=';' ; echo "${VIASH_TEST_GTF[*]}") +fi +if [ ! -z "$VIASH_PAR_EXTRA_SEQUENCES" ]; then + VIASH_TEST_EXTRA_SEQUENCES=() + IFS=';' + for var in $VIASH_PAR_EXTRA_SEQUENCES; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_EXTRA_SEQUENCES+=( "$var" ) + done + VIASH_PAR_EXTRA_SEQUENCES=$(IFS=';' ; echo "${VIASH_TEST_EXTRA_SEQUENCES[*]}") +fi +if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_REFERENCE_ARCHIVE")" ) + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashDockerAutodetectMount "$VIASH_PAR_REFERENCE_ARCHIVE") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_REFERENCE_ARCHIVE" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-build_bdrhap_reference-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'genome_fasta': $( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "r'${VIASH_PAR_GENOME_FASTA//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gtf': $( if [ ! -z ${VIASH_PAR_GTF+x} ]; then echo "r'${VIASH_PAR_GTF//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'extra_sequences': $( if [ ! -z ${VIASH_PAR_EXTRA_SEQUENCES+x} ]; then echo "r'${VIASH_PAR_EXTRA_SEQUENCES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reference_archive': $( if [ ! -z ${VIASH_PAR_REFERENCE_ARCHIVE+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ARCHIVE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'mitochondrial_contigs': $( if [ ! -z ${VIASH_PAR_MITOCHONDRIAL_CONTIGS+x} ]; then echo "r'${VIASH_PAR_MITOCHONDRIAL_CONTIGS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'filtering_off': $( if [ ! -z ${VIASH_PAR_FILTERING_OFF+x} ]; then echo "r'${VIASH_PAR_FILTERING_OFF//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'wta_only_index': $( if [ ! -z ${VIASH_PAR_WTA_ONLY_INDEX+x} ]; then echo "r'${VIASH_PAR_WTA_ONLY_INDEX//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'extra_star_params': $( if [ ! -z ${VIASH_PAR_EXTRA_STAR_PARAMS+x} ]; then echo "r'${VIASH_PAR_EXTRA_STAR_PARAMS//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\\n?)[ \\t]*\\|", "\\\\1", text) + + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".gz"), ( + "Output reference_archive must end with .tar.gz." + ) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [ + strip_margin("""\\ + |#!/usr/bin/env cwl-runner + | + |""") + ] + + config_key_value_pairs = [] + for argument in config["arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\\ + |{config_key}: + |""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\\ + | class: File + | location: "{par_value}" + |""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\\ + |{config_key}: {par_value} + |""") + ) + + ## Write config to file + return "".join(content_list) + + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file = os.path.join( + meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl" + ) + + return os.path.abspath(cwl_file) + + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"] + ) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file, + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + shutil.move( + os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"] + ) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + unset VIASH_TEST_GENOME_FASTA + IFS=';' + for var in $VIASH_PAR_GENOME_FASTA; do + unset IFS + if [ -z "$VIASH_TEST_GENOME_FASTA" ]; then + VIASH_TEST_GENOME_FASTA="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENOME_FASTA="$VIASH_TEST_GENOME_FASTA;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENOME_FASTA="$VIASH_TEST_GENOME_FASTA" + fi + if [ ! -z "$VIASH_PAR_GTF" ]; then + unset VIASH_TEST_GTF + IFS=';' + for var in $VIASH_PAR_GTF; do + unset IFS + if [ -z "$VIASH_TEST_GTF" ]; then + VIASH_TEST_GTF="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GTF="$VIASH_TEST_GTF;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GTF="$VIASH_TEST_GTF" + fi + if [ ! -z "$VIASH_PAR_EXTRA_SEQUENCES" ]; then + unset VIASH_TEST_EXTRA_SEQUENCES + IFS=';' + for var in $VIASH_PAR_EXTRA_SEQUENCES; do + unset IFS + if [ -z "$VIASH_TEST_EXTRA_SEQUENCES" ]; then + VIASH_TEST_EXTRA_SEQUENCES="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_EXTRA_SEQUENCES="$VIASH_TEST_EXTRA_SEQUENCES;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_EXTRA_SEQUENCES="$VIASH_TEST_EXTRA_SEQUENCES" + fi + if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + VIASH_PAR_REFERENCE_ARCHIVE=$(ViashDockerStripAutomount "$VIASH_PAR_REFERENCE_ARCHIVE") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_REFERENCE_ARCHIVE" ] && [ ! -e "$VIASH_PAR_REFERENCE_ARCHIVE" ]; then + ViashError "Output file '$VIASH_PAR_REFERENCE_ARCHIVE' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl b/target/executable/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/target/executable/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/target/executable/reference/build_bdrhap_reference/nextflow_labels.config b/target/executable/reference/build_bdrhap_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/build_bdrhap_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/reference/build_cellranger_arc_reference/.config.vsh.yaml b/target/executable/reference/build_cellranger_arc_reference/.config.vsh.yaml new file mode 100644 index 00000000..e846c0f4 --- /dev/null +++ b/target/executable/reference/build_cellranger_arc_reference/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "build_cellranger_arc_reference" +namespace: "reference" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--annotation_gtf" + description: "Reference annotation." + info: null + example: + - "annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--motifs_file" + description: "Transcription factor motifs in JASPAR format. See https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references" + info: null + example: + - "JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_nuclear_contigs" + description: "Name(s) of contig(s) that do not have any chromatin structure, for\ + \ example, mitochondria or plastids. These contigs are excluded from peak calling\ + \ since the entire contig will be \"open\" due to a lack of chromatin structure.\ + \ Leave empty if there are no such contigs." + info: null + example: + - "chrM" + default: + - "chrM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output folder" + info: null + example: + - "cellranger_reference" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--genome" + description: "Name of the genome. This will be the name of the intermediate output\ + \ folder" + info: null + example: + - "GRCh38" + default: + - "output" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--organism" + description: "Name of the organism. This is displayed in the web summary but is\ + \ otherwise not used in the analysis." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--subset_regex" + description: "Will subset the reference chromosomes using the given regex." + info: null + example: + - "(ERCC-00002|chr1)" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied\ + \ genome FASTA and gene GTF files. Creates a new folder named after the genome." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_arc:2.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "docker" + - type: "apt" + packages: + - "git" + - "wget" + interactive: false + - type: "docker" + run: + - "TARGETARCH=\"${TARGETARCH:-$(dpkg --print-architecture)}\" && \\\nTARGETOS=\"\ + ${TARGETOS:-linux}\" && \\\nPATH=\"${PATH}:/usr/local/go/bin\" && \\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz\ + \ && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\nrm\ + \ go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git\ + \ && \\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ &&\ + \ rm -rf seqkit && rm -r /usr/local/go\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_cellranger_arc_reference/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/build_cellranger_arc_reference" + executable: "target/executable/reference/build_cellranger_arc_reference/build_cellranger_arc_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/build_cellranger_arc_reference/build_cellranger_arc_reference b/target/executable/reference/build_cellranger_arc_reference/build_cellranger_arc_reference new file mode 100755 index 00000000..f2bc69bd --- /dev/null +++ b/target/executable/reference/build_cellranger_arc_reference/build_cellranger_arc_reference @@ -0,0 +1,1324 @@ +#!/usr/bin/env bash + +# build_cellranger_arc_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="build_cellranger_arc_reference" +VIASH_META_FUNCTIONALITY_NAME="build_cellranger_arc_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger_arc:2.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component reference build_cellranger_arc_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:11Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "build_cellranger_arc_reference main" + echo "" + echo "Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied" + echo "genome FASTA and gene GTF files. Creates a new folder named after the genome." + echo "" + echo "Arguments:" + echo " --genome_fasta" + echo " type: file, required parameter, file must exist" + echo " example: genome_sequence.fa.gz" + echo " Reference genome fasta." + echo "" + echo " --annotation_gtf" + echo " type: file, required parameter, file must exist" + echo " example: annotation.gtf.gz" + echo " Reference annotation." + echo "" + echo " --motifs_file" + echo " type: file, file must exist" + echo " example: JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" + echo " Transcription factor motifs in JASPAR format. See" + echo " " + echo "https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references" + echo "" + echo " --non_nuclear_contigs" + echo " type: string, multiple values allowed" + echo " default: chrM" + echo " example: chrM" + echo " Name(s) of contig(s) that do not have any chromatin structure, for" + echo " example, mitochondria or plastids. These contigs are excluded from peak" + echo " calling since the entire contig will be \"open\" due to a lack of" + echo " chromatin structure. Leave empty if there are no such contigs." + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: cellranger_reference" + echo " Output folder" + echo "" + echo " --genome" + echo " type: string, required parameter" + echo " default: output" + echo " example: GRCh38" + echo " Name of the genome. This will be the name of the intermediate output" + echo " folder" + echo "" + echo " --organism" + echo " type: string" + echo " Name of the organism. This is displayed in the web summary but is" + echo " otherwise not used in the analysis." + echo "" + echo " --subset_regex" + echo " type: string" + echo " example: (ERCC-00002|chr1)" + echo " Will subset the reference chromosomes using the given regex." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "build_cellranger_arc_reference main" + exit + ;; + --genome_fasta) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome_fasta=*) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta=*\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --annotation_gtf) + [ -n "$VIASH_PAR_ANNOTATION_GTF" ] && ViashError Bad arguments for option \'--annotation_gtf\': \'$VIASH_PAR_ANNOTATION_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ANNOTATION_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --annotation_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --annotation_gtf=*) + [ -n "$VIASH_PAR_ANNOTATION_GTF" ] && ViashError Bad arguments for option \'--annotation_gtf=*\': \'$VIASH_PAR_ANNOTATION_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ANNOTATION_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --motifs_file) + [ -n "$VIASH_PAR_MOTIFS_FILE" ] && ViashError Bad arguments for option \'--motifs_file\': \'$VIASH_PAR_MOTIFS_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOTIFS_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --motifs_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --motifs_file=*) + [ -n "$VIASH_PAR_MOTIFS_FILE" ] && ViashError Bad arguments for option \'--motifs_file=*\': \'$VIASH_PAR_MOTIFS_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MOTIFS_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --non_nuclear_contigs) + if [ -z "$VIASH_PAR_NON_NUCLEAR_CONTIGS" ]; then + VIASH_PAR_NON_NUCLEAR_CONTIGS="$2" + else + VIASH_PAR_NON_NUCLEAR_CONTIGS="$VIASH_PAR_NON_NUCLEAR_CONTIGS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --non_nuclear_contigs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --non_nuclear_contigs=*) + if [ -z "$VIASH_PAR_NON_NUCLEAR_CONTIGS" ]; then + VIASH_PAR_NON_NUCLEAR_CONTIGS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_NON_NUCLEAR_CONTIGS="$VIASH_PAR_NON_NUCLEAR_CONTIGS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genome) + [ -n "$VIASH_PAR_GENOME" ] && ViashError Bad arguments for option \'--genome\': \'$VIASH_PAR_GENOME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome=*) + [ -n "$VIASH_PAR_GENOME" ] && ViashError Bad arguments for option \'--genome=*\': \'$VIASH_PAR_GENOME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --organism) + [ -n "$VIASH_PAR_ORGANISM" ] && ViashError Bad arguments for option \'--organism\': \'$VIASH_PAR_ORGANISM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORGANISM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --organism. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --organism=*) + [ -n "$VIASH_PAR_ORGANISM" ] && ViashError Bad arguments for option \'--organism=*\': \'$VIASH_PAR_ORGANISM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ORGANISM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --subset_regex) + [ -n "$VIASH_PAR_SUBSET_REGEX" ] && ViashError Bad arguments for option \'--subset_regex\': \'$VIASH_PAR_SUBSET_REGEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSET_REGEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --subset_regex. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --subset_regex=*) + [ -n "$VIASH_PAR_SUBSET_REGEX" ] && ViashError Bad arguments for option \'--subset_regex=*\': \'$VIASH_PAR_SUBSET_REGEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSET_REGEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/build_cellranger_arc_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GENOME_FASTA+x} ]; then + ViashError '--genome_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ANNOTATION_GTF+x} ]; then + ViashError '--annotation_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_GENOME+x} ]; then + ViashError '--genome' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_NON_NUCLEAR_CONTIGS+x} ]; then + VIASH_PAR_NON_NUCLEAR_CONTIGS="chrM" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ] && [ ! -e "$VIASH_PAR_GENOME_FASTA" ]; then + ViashError "Input file '$VIASH_PAR_GENOME_FASTA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ANNOTATION_GTF" ] && [ ! -e "$VIASH_PAR_ANNOTATION_GTF" ]; then + ViashError "Input file '$VIASH_PAR_ANNOTATION_GTF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MOTIFS_FILE" ] && [ ! -e "$VIASH_PAR_MOTIFS_FILE" ]; then + ViashError "Input file '$VIASH_PAR_MOTIFS_FILE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENOME_FASTA")" ) + VIASH_PAR_GENOME_FASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_GENOME_FASTA") +fi +if [ ! -z "$VIASH_PAR_ANNOTATION_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ANNOTATION_GTF")" ) + VIASH_PAR_ANNOTATION_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_ANNOTATION_GTF") +fi +if [ ! -z "$VIASH_PAR_MOTIFS_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MOTIFS_FILE")" ) + VIASH_PAR_MOTIFS_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_MOTIFS_FILE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-build_cellranger_arc_reference-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_ANNOTATION_GTF+x} ]; then echo "${VIASH_PAR_ANNOTATION_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_annotation_gtf='&'#" ; else echo "# par_annotation_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_MOTIFS_FILE+x} ]; then echo "${VIASH_PAR_MOTIFS_FILE}" | sed "s#'#'\"'\"'#g;s#.*#par_motifs_file='&'#" ; else echo "# par_motifs_file="; fi ) +$( if [ ! -z ${VIASH_PAR_NON_NUCLEAR_CONTIGS+x} ]; then echo "${VIASH_PAR_NON_NUCLEAR_CONTIGS}" | sed "s#'#'\"'\"'#g;s#.*#par_non_nuclear_contigs='&'#" ; else echo "# par_non_nuclear_contigs="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOME+x} ]; then echo "${VIASH_PAR_GENOME}" | sed "s#'#'\"'\"'#g;s#.*#par_genome='&'#" ; else echo "# par_genome="; fi ) +$( if [ ! -z ${VIASH_PAR_ORGANISM+x} ]; then echo "${VIASH_PAR_ORGANISM}" | sed "s#'#'\"'\"'#g;s#.*#par_organism='&'#" ; else echo "# par_organism="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSET_REGEX+x} ]; then echo "${VIASH_PAR_SUBSET_REGEX}" | sed "s#'#'\"'\"'#g;s#.*#par_subset_regex='&'#" ; else echo "# par_subset_regex="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +echo "> Getting path of fasta file" +par_genome_fasta=\`realpath \$par_genome_fasta\` +echo "> Getting path of annotation file" +par_annotation_gtf=\`realpath \$par_annotation_gtf\` +echo "> Getting path of output file" +par_output=\`realpath \$par_output\` +echo "> Getting path of motifs file" +par_motifs_file=\`realpath \$par_motifs_file\` + +# process params +extra_params=( ) + +if [ ! -z "\$meta_cpus" ]; then + extra_params+=( "nthreads: \\"\$meta_cpus"\\" ) +fi +if [ ! -z "\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\`python -c "print(int('\$meta_memory_gb') - 2)"\` + extra_params+=( "memgb: \\"\$memory_gb"\\" ) +fi + +echo "> Unzipping input files" +unpigz -c "\$par_genome_fasta" > "\$tmpdir/genome.fa" + +echo "> Building star index" +cd "\$tmpdir" + +echo "> Building config" +config_in="\${tmpdir}/config" + +# If non_nuclear_contigs is not set or bash thinks it is a flag, set it to an empty string +if [[ -z \$par_non_nuclear_contigs || \$par_non_nuclear_contigs == "--non_nuclear_contigs" ]]; then + non_nuclear_contigs="" +else + printf -v non_nuclear_contigs '"%s",' "\${par_non_nuclear_contigs[@]}" + non_nuclear_contigs="[\${non_nuclear_contigs%,}]" # remove trailing comma +fi + +echo """{ + \${par_organism:+organism: \\"\$par_organism\\"} + genome: [\\"\${par_genome}\\"] + input_fasta: [\\""\${tmpdir}/genome.fa"\\"] + input_gtf: [\\""\${par_annotation_gtf}\\""] + \${non_nuclear_contigs:+non_nuclear_contigs: "\${non_nuclear_contigs}"} + input_motifs: \\""\$par_motifs_file"\\" + \$(printf "%s\\n" "\${extra_params[@]}") +}""" > "\$config_in" + +echo "> Config content:" +cat \${config_in} + +echo "> Running cellranger" +cellranger-arc mkref --config=\${config_in} + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "\$par_output" -C "\${tmpdir}/\${par_genome}" . +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA=$(ViashDockerStripAutomount "$VIASH_PAR_GENOME_FASTA") + fi + if [ ! -z "$VIASH_PAR_ANNOTATION_GTF" ]; then + VIASH_PAR_ANNOTATION_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_ANNOTATION_GTF") + fi + if [ ! -z "$VIASH_PAR_MOTIFS_FILE" ]; then + VIASH_PAR_MOTIFS_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_MOTIFS_FILE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/build_cellranger_arc_reference/nextflow_labels.config b/target/executable/reference/build_cellranger_arc_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/build_cellranger_arc_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/reference/build_cellranger_reference/.config.vsh.yaml b/target/executable/reference/build_cellranger_reference/.config.vsh.yaml new file mode 100644 index 00000000..cda53af3 --- /dev/null +++ b/target/executable/reference/build_cellranger_reference/.config.vsh.yaml @@ -0,0 +1,263 @@ +name: "build_cellranger_reference" +namespace: "reference" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + description: "Reference transcriptome annotation." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_version" + description: "Optional reference version string to include with reference" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output folder" + info: null + example: + - "cellranger_reference" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Build a Cell Ranger-compatible reference folder from user-supplied genome\ + \ FASTA and gene GTF files. Creates a new folder named after the genome." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + - "wget" + interactive: false + - type: "docker" + run: + - "TARGETARCH=\"${TARGETARCH:-$(dpkg --print-architecture)}\" && \\\nTARGETOS=\"\ + ${TARGETOS:-linux}\" && \\\nPATH=\"${PATH}:/usr/local/go/bin\" && \\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz\ + \ && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\nrm\ + \ go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git\ + \ && \\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ &&\ + \ rm -rf seqkit && rm -r /usr/local/go\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_cellranger_reference/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/build_cellranger_reference" + executable: "target/executable/reference/build_cellranger_reference/build_cellranger_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/build_cellranger_reference/build_cellranger_reference b/target/executable/reference/build_cellranger_reference/build_cellranger_reference new file mode 100755 index 00000000..c75dbbea --- /dev/null +++ b/target/executable/reference/build_cellranger_reference/build_cellranger_reference @@ -0,0 +1,1186 @@ +#!/usr/bin/env bash + +# build_cellranger_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="build_cellranger_reference" +VIASH_META_FUNCTIONALITY_NAME="build_cellranger_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger:9.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component reference build_cellranger_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:11Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "build_cellranger_reference main" + echo "" + echo "Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA" + echo "and gene GTF files. Creates a new folder named after the genome." + echo "" + echo "Arguments:" + echo " --genome_fasta" + echo " type: file, required parameter, file must exist" + echo " example: genome_sequence.fa.gz" + echo " Reference genome fasta." + echo "" + echo " --transcriptome_gtf" + echo " type: file, required parameter, file must exist" + echo " example: transcriptome_annotation.gtf.gz" + echo " Reference transcriptome annotation." + echo "" + echo " --reference_version" + echo " type: string" + echo " Optional reference version string to include with reference" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: cellranger_reference" + echo " Output folder" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "build_cellranger_reference main" + exit + ;; + --genome_fasta) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome_fasta=*) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta=*\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --transcriptome_gtf) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --transcriptome_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome_gtf=*) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf=*\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --reference_version) + [ -n "$VIASH_PAR_REFERENCE_VERSION" ] && ViashError Bad arguments for option \'--reference_version\': \'$VIASH_PAR_REFERENCE_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VERSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reference_version. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reference_version=*) + [ -n "$VIASH_PAR_REFERENCE_VERSION" ] && ViashError Bad arguments for option \'--reference_version=*\': \'$VIASH_PAR_REFERENCE_VERSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_REFERENCE_VERSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/build_cellranger_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GENOME_FASTA+x} ]; then + ViashError '--genome_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then + ViashError '--transcriptome_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ] && [ ! -e "$VIASH_PAR_GENOME_FASTA" ]; then + ViashError "Input file '$VIASH_PAR_GENOME_FASTA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && [ ! -e "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + ViashError "Input file '$VIASH_PAR_TRANSCRIPTOME_GTF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENOME_FASTA")" ) + VIASH_PAR_GENOME_FASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_GENOME_FASTA") +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TRANSCRIPTOME_GTF")" ) + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_TRANSCRIPTOME_GTF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-build_cellranger_reference-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_transcriptome_gtf='&'#" ; else echo "# par_transcriptome_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE_VERSION+x} ]; then echo "${VIASH_PAR_REFERENCE_VERSION}" | sed "s#'#'\"'\"'#g;s#.*#par_reference_version='&'#" ; else echo "# par_reference_version="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_genome_fasta=\`realpath \$par_genome_fasta\` +par_transcriptome_gtf=\`realpath \$par_transcriptome_gtf\` +par_output=\`realpath \$par_output\` + + +echo "> Unzipping input files" +unpigz -c "\$par_genome_fasta" > "\$tmpdir/genome.fa" + +echo "> Building star index" +cd "\$tmpdir" +cellranger mkref \\ + --fasta "\$tmpdir/genome.fa" \\ + --genes "\$par_transcriptome_gtf" \\ + --genome output \\ + \${par_reference_version:+--ref-version \$par_reference_version} \\ + \${meta_cpus:+--nthreads \$meta_cpus} \\ + \${meta_memory_gb:+--memgb \$((\$meta_memory_gb-2))} # always keep 2 gb for the OS itseld + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "\$par_output" -C "\$tmpdir/output" . +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA=$(ViashDockerStripAutomount "$VIASH_PAR_GENOME_FASTA") + fi + if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_TRANSCRIPTOME_GTF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/build_cellranger_reference/nextflow_labels.config b/target/executable/reference/build_cellranger_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/build_cellranger_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/reference/build_star_reference/.config.vsh.yaml b/target/executable/reference/build_star_reference/.config.vsh.yaml new file mode 100644 index 00000000..db012789 --- /dev/null +++ b/target/executable/reference/build_star_reference/.config.vsh.yaml @@ -0,0 +1,257 @@ +name: "build_star_reference" +namespace: "reference" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--genome_fasta" + alternatives: + - "--genomeFastaFiles" + description: "The fasta files to be included in the reference. Corresponds to\ + \ the --genomeFastaFiles argument in the STAR command." + info: null + example: + - "chr1.fasta" + - "chr2.fasta" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + alternatives: + - "--sjdbGTFfile" + description: "Specifies the path to the file with annotated transcripts in the\ + \ standard GTF\nformat. STAR will extract splice junctions from this file and\ + \ use them to greatly improve\naccuracy of the mapping. Corresponds to the --sjdbGTFfile\ + \ argument in the STAR command.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--genomeDir" + description: "Path to output directory. Corresponds to the --genomeDir argument\ + \ in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Genome indexing arguments" + arguments: + - type: "integer" + name: "--genomeSAindexNbases" + description: "Length (bases) of the SA pre-indexing string. Typically between\ + \ 10 and 15.\nLonger strings will use much more memory, but allow faster searches.\ + \ For small\ngenomes, the parameter {genomeSAindexNbases must be scaled down\ + \ to\nmin(14, log2(GenomeLength)/2 - 1).\n" + info: null + default: + - 14 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a reference for STAR from a set of fasta files." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_star_reference/config.vsh.yml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/build_star_reference" + executable: "target/executable/reference/build_star_reference/build_star_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/build_star_reference/build_star_reference b/target/executable/reference/build_star_reference/build_star_reference new file mode 100755 index 00000000..21feb4ea --- /dev/null +++ b/target/executable/reference/build_star_reference/build_star_reference @@ -0,0 +1,1390 @@ +#!/usr/bin/env bash + +# build_star_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="build_star_reference" +VIASH_META_FUNCTIONALITY_NAME="build_star_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +ENV STAR_VERSION 2.7.10b +ENV PACKAGES gcc g++ make wget zlib1g-dev unzip +RUN apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component reference build_star_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "build_star_reference main" + echo "" + echo "Create a reference for STAR from a set of fasta files." + echo "" + echo "Input/Output:" + echo " --genomeFastaFiles, --genome_fasta" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: chr1.fasta;chr2.fasta" + echo " The fasta files to be included in the reference. Corresponds to the" + echo " --genomeFastaFiles argument in the STAR command." + echo "" + echo " --sjdbGTFfile, --transcriptome_gtf" + echo " type: file, file must exist" + echo " Specifies the path to the file with annotated transcripts in the" + echo " standard GTF" + echo " format. STAR will extract splice junctions from this file and use them" + echo " to greatly improve" + echo " accuracy of the mapping. Corresponds to the --sjdbGTFfile argument in" + echo " the STAR command." + echo "" + echo " --genomeDir, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: /path/to/foo" + echo " Path to output directory. Corresponds to the --genomeDir argument in the" + echo " STAR command." + echo "" + echo "Genome indexing arguments:" + echo " --genomeSAindexNbases" + echo " type: integer" + echo " default: 14" + echo " Length (bases) of the SA pre-indexing string. Typically between 10 and" + echo " 15." + echo " Longer strings will use much more memory, but allow faster searches. For" + echo " small" + echo " genomes, the parameter {genomeSAindexNbases must be scaled down to" + echo " min(14, log2(GenomeLength)/2 - 1)." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "build_star_reference main" + exit + ;; + --genome_fasta) + if [ -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA="$2" + else + VIASH_PAR_GENOME_FASTA="$VIASH_PAR_GENOME_FASTA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome_fasta=*) + if [ -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENOME_FASTA="$VIASH_PAR_GENOME_FASTA;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeFastaFiles) + if [ -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA="$2" + else + VIASH_PAR_GENOME_FASTA="$VIASH_PAR_GENOME_FASTA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeFastaFiles. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome_gtf) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --transcriptome_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome_gtf=*) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf=*\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --sjdbGTFfile) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--sjdbGTFfile\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --sjdbGTFfile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --genomeDir) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--genomeDir\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeSAindexNbases) + [ -n "$VIASH_PAR_GENOMESAINDEXNBASES" ] && ViashError Bad arguments for option \'--genomeSAindexNbases\': \'$VIASH_PAR_GENOMESAINDEXNBASES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMESAINDEXNBASES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeSAindexNbases. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeSAindexNbases=*) + [ -n "$VIASH_PAR_GENOMESAINDEXNBASES" ] && ViashError Bad arguments for option \'--genomeSAindexNbases=*\': \'$VIASH_PAR_GENOMESAINDEXNBASES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMESAINDEXNBASES=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/build_star_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GENOME_FASTA+x} ]; then + ViashError '--genome_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_GENOMESAINDEXNBASES+x} ]; then + VIASH_PAR_GENOMESAINDEXNBASES="14" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENOME_FASTA; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && [ ! -e "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + ViashError "Input file '$VIASH_PAR_TRANSCRIPTOME_GTF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_GENOMESAINDEXNBASES" ]]; then + if ! [[ "$VIASH_PAR_GENOMESAINDEXNBASES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--genomeSAindexNbases' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_TEST_GENOME_FASTA=() + IFS=';' + for var in $VIASH_PAR_GENOME_FASTA; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENOME_FASTA+=( "$var" ) + done + VIASH_PAR_GENOME_FASTA=$(IFS=';' ; echo "${VIASH_TEST_GENOME_FASTA[*]}") +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TRANSCRIPTOME_GTF")" ) + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_TRANSCRIPTOME_GTF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-build_star_reference-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'genome_fasta': $( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "r'${VIASH_PAR_GENOME_FASTA//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'transcriptome_gtf': $( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "r'${VIASH_PAR_TRANSCRIPTOME_GTF//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'genomeSAindexNbases': $( if [ ! -z ${VIASH_PAR_GENOMESAINDEXNBASES+x} ]; then echo "int(r'${VIASH_PAR_GENOMESAINDEXNBASES//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\x1f\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \`processPar()\` generator needs to be adapted +to_rename = { + "genome_fasta": "genomeFastaFiles", + "output": "genomeDir", + "transcriptome_gtf": "sjdbGTFfile", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \`to_rename\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["genomeDir"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeFastaFiles", "sjdbGTFfile"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "genomeGenerate" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + unset VIASH_TEST_GENOME_FASTA + IFS=';' + for var in $VIASH_PAR_GENOME_FASTA; do + unset IFS + if [ -z "$VIASH_TEST_GENOME_FASTA" ]; then + VIASH_TEST_GENOME_FASTA="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENOME_FASTA="$VIASH_TEST_GENOME_FASTA;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENOME_FASTA="$VIASH_TEST_GENOME_FASTA" + fi + if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_TRANSCRIPTOME_GTF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/build_star_reference/nextflow_labels.config b/target/executable/reference/build_star_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/build_star_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/reference/cellranger_mkgtf/.config.vsh.yaml b/target/executable/reference/cellranger_mkgtf/.config.vsh.yaml new file mode 100644 index 00000000..ee56a619 --- /dev/null +++ b/target/executable/reference/cellranger_mkgtf/.config.vsh.yaml @@ -0,0 +1,231 @@ +name: "cellranger_mkgtf" +namespace: "reference" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_gtf" + description: "Reference GTF annotation." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_gtf" + description: "Output GTF file." + info: null + example: + - "output.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--attribute" + description: "Key-value pair in attributes field to be kept in the GTF file of\ + \ the format attribute:attribute_value." + info: null + example: + - "gene_type:transcribed_unprocessed_pseudogene" + - "gene_type:miRNA" + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Make a GTF file - filter by a specific attribute." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y pigz procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/cellranger_mkgtf/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/cellranger_mkgtf" + executable: "target/executable/reference/cellranger_mkgtf/cellranger_mkgtf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/cellranger_mkgtf/cellranger_mkgtf b/target/executable/reference/cellranger_mkgtf/cellranger_mkgtf new file mode 100755 index 00000000..ffec1e81 --- /dev/null +++ b/target/executable/reference/cellranger_mkgtf/cellranger_mkgtf @@ -0,0 +1,1169 @@ +#!/usr/bin/env bash + +# cellranger_mkgtf main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cellranger_mkgtf" +VIASH_META_FUNCTIONALITY_NAME="cellranger_mkgtf" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ghcr.io/data-intuitive/cellranger:9.0 +ENTRYPOINT [] +RUN DEBIAN_FRONTEND=noninteractive apt update && \ +apt upgrade -y && apt install -y pigz procps && rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component reference cellranger_mkgtf" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cellranger_mkgtf main" + echo "" + echo "Make a GTF file - filter by a specific attribute." + echo "" + echo "Arguments:" + echo " --input_gtf" + echo " type: file, required parameter, file must exist" + echo " example: transcriptome_annotation.gtf.gz" + echo " Reference GTF annotation." + echo "" + echo " --output_gtf" + echo " type: file, required parameter, output, file must exist" + echo " example: output.gtf.gz" + echo " Output GTF file." + echo "" + echo " --attribute" + echo " type: string, required parameter, multiple values allowed" + echo " example: gene_type:transcribed_unprocessed_pseudogene;gene_type:miRNA" + echo " Key-value pair in attributes field to be kept in the GTF file of the" + echo " format attribute:attribute_value." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cellranger_mkgtf main" + exit + ;; + --input_gtf) + [ -n "$VIASH_PAR_INPUT_GTF" ] && ViashError Bad arguments for option \'--input_gtf\': \'$VIASH_PAR_INPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_gtf=*) + [ -n "$VIASH_PAR_INPUT_GTF" ] && ViashError Bad arguments for option \'--input_gtf=*\': \'$VIASH_PAR_INPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_gtf) + [ -n "$VIASH_PAR_OUTPUT_GTF" ] && ViashError Bad arguments for option \'--output_gtf\': \'$VIASH_PAR_OUTPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_gtf=*) + [ -n "$VIASH_PAR_OUTPUT_GTF" ] && ViashError Bad arguments for option \'--output_gtf=*\': \'$VIASH_PAR_OUTPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --attribute) + if [ -z "$VIASH_PAR_ATTRIBUTE" ]; then + VIASH_PAR_ATTRIBUTE="$2" + else + VIASH_PAR_ATTRIBUTE="$VIASH_PAR_ATTRIBUTE;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --attribute. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --attribute=*) + if [ -z "$VIASH_PAR_ATTRIBUTE" ]; then + VIASH_PAR_ATTRIBUTE=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ATTRIBUTE="$VIASH_PAR_ATTRIBUTE;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/cellranger_mkgtf:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_GTF+x} ]; then + ViashError '--input_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then + ViashError '--output_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ATTRIBUTE+x} ]; then + ViashError '--attribute' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_GTF" ] && [ ! -e "$VIASH_PAR_INPUT_GTF" ]; then + ViashError "Input file '$VIASH_PAR_INPUT_GTF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_GTF")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_GTF")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT_GTF")" ) + VIASH_PAR_INPUT_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT_GTF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_GTF")" ) + VIASH_PAR_OUTPUT_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_GTF") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_GTF" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cellranger_mkgtf-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT_GTF+x} ]; then echo "${VIASH_PAR_INPUT_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_input_gtf='&'#" ; else echo "# par_input_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then echo "${VIASH_PAR_OUTPUT_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_output_gtf='&'#" ; else echo "# par_output_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_ATTRIBUTE+x} ]; then echo "${VIASH_PAR_ATTRIBUTE}" | sed "s#'#'\"'\"'#g;s#.*#par_attribute='&'#" ; else echo "# par_attribute="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +echo $VIASH_TEMP +mkdir -p "$VIASH_TEMP" +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_input_gtf=\`realpath \$par_input_gtf\` +par_output_gtf=\`realpath \$par_output_gtf\` + +echo "> Unzipping input files" +unpigz -c "\$par_input_gtf" > "\$tmpdir/input_gtf.gtf" + +echo "\${par_attribute}" + +echo "> Building gtf" +cd "\$tmpdir" +# Start the cellranger mkgtf command +IFS=';' read -r -a attributes <<< "\$par_attribute" +cmd="cellranger mkgtf \\"\$tmpdir/input_gtf.gtf\\" \\"\$tmpdir/output.gtf\\"" +# Append each key-value pair as a separate --attribute argument +for attribute in "\${attributes[@]}"; do + cmd+=" --attribute=\$attribute" +done +# Execute the command +eval \$cmd + +echo "> Creating archive" +pigz -k "\$tmpdir/output.gtf" +mv "\$tmpdir/output.gtf.gz" "\$par_output_gtf" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_GTF" ]; then + VIASH_PAR_INPUT_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT_GTF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ]; then + VIASH_PAR_OUTPUT_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_GTF") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ] && [ ! -e "$VIASH_PAR_OUTPUT_GTF" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_GTF' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/cellranger_mkgtf/nextflow_labels.config b/target/executable/reference/cellranger_mkgtf/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/cellranger_mkgtf/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/reference/make_reference/.config.vsh.yaml b/target/executable/reference/make_reference/.config.vsh.yaml new file mode 100644 index 00000000..92478e3f --- /dev/null +++ b/target/executable/reference/make_reference/.config.vsh.yaml @@ -0,0 +1,280 @@ +name: "make_reference" +namespace: "reference" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta. Example: " + info: null + example: + - "genome_fasta.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + description: "Reference transcriptome annotation." + info: null + example: + - "transcriptome.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--ercc" + description: "ERCC sequence and annotation file." + info: null + example: + - "ercc.zip" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--subset_regex" + description: "Will subset the reference chromosomes using the given regex." + info: null + example: + - "(ERCC-00002|chr1)" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_fasta" + description: "Output genome sequence fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_gtf" + description: "Output transcriptome annotation gtf." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Preprocess and build a transcriptome reference.\n\nExample input files\ + \ are:\n - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz\n\ + \ - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz\n\ + \ - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "pigz" + - "seqkit" + - "curl" + - "wget" + - "unzip" + - "file" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/make_reference/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/reference/make_reference" + executable: "target/executable/reference/make_reference/make_reference" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/reference/make_reference/make_reference b/target/executable/reference/make_reference/make_reference new file mode 100755 index 00000000..4a963746 --- /dev/null +++ b/target/executable/reference/make_reference/make_reference @@ -0,0 +1,1292 @@ +#!/usr/bin/env bash + +# make_reference main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Angela Oliveira Pisco (author) +# * Robrecht Cannoodt (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="make_reference" +VIASH_META_FUNCTIONALITY_NAME="make_reference" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM ubuntu:22.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y pigz seqkit curl wget unzip file && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Angela Oliveira Pisco, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component reference make_reference" +LABEL org.opencontainers.image.created="2025-08-22T07:23:12Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "make_reference main" + echo "" + echo "Preprocess and build a transcriptome reference." + echo "" + echo "Example input files are:" + echo " - \`genome_fasta\`:" + echo "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" + echo " - \`transcriptome_gtf\`:" + echo "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" + echo " - \`ercc\`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" + echo "" + echo "Arguments:" + echo " --genome_fasta" + echo " type: file, required parameter, file must exist" + echo " example: genome_fasta.fa.gz" + echo " Reference genome fasta. Example:" + echo "" + echo " --transcriptome_gtf" + echo " type: file, required parameter, file must exist" + echo " example: transcriptome.gtf.gz" + echo " Reference transcriptome annotation." + echo "" + echo " --ercc" + echo " type: file, file must exist" + echo " example: ercc.zip" + echo " ERCC sequence and annotation file." + echo "" + echo " --subset_regex" + echo " type: string" + echo " example: (ERCC-00002|chr1)" + echo " Will subset the reference chromosomes using the given regex." + echo "" + echo " --output_fasta" + echo " type: file, required parameter, output, file must exist" + echo " example: genome_sequence.fa.gz" + echo " Output genome sequence fasta." + echo "" + echo " --output_gtf" + echo " type: file, required parameter, output, file must exist" + echo " example: transcriptome_annotation.gtf.gz" + echo " Output transcriptome annotation gtf." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "make_reference main" + exit + ;; + --genome_fasta) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genome_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genome_fasta=*) + [ -n "$VIASH_PAR_GENOME_FASTA" ] && ViashError Bad arguments for option \'--genome_fasta=*\': \'$VIASH_PAR_GENOME_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOME_FASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --transcriptome_gtf) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --transcriptome_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome_gtf=*) + [ -n "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && ViashError Bad arguments for option \'--transcriptome_gtf=*\': \'$VIASH_PAR_TRANSCRIPTOME_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --ercc) + [ -n "$VIASH_PAR_ERCC" ] && ViashError Bad arguments for option \'--ercc\': \'$VIASH_PAR_ERCC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ERCC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ercc. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ercc=*) + [ -n "$VIASH_PAR_ERCC" ] && ViashError Bad arguments for option \'--ercc=*\': \'$VIASH_PAR_ERCC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ERCC=$(ViashRemoveFlags "$1") + shift 1 + ;; + --subset_regex) + [ -n "$VIASH_PAR_SUBSET_REGEX" ] && ViashError Bad arguments for option \'--subset_regex\': \'$VIASH_PAR_SUBSET_REGEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSET_REGEX="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --subset_regex. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --subset_regex=*) + [ -n "$VIASH_PAR_SUBSET_REGEX" ] && ViashError Bad arguments for option \'--subset_regex=*\': \'$VIASH_PAR_SUBSET_REGEX\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SUBSET_REGEX=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_fasta) + [ -n "$VIASH_PAR_OUTPUT_FASTA" ] && ViashError Bad arguments for option \'--output_fasta\': \'$VIASH_PAR_OUTPUT_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_fasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_fasta=*) + [ -n "$VIASH_PAR_OUTPUT_FASTA" ] && ViashError Bad arguments for option \'--output_fasta=*\': \'$VIASH_PAR_OUTPUT_FASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_gtf) + [ -n "$VIASH_PAR_OUTPUT_GTF" ] && ViashError Bad arguments for option \'--output_gtf\': \'$VIASH_PAR_OUTPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_gtf=*) + [ -n "$VIASH_PAR_OUTPUT_GTF" ] && ViashError Bad arguments for option \'--output_gtf=*\': \'$VIASH_PAR_OUTPUT_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/reference/make_reference:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GENOME_FASTA+x} ]; then + ViashError '--genome_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then + ViashError '--transcriptome_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_FASTA+x} ]; then + ViashError '--output_fasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then + ViashError '--output_gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ] && [ ! -e "$VIASH_PAR_GENOME_FASTA" ]; then + ViashError "Input file '$VIASH_PAR_GENOME_FASTA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ] && [ ! -e "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + ViashError "Input file '$VIASH_PAR_TRANSCRIPTOME_GTF' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ERCC" ] && [ ! -e "$VIASH_PAR_ERCC" ]; then + ViashError "Input file '$VIASH_PAR_ERCC' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_FASTA" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_FASTA")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_FASTA")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_GTF")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_GTF")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENOME_FASTA")" ) + VIASH_PAR_GENOME_FASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_GENOME_FASTA") +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TRANSCRIPTOME_GTF")" ) + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_TRANSCRIPTOME_GTF") +fi +if [ ! -z "$VIASH_PAR_ERCC" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ERCC")" ) + VIASH_PAR_ERCC=$(ViashDockerAutodetectMount "$VIASH_PAR_ERCC") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_FASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_FASTA")" ) + VIASH_PAR_OUTPUT_FASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_FASTA") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_FASTA" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_GTF")" ) + VIASH_PAR_OUTPUT_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_GTF") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_GTF" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-make_reference-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_transcriptome_gtf='&'#" ; else echo "# par_transcriptome_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_ERCC+x} ]; then echo "${VIASH_PAR_ERCC}" | sed "s#'#'\"'\"'#g;s#.*#par_ercc='&'#" ; else echo "# par_ercc="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSET_REGEX+x} ]; then echo "${VIASH_PAR_SUBSET_REGEX}" | sed "s#'#'\"'\"'#g;s#.*#par_subset_regex='&'#" ; else echo "# par_subset_regex="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_FASTA+x} ]; then echo "${VIASH_PAR_OUTPUT_FASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_output_fasta='&'#" ; else echo "# par_output_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then echo "${VIASH_PAR_OUTPUT_GTF}" | sed "s#'#'\"'\"'#g;s#.*#par_output_gtf='&'#" ; else echo "# par_output_gtf="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\$(mktemp -d "$VIASH_TEMP/\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\$tmpdir" +} +trap clean_up EXIT + +echo "> Getting path of fasta file" +par_genome_fasta=\$(realpath \$par_genome_fasta) +echo "> Getting path of annotation file" +par_transcriptome_gtf=\$(realpath \$par_transcriptome_gtf) + +echo "> Processing genome sequence" +genome_fasta="\$tmpdir/genome_sequence.fa" +# if genome is gzipped, extract. otherwise not +if file --mime-type "\$par_genome_fasta" | grep -q gzip\$; then + zcat "\$par_genome_fasta" > "\$genome_fasta" +else + cp "\$par_genome_fasta" "\$genome_fasta" +fi + +echo "> Processing transcriptome annotation" +transcriptome_gtf="\$tmpdir/transcriptome_annotation.gtf" +# if transcriptome is gzipped, extract. otherwise not +if file --mime-type "\$par_transcriptome_gtf" | grep -q gzip\$; then + zcat "\$par_transcriptome_gtf" > "\$transcriptome_gtf" +else + cp "\$par_transcriptome_gtf" "\$transcriptome_gtf" +fi + +if [[ ! -z \$par_ercc ]]; then + echo "> Processing ERCC sequences" + # wget "\$par_ercc" -O "\$tmpdir/ercc.zip" + # unzip "\$tmpdir/ercc.zip" -d "\$tmpdir" + unzip "\$par_ercc" -d "\$tmpdir" + cat "\$tmpdir/ERCC92.fa" >> "\$genome_fasta" + cat "\$tmpdir/ERCC92.gtf" >> "\$transcriptome_gtf" +fi + +# create output & filter reference if so desired +if [[ ! -z \$par_subset_regex ]]; then + echo "> Subsetting reference with regex '\$par_subset_regex'" + awk '{print \$1}' "\$genome_fasta" | seqkit grep -r -p "^\$par_subset_regex\\\$" > "\$tmpdir/genome_sequence_filtered.fa" + genome_fasta="\$tmpdir/genome_sequence_filtered.fa" + grep -E "^\$par_subset_regex[^A-Za-z0-9]" "\$transcriptome_gtf" > "\$tmpdir/transcriptome_annotation_filtered.gtf" + transcriptome_gtf="\$tmpdir/transcriptome_annotation_filtered.gtf" + + echo + echo "Matched tags:" + cat "\$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq + echo +fi + +echo "> Gzipping outputs" +pigz -c "\$genome_fasta" > "\$par_output_fasta" +pigz -c "\$transcriptome_gtf" > "\$par_output_gtf" + +# to do: re enable +# echo "> Sanity check of outputs" +# readarray -t fasta_tags < <( cat "\$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq ) +# readarray -t transcriptome_tags < <( cat "\$transcriptome_gtf" | cut -d\$'\\t' -f1 | sort | uniq | grep '^[^#]' ) +# [ "\${fasta_tags[*]}" == "\${transcriptome_tags[*]}" ] || { echo "Warning: fasta tags differ from transcriptome tags"; exit 1; } +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GENOME_FASTA" ]; then + VIASH_PAR_GENOME_FASTA=$(ViashDockerStripAutomount "$VIASH_PAR_GENOME_FASTA") + fi + if [ ! -z "$VIASH_PAR_TRANSCRIPTOME_GTF" ]; then + VIASH_PAR_TRANSCRIPTOME_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_TRANSCRIPTOME_GTF") + fi + if [ ! -z "$VIASH_PAR_ERCC" ]; then + VIASH_PAR_ERCC=$(ViashDockerStripAutomount "$VIASH_PAR_ERCC") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_FASTA" ]; then + VIASH_PAR_OUTPUT_FASTA=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_FASTA") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ]; then + VIASH_PAR_OUTPUT_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_GTF") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_FASTA" ] && [ ! -e "$VIASH_PAR_OUTPUT_FASTA" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_FASTA' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_GTF" ] && [ ! -e "$VIASH_PAR_OUTPUT_GTF" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_GTF' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/reference/make_reference/nextflow_labels.config b/target/executable/reference/make_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/reference/make_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/report/mermaid/.config.vsh.yaml b/target/executable/report/mermaid/.config.vsh.yaml new file mode 100644 index 00000000..fb5112bb --- /dev/null +++ b/target/executable/report/mermaid/.config.vsh.yaml @@ -0,0 +1,252 @@ +name: "mermaid" +namespace: "report" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input directory" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Generated network as output." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_format" + description: "Output format for the generated image. By default will be inferred\ + \ from the extension \nof the file specified with --output.\n" + info: null + required: false + choices: + - "svg" + - "png" + - "pdf" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--width" + description: "Width of the page" + info: null + default: + - 800 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--height" + description: "Height of the page" + info: null + default: + - 600 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--background_color" + description: "Background color for pngs/svgs (not pdfs)" + info: null + example: + - "#F0F0F0" + default: + - "white" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "puppeteer-config.json" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generates a network from mermaid code.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "node:20-bullseye" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "javascript" + npm: + - "@mermaid-js/mermaid-cli" + - type: "apt" + packages: + - "chromium" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/report/mermaid/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/report/mermaid" + executable: "target/executable/report/mermaid/mermaid" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/report/mermaid/mermaid b/target/executable/report/mermaid/mermaid new file mode 100755 index 00000000..f8844147 --- /dev/null +++ b/target/executable/report/mermaid/mermaid @@ -0,0 +1,1232 @@ +#!/usr/bin/env bash + +# mermaid main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="mermaid" +VIASH_META_FUNCTIONALITY_NAME="mermaid" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM node:20-bullseye +ENTRYPOINT [] +RUN npm install -g "@mermaid-js/mermaid-cli" + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y chromium && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Dries De Maeyer" +LABEL org.opencontainers.image.description="Companion container for running component report mermaid" +LABEL org.opencontainers.image.created="2025-08-22T07:23:15Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "mermaid main" + echo "" + echo "Generates a network from mermaid code." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Input directory" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Generated network as output." + echo "" + echo " --output_format" + echo " type: string" + echo " choices: [ svg, png, pdf ]" + echo " Output format for the generated image. By default will be inferred from" + echo " the extension" + echo " of the file specified with --output." + echo "" + echo " --width" + echo " type: integer" + echo " default: 800" + echo " Width of the page" + echo "" + echo " --height" + echo " type: integer" + echo " default: 600" + echo " Height of the page" + echo "" + echo " --background_color" + echo " type: string" + echo " default: white" + echo " example: #F0F0F0" + echo " Background color for pngs/svgs (not pdfs)" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "mermaid main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_format) + [ -n "$VIASH_PAR_OUTPUT_FORMAT" ] && ViashError Bad arguments for option \'--output_format\': \'$VIASH_PAR_OUTPUT_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FORMAT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_format. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_format=*) + [ -n "$VIASH_PAR_OUTPUT_FORMAT" ] && ViashError Bad arguments for option \'--output_format=*\': \'$VIASH_PAR_OUTPUT_FORMAT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_FORMAT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --width) + [ -n "$VIASH_PAR_WIDTH" ] && ViashError Bad arguments for option \'--width\': \'$VIASH_PAR_WIDTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WIDTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --width. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --width=*) + [ -n "$VIASH_PAR_WIDTH" ] && ViashError Bad arguments for option \'--width=*\': \'$VIASH_PAR_WIDTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WIDTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --height) + [ -n "$VIASH_PAR_HEIGHT" ] && ViashError Bad arguments for option \'--height\': \'$VIASH_PAR_HEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HEIGHT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --height. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --height=*) + [ -n "$VIASH_PAR_HEIGHT" ] && ViashError Bad arguments for option \'--height=*\': \'$VIASH_PAR_HEIGHT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HEIGHT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --background_color) + [ -n "$VIASH_PAR_BACKGROUND_COLOR" ] && ViashError Bad arguments for option \'--background_color\': \'$VIASH_PAR_BACKGROUND_COLOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BACKGROUND_COLOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --background_color. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --background_color=*) + [ -n "$VIASH_PAR_BACKGROUND_COLOR" ] && ViashError Bad arguments for option \'--background_color=*\': \'$VIASH_PAR_BACKGROUND_COLOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BACKGROUND_COLOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/report/mermaid:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_WIDTH+x} ]; then + VIASH_PAR_WIDTH="800" +fi +if [ -z ${VIASH_PAR_HEIGHT+x} ]; then + VIASH_PAR_HEIGHT="600" +fi +if [ -z ${VIASH_PAR_BACKGROUND_COLOR+x} ]; then + VIASH_PAR_BACKGROUND_COLOR="white" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_WIDTH" ]]; then + if ! [[ "$VIASH_PAR_WIDTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--width' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_HEIGHT" ]]; then + if ! [[ "$VIASH_PAR_HEIGHT" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--height' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_FORMAT" ]; then + VIASH_PAR_OUTPUT_FORMAT_CHOICES=("svg;png;pdf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_FORMAT_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_FORMAT;" ]]; then + ViashError '--output_format' specified value of \'$VIASH_PAR_OUTPUT_FORMAT\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-mermaid-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_FORMAT+x} ]; then echo "${VIASH_PAR_OUTPUT_FORMAT}" | sed "s#'#'\"'\"'#g;s#.*#par_output_format='&'#" ; else echo "# par_output_format="; fi ) +$( if [ ! -z ${VIASH_PAR_WIDTH+x} ]; then echo "${VIASH_PAR_WIDTH}" | sed "s#'#'\"'\"'#g;s#.*#par_width='&'#" ; else echo "# par_width="; fi ) +$( if [ ! -z ${VIASH_PAR_HEIGHT+x} ]; then echo "${VIASH_PAR_HEIGHT}" | sed "s#'#'\"'\"'#g;s#.*#par_height='&'#" ; else echo "# par_height="; fi ) +$( if [ ! -z ${VIASH_PAR_BACKGROUND_COLOR+x} ]; then echo "${VIASH_PAR_BACKGROUND_COLOR}" | sed "s#'#'\"'\"'#g;s#.*#par_background_color='&'#" ; else echo "# par_background_color="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +mmdc -p "\$meta_resources_dir/puppeteer-config.json" \\ + -i "\$par_input" \\ + -o "\$par_output" \\ + --width "\$par_width" \\ + --height "\$par_height" \\ + \${par_background_color:+--backgroundColor \$par_background_color} \\ + \${output_format:+--outputFormat \$par_output_format} +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/report/mermaid/nextflow_labels.config b/target/executable/report/mermaid/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/report/mermaid/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/report/mermaid/puppeteer-config.json b/target/executable/report/mermaid/puppeteer-config.json new file mode 100644 index 00000000..7b2851c2 --- /dev/null +++ b/target/executable/report/mermaid/puppeteer-config.json @@ -0,0 +1,6 @@ +{ + "executablePath": "/usr/bin/chromium", + "args": [ + "--no-sandbox" + ] +} \ No newline at end of file diff --git a/target/executable/scgpt/binning/.config.vsh.yaml b/target/executable/scgpt/binning/.config.vsh.yaml new file mode 100644 index 00000000..0d52918f --- /dev/null +++ b/target/executable/scgpt/binning/.config.vsh.yaml @@ -0,0 +1,326 @@ +name: "binning" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Mudata layer (key from .layers) to use as input data for binning.\ + \ If not specified, .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "The name of the adata.var column containing boolean mask for vocabulary-cross\ + \ checked and/or highly variable genes.\n" + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_input_bins" + description: "The number of bins to discretize the data into. When no value is\ + \ provided, data won't be binned.\n" + info: null + default: + - 51 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output h5mu file containing the binned data. \n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_binned_counts" + description: "The name of the adata layer to write the binned data to.\n" + info: null + default: + - "binned_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Conversion of (pre-processed) expression count data into relative values\ + \ (bins) to address scale differences across sequencing batches.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_preprocessed.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/binning/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/scgpt/binning" + executable: "target/executable/scgpt/binning/binning" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/scgpt/binning/binning b/target/executable/scgpt/binning/binning new file mode 100755 index 00000000..98dc5b48 --- /dev/null +++ b/target/executable/scgpt/binning/binning @@ -0,0 +1,1400 @@ +#!/usr/bin/env bash + +# binning main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Elizabeth Mlynarski (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="binning" +VIASH_META_FUNCTIONALITY_NAME="binning" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.11-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component scgpt binning" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "binning main" + echo "" + echo "Conversion of (pre-processed) expression count data into relative values (bins)" + echo "to address scale differences across sequencing batches." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Mudata layer (key from .layers) to use as input data for binning. If not" + echo " specified, .X is used." + echo "" + echo " --var_input" + echo " type: string" + echo " default: id_in_vocab" + echo " The name of the adata.var column containing boolean mask for" + echo " vocabulary-cross checked and/or highly variable genes." + echo "" + echo " --n_input_bins" + echo " type: integer" + echo " default: 51" + echo " min: 1" + echo " The number of bins to discretize the data into. When no value is" + echo " provided, data won't be binned." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " The output h5mu file containing the binned data." + echo "" + echo " --output_obsm_binned_counts" + echo " type: string" + echo " default: binned_counts" + echo " The name of the adata layer to write the binned data to." + echo "" + echo " --seed" + echo " type: integer" + echo " Seed for random number generation." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "binning main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_input_bins) + [ -n "$VIASH_PAR_N_INPUT_BINS" ] && ViashError Bad arguments for option \'--n_input_bins\': \'$VIASH_PAR_N_INPUT_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INPUT_BINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_input_bins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_input_bins=*) + [ -n "$VIASH_PAR_N_INPUT_BINS" ] && ViashError Bad arguments for option \'--n_input_bins=*\': \'$VIASH_PAR_N_INPUT_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INPUT_BINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obsm_binned_counts) + [ -n "$VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS" ] && ViashError Bad arguments for option \'--output_obsm_binned_counts\': \'$VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obsm_binned_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obsm_binned_counts=*) + [ -n "$VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS" ] && ViashError Bad arguments for option \'--output_obsm_binned_counts=*\': \'$VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seed) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seed=*) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed=*\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/scgpt/binning:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_VAR_INPUT+x} ]; then + VIASH_PAR_VAR_INPUT="id_in_vocab" +fi +if [ -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then + VIASH_PAR_N_INPUT_BINS="51" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS+x} ]; then + VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS="binned_counts" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_N_INPUT_BINS" ]]; then + if ! [[ "$VIASH_PAR_N_INPUT_BINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_input_bins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_N_INPUT_BINS -lt 1 ]]; then + ViashError '--n_input_bins' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEED" ]]; then + if ! [[ "$VIASH_PAR_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-binning-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import numpy as np +from scipy.sparse import csr_matrix +import warnings + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'n_input_bins': $( if [ ! -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_INPUT_BINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obsm_binned_counts': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +if par["seed"]: + np.random.seed(par["seed"]) + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +logger.info("Subsetting data based on highly variable gene and/or cross-checked genes") +adata = subset_vars(adata, par["var_input"]) + +logger.info("Converting the input layer into a CSR matrix") +if not par["input_layer"] or par["input_layer"] == "X": + layer_data = adata.X +else: + layer_data = adata.layers[par["input_layer"]] +layer_data = csr_matrix(layer_data) + +if layer_data.min() < 0: + raise ValueError( + f"Assuming non-negative data, but got min value {layer_data.min()}." + ) + +n_bins = par["n_input_bins"] # NOTE: the first bin is always a spectial for zero +logger.info(f"Binning data into {par['n_input_bins']} bins.") + + +def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: + assert x.ndim == 1 and bins.ndim == 1 + + left_digits = np.digitize(x, bins) + right_difits = np.digitize(x, bins, right=True) + + rands = np.random.rand(len(x)) # uniform random numbers + + digits = rands * (right_difits - left_digits) + left_digits + digits = np.ceil(digits) + smallest_dtype = np.min_scalar_type( + digits.max().astype(np.uint) + ) # Already checked for non-negative values + digits = digits.astype(smallest_dtype) + + return digits + + +with warnings.catch_warnings(): + # Make sure warnings are displayed once. + warnings.simplefilter("once") + # layer_data.indptr.size is the number of rows in the sparse matrix + binned_rows = [] + bin_edges = [] + logger.info( + "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" + ) + for row_number in range(layer_data.indptr.size - 1): + row_start_index, row_end_index = ( + layer_data.indptr[row_number], + layer_data.indptr[row_number + 1], + ) + # These are all non-zero counts in the row + non_zero_row = layer_data.data[row_start_index:row_end_index] + if len(non_zero_row) == 0: + logger.warning( + "The input data contains all zero rows. Please make sure " + "this is expected. You can use the \`filter_cell_by_counts\` " + "arg to filter out all zero rows." + ) + + # Add binned_rows and bin_edges as all 0 + # np.stack will upcast the dtype later + binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8)) + bin_edges.append(np.array([0] * n_bins)) + continue + + # Binning of non-zero values + bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1)) + non_zero_digits = _digitize(non_zero_row, bins) + assert non_zero_digits.min() >= 1 + assert non_zero_digits.max() <= n_bins - 1 + binned_rows.append(non_zero_digits) + + bin_edges.append(np.concatenate([[0], bins])) + +# Create new CSR matrix +logger.info("Creating a new CSR matrix of the binned count values") +binned_counts = csr_matrix( + ( + np.concatenate(binned_rows, casting="same_kind"), + layer_data.indices, + layer_data.indptr, + ), + shape=layer_data.shape, +) + +# Set binned values and bin edges layers to adata object +input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts +input_adata.obsm["bin_edges"] = np.stack(bin_edges) + +# Write mudata output +logger.info("Writing output data") +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/scgpt/binning/nextflow_labels.config b/target/executable/scgpt/binning/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/scgpt/binning/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/scgpt/binning/setup_logger.py b/target/executable/scgpt/binning/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/scgpt/binning/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/scgpt/binning/subset_vars.py b/target/executable/scgpt/binning/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/scgpt/binning/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/scgpt/cell_type_annotation/.config.vsh.yaml b/target/executable/scgpt/cell_type_annotation/.config.vsh.yaml new file mode 100644 index 00000000..96725207 --- /dev/null +++ b/target/executable/scgpt/cell_type_annotation/.config.vsh.yaml @@ -0,0 +1,458 @@ +name: "cell_type_annotation" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Model input" + arguments: + - type: "file" + name: "--model" + description: "The model file containing checkpoints and cell type label mapper.\n" + info: null + example: + - "best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "The model configuration file. \n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Model vocabulary file directory.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pretrained checkpoints.\n" + info: null + default: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--label_mapper_key" + description: "Key in the model file containing the cell type class to label mapper\ + \ dictionary.\n" + info: null + default: + - "id_to_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Query input" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing of data that have been pre-processed\ + \ (normalized, binned, genes cross-checked and tokenized).\n" + info: null + example: + - "scgpt_preprocess_ouput.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_label" + description: "The name of the adata.obs column containing the batch labels. Required\ + \ if dsbn is set to true.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output mudata file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "The name of the adata.obs column to write predicted cell type labels\ + \ to.\n" + info: null + default: + - "scgpt_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "The name of the adata.obs column to write the probabilities of the\ + \ predicted cell type labels to.\n" + info: null + default: + - "scgpt_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The padding token used in the model.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_input_bins" + description: "The number of input bins.\n" + info: null + default: + - 51 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size. \n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--dsbn" + description: "Whether to use domain-specific batch normalization.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation. If not specified, no seed is\ + \ used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Annotate gene expression data with cell type classes through the scGPT\ + \ model.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_tokenized.h5mu" +- type: "file" + path: "args.json" +- type: "file" + path: "vocab.json" +- type: "file" + path: "best_model.pt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/cell_type_annotation/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/scgpt/cell_type_annotation" + executable: "target/executable/scgpt/cell_type_annotation/cell_type_annotation" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/scgpt/cell_type_annotation/cell_type_annotation b/target/executable/scgpt/cell_type_annotation/cell_type_annotation new file mode 100755 index 00000000..6bb7f0dc --- /dev/null +++ b/target/executable/scgpt/cell_type_annotation/cell_type_annotation @@ -0,0 +1,1785 @@ +#!/usr/bin/env bash + +# cell_type_annotation main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Jakub Majercik (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cell_type_annotation" +VIASH_META_FUNCTIONALITY_NAME="cell_type_annotation" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:23.09-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scgpt==0.2.1" + +LABEL org.opencontainers.image.authors="Dorien Roosen, Jakub Majercik" +LABEL org.opencontainers.image.description="Companion container for running component scgpt cell_type_annotation" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cell_type_annotation main" + echo "" + echo "Annotate gene expression data with cell type classes through the scGPT model." + echo "" + echo "Model input:" + echo " --model" + echo " type: file, required parameter, file must exist" + echo " example: best_model.pt" + echo " The model file containing checkpoints and cell type label mapper." + echo "" + echo " --model_config" + echo " type: file, required parameter, file must exist" + echo " example: args.json" + echo " The model configuration file." + echo "" + echo " --model_vocab" + echo " type: file, required parameter, file must exist" + echo " example: vocab.json" + echo " Model vocabulary file directory." + echo "" + echo " --finetuned_checkpoints_key" + echo " type: string" + echo " default: model_state_dict" + echo " Key in the model file containing the pretrained checkpoints." + echo "" + echo " --label_mapper_key" + echo " type: string" + echo " default: id_to_class" + echo " Key in the model file containing the cell type class to label mapper" + echo " dictionary." + echo "" + echo "Query input:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: scgpt_preprocess_ouput.h5mu" + echo " The input h5mu file containing of data that have been pre-processed" + echo " (normalized, binned, genes cross-checked and tokenized)." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --obs_batch_label" + echo " type: string" + echo " The name of the adata.obs column containing the batch labels. Required" + echo " if dsbn is set to true." + echo "" + echo " --obsm_gene_tokens" + echo " type: string" + echo " default: gene_id_tokens" + echo " The key of the .obsm array containing the gene token ids" + echo "" + echo " --obsm_tokenized_values" + echo " type: string" + echo " default: values_tokenized" + echo " The key of the .obsm array containing the count values of the tokenized" + echo " genes" + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " The output mudata file." + echo "" + echo " --output_obs_predictions" + echo " type: string" + echo " default: scgpt_pred" + echo " The name of the adata.obs column to write predicted cell type labels to." + echo "" + echo " --output_obs_probability" + echo " type: string" + echo " default: scgpt_probability" + echo " The name of the adata.obs column to write the probabilities of the" + echo " predicted cell type labels to." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --pad_token" + echo " type: string" + echo " default: " + echo " The padding token used in the model." + echo "" + echo " --pad_value" + echo " type: integer" + echo " default: -2" + echo " The value of the padding." + echo "" + echo " --n_input_bins" + echo " type: integer" + echo " default: 51" + echo " The number of input bins." + echo "" + echo " --batch_size" + echo " type: integer" + echo " default: 64" + echo " The batch size." + echo "" + echo " --dsbn" + echo " type: boolean" + echo " default: true" + echo " Whether to use domain-specific batch normalization." + echo "" + echo " --seed" + echo " type: integer" + echo " Seed for random number generation. If not specified, no seed is used." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cell_type_annotation main" + exit + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_config) + [ -n "$VIASH_PAR_MODEL_CONFIG" ] && ViashError Bad arguments for option \'--model_config\': \'$VIASH_PAR_MODEL_CONFIG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_CONFIG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_config. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_config=*) + [ -n "$VIASH_PAR_MODEL_CONFIG" ] && ViashError Bad arguments for option \'--model_config=*\': \'$VIASH_PAR_MODEL_CONFIG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_CONFIG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_vocab) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_vocab. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_vocab=*) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab=*\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB=$(ViashRemoveFlags "$1") + shift 1 + ;; + --finetuned_checkpoints_key) + [ -n "$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY" ] && ViashError Bad arguments for option \'--finetuned_checkpoints_key\': \'$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINETUNED_CHECKPOINTS_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --finetuned_checkpoints_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --finetuned_checkpoints_key=*) + [ -n "$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY" ] && ViashError Bad arguments for option \'--finetuned_checkpoints_key=*\': \'$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINETUNED_CHECKPOINTS_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --label_mapper_key) + [ -n "$VIASH_PAR_LABEL_MAPPER_KEY" ] && ViashError Bad arguments for option \'--label_mapper_key\': \'$VIASH_PAR_LABEL_MAPPER_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LABEL_MAPPER_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --label_mapper_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --label_mapper_key=*) + [ -n "$VIASH_PAR_LABEL_MAPPER_KEY" ] && ViashError Bad arguments for option \'--label_mapper_key=*\': \'$VIASH_PAR_LABEL_MAPPER_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LABEL_MAPPER_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch_label) + [ -n "$VIASH_PAR_OBS_BATCH_LABEL" ] && ViashError Bad arguments for option \'--obs_batch_label\': \'$VIASH_PAR_OBS_BATCH_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch_label=*) + [ -n "$VIASH_PAR_OBS_BATCH_LABEL" ] && ViashError Bad arguments for option \'--obs_batch_label=*\': \'$VIASH_PAR_OBS_BATCH_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_gene_tokens) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_gene_tokens. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_gene_tokens=*) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens=*\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_tokenized_values) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_tokenized_values. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_tokenized_values=*) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values=*\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_predictions) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_predictions. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_predictions=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" ] && ViashError Bad arguments for option \'--output_obs_predictions=*\': \'$VIASH_PAR_OUTPUT_OBS_PREDICTIONS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PREDICTIONS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_obs_probability) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_obs_probability. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_obs_probability=*) + [ -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" ] && ViashError Bad arguments for option \'--output_obs_probability=*\': \'$VIASH_PAR_OUTPUT_OBS_PROBABILITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_OBS_PROBABILITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_token) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_token. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_token=*) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token=*\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_value) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_value=*) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value=*\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_input_bins) + [ -n "$VIASH_PAR_N_INPUT_BINS" ] && ViashError Bad arguments for option \'--n_input_bins\': \'$VIASH_PAR_N_INPUT_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INPUT_BINS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_input_bins. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_input_bins=*) + [ -n "$VIASH_PAR_N_INPUT_BINS" ] && ViashError Bad arguments for option \'--n_input_bins=*\': \'$VIASH_PAR_N_INPUT_BINS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_INPUT_BINS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --batch_size) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --batch_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --batch_size=*) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size=*\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dsbn) + [ -n "$VIASH_PAR_DSBN" ] && ViashError Bad arguments for option \'--dsbn\': \'$VIASH_PAR_DSBN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DSBN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dsbn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dsbn=*) + [ -n "$VIASH_PAR_DSBN" ] && ViashError Bad arguments for option \'--dsbn=*\': \'$VIASH_PAR_DSBN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DSBN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --seed) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --seed. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --seed=*) + [ -n "$VIASH_PAR_SEED" ] && ViashError Bad arguments for option \'--seed=*\': \'$VIASH_PAR_SEED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SEED=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/scgpt/cell_type_annotation:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_MODEL+x} ]; then + ViashError '--model' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then + ViashError '--model_config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then + ViashError '--model_vocab' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY+x} ]; then + VIASH_PAR_FINETUNED_CHECKPOINTS_KEY="model_state_dict" +fi +if [ -z ${VIASH_PAR_LABEL_MAPPER_KEY+x} ]; then + VIASH_PAR_LABEL_MAPPER_KEY="id_to_class" +fi +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then + VIASH_PAR_OBSM_GENE_TOKENS="gene_id_tokens" +fi +if [ -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then + VIASH_PAR_OBSM_TOKENIZED_VALUES="values_tokenized" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then + VIASH_PAR_OUTPUT_OBS_PREDICTIONS="scgpt_pred" +fi +if [ -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then + VIASH_PAR_OUTPUT_OBS_PROBABILITY="scgpt_probability" +fi +if [ -z ${VIASH_PAR_PAD_TOKEN+x} ]; then + VIASH_PAR_PAD_TOKEN="" +fi +if [ -z ${VIASH_PAR_PAD_VALUE+x} ]; then + VIASH_PAR_PAD_VALUE="-2" +fi +if [ -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then + VIASH_PAR_N_INPUT_BINS="51" +fi +if [ -z ${VIASH_PAR_BATCH_SIZE+x} ]; then + VIASH_PAR_BATCH_SIZE="64" +fi +if [ -z ${VIASH_PAR_DSBN+x} ]; then + VIASH_PAR_DSBN="true" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ] && [ ! -e "$VIASH_PAR_MODEL_CONFIG" ]; then + ViashError "Input file '$VIASH_PAR_MODEL_CONFIG' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ] && [ ! -e "$VIASH_PAR_MODEL_VOCAB" ]; then + ViashError "Input file '$VIASH_PAR_MODEL_VOCAB' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_PAD_VALUE" ]]; then + if ! [[ "$VIASH_PAR_PAD_VALUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--pad_value' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_INPUT_BINS" ]]; then + if ! [[ "$VIASH_PAR_N_INPUT_BINS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_input_bins' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BATCH_SIZE" ]]; then + if ! [[ "$VIASH_PAR_BATCH_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--batch_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DSBN" ]]; then + if ! [[ "$VIASH_PAR_DSBN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dsbn' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_SEED" ]]; then + if ! [[ "$VIASH_PAR_SEED" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--seed' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_CONFIG")" ) + VIASH_PAR_MODEL_CONFIG=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_CONFIG") +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_VOCAB")" ) + VIASH_PAR_MODEL_VOCAB=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_VOCAB") +fi +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cell_type_annotation-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import json +from multiprocessing import freeze_support +import os +import mudata as mu +from typing import Dict +import warnings +import torch +import numpy as np +from torch.nn import functional +from torch.utils.data import Dataset, DataLoader +from scgpt.model import TransformerModel +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.utils import set_seed +from tqdm import tqdm + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_config': $( if [ ! -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then echo "r'${VIASH_PAR_MODEL_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'finetuned_checkpoints_key': $( if [ ! -z ${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY+x} ]; then echo "r'${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'label_mapper_key': $( if [ ! -z ${VIASH_PAR_LABEL_MAPPER_KEY+x} ]; then echo "r'${VIASH_PAR_LABEL_MAPPER_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch_label': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_input_bins': $( if [ ! -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_INPUT_BINS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'dsbn': $( if [ ! -z ${VIASH_PAR_DSBN+x} ]; then echo "r'${VIASH_PAR_DSBN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +class SeqDataset(Dataset): + def __init__(self, data: Dict[str, torch.Tensor]): + self.data = data + + def __len__(self): + return self.data["gene_ids"].shape[0] + + def __getitem__(self, idx): + return {k: v[idx] for k, v in self.data.items()} + + +def main(): + # Setting seed + if par["seed"]: + set_seed(par["seed"]) + + # Setting device + logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Read in data + logger.info("Reading in data") + mdata = mu.read(par["input"]) + input_adata = mdata.mod[par["modality"]] + adata = input_adata.copy() + + # Fetch batch ids for domain-specific batch normalization + if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." + ) + elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) + elif not par["dsbn"]: + # forward pass requires a tensor as input + batch_ids = np.zeros(adata.shape[0]) + + # Vocabulary configuration + logger.info("Loading model vocabulary") + special_tokens = [par["pad_token"], "", ""] + logger.info(f"Loading model vocab from {par['model_vocab']}") + vocab_file = par["model_vocab"] + vocab = GeneVocab.from_file(vocab_file) + [vocab.append_token(s) for s in special_tokens if s not in vocab] + vocab.set_default_index(vocab[par["pad_token"]]) + ntokens = len(vocab) + + # Model configuration + logger.info("Loading model and configurations") + model_config_file = par["model_config"] + with open(model_config_file, "r") as f: + model_configs = json.load(f) + embsize = model_configs["embsize"] + nhead = model_configs["nheads"] + d_hid = model_configs["d_hid"] + nlayers = model_configs["nlayers"] + + # Ensure the provided model has the correct architecture + logger.info("Loading model") + model_file = par["model"] + model_dict = torch.load(model_file, map_location=device) + for k, v in { + "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], + "--label_mapper_key": par["label_mapper_key"], + }.items(): + if v not in model_dict.keys(): + raise KeyError( + f"The key '{v}' provided for '{k}' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) + pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] + + # Label mapper configuration + logger.info("Loading label mapper") + label_mapper = model_dict[par["label_mapper_key"]] + cell_type_mapper = {int(k): v for k, v in label_mapper.items()} + n_cls = len(cell_type_mapper) + + # Model instatiation + logger.info("Instantiating model") + model = TransformerModel( + ntokens, + d_model=embsize, # self.encoder (GenEncoder), self.value_encoder (ContinuousValueEncoder), self.transformerencoder(TransformerEncoderLayer) + nhead=nhead, # self.transformer_encoder(TransformerEncoderLayer) + d_hid=d_hid, # self.transformer_encoder(TransformerEncoderLayer) + nlayers=nlayers, # self.transformer_encoder(TransformerEncoderLayer), self.cls_decoder + nlayers_cls=3, # self.cls_decoder + n_cls=n_cls, # self.cls_decoder + vocab=vocab, + dropout=0.2, # self.transformer_encoder + pad_token=par["pad_token"], + pad_value=par["pad_value"], + do_mvc=False, + do_dab=False, + use_batch_labels=par["dsbn"], + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", + n_input_bins=par["n_input_bins"], + cell_emb_style="cls", # required for cell-type annotation + use_fast_transformer=False, # TODO: parametrize when GPU is available + fast_transformer_backend="flash", # TODO: parametrize when GPU is available + pre_norm=False, # TODO: parametrize when GPU is available + ) + + # Load model params + logger.info(f"Loading model params from {model_file}") + try: + model.load_state_dict(pretrained_dict) + except RuntimeError: + logger.info("only load params that are in the model and match the size") + model_dict = model.state_dict() + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if k in model_dict and v.shape == model_dict[k].shape + } + for k, v in pretrained_dict.items(): + logger.info(f"Loading params {k} with shape {v.shape}") + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + + model.to(device) + + # Load tokenized gene data + logger.info("Loading data for inference") + for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + }.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + + input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] + input_values = adata.obsm[par["obsm_tokenized_values"]] + + data_pt = { + "gene_ids": input_gene_ids, + "values": input_values, + "batch_labels": torch.from_numpy(batch_ids).long(), + } + + data_loader = DataLoader( + dataset=SeqDataset(data_pt), + batch_size=par["batch_size"], + num_workers=min(os.cpu_count(), par["batch_size"] // 2), + pin_memory=True, + ) + + # Inference + logger.info("Predicting cell type classes") + model.eval() + predictions = [] + probabilities = [] + with torch.no_grad(): + for batch_data in tqdm(data_loader): + input_gene_ids = batch_data["gene_ids"].to(device) + input_values = batch_data["values"].to(device) + batch_labels = batch_data["batch_labels"].to(device) + + src_key_padding_mask = input_gene_ids.eq(vocab[par["pad_token"]]) + with torch.cuda.amp.autocast(enabled=False): + output_dict = model( + input_gene_ids, + input_values, + src_key_padding_mask=src_key_padding_mask, + batch_labels=batch_labels if par["dsbn"] else None, + CLS=True, # Return celltype classification objective output + CCE=False, + MVC=False, + ECS=False, + ) + output_values = output_dict["cls_output"] + + preds = output_values.argmax(1).cpu().numpy() + predictions.append(preds) + + probs = functional.softmax(output_values, dim=1).max(1)[0] + probabilities.append(probs.cpu().numpy()) + + predictions = np.concatenate(predictions, axis=0) + probabilities = np.concatenate(probabilities, axis=0) + + # Assign cell type labels to predicted classes + logger.info("Assigning cell type predictions and probabilities") + adata.obs["scgpt_class_pred"] = predictions + adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( + lambda x: cell_type_mapper[x] + ) + adata.obs[par["output_obs_probability"]] = probabilities + + # Write output + logger.info("Writing output data") + mdata.mod[par["modality"]] = adata + mdata.write(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + freeze_support() + warnings.filterwarnings("ignore") + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ]; then + VIASH_PAR_MODEL_CONFIG=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_CONFIG") + fi + if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_PAR_MODEL_VOCAB=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_VOCAB") + fi + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/scgpt/cell_type_annotation/nextflow_labels.config b/target/executable/scgpt/cell_type_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/scgpt/cell_type_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/scgpt/cell_type_annotation/setup_logger.py b/target/executable/scgpt/cell_type_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/scgpt/cell_type_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/scgpt/cross_check_genes/.config.vsh.yaml b/target/executable/scgpt/cross_check_genes/.config.vsh.yaml new file mode 100644 index 00000000..9d01d1bb --- /dev/null +++ b/target/executable/scgpt/cross_check_genes/.config.vsh.yaml @@ -0,0 +1,349 @@ +name: "cross_check_genes" +namespace: "scgpt" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing of pre-processed data.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The modality key of the MuData object containing the RNA AnnData\ + \ object.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vocab_file" + description: "Model vocabulary file path.\n" + info: null + example: + - "resources_test/scgpt/vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata.var column containing gene names. By default\ + \ the .var index will be used.\n" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. If provided, will\ + \ only cross-check HVG filtered genes with model vocabulary." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output cross-checked anndata file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ observations should be filtered out based on HVG and model vocabulary." + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The padding token used in the model.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cross-check genes with pre-trained scGPT model.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_preprocessed.h5mu" +- type: "file" + path: "vocab.json" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/cross_check_genes/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/scgpt/cross_check_genes" + executable: "target/executable/scgpt/cross_check_genes/cross_check_genes" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/scgpt/cross_check_genes/cross_check_genes b/target/executable/scgpt/cross_check_genes/cross_check_genes new file mode 100755 index 00000000..422fe07b --- /dev/null +++ b/target/executable/scgpt/cross_check_genes/cross_check_genes @@ -0,0 +1,1336 @@ +#!/usr/bin/env bash + +# cross_check_genes main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Jakub Majercik (author) +# * Dorien Roosen (maintainer, author) +# * Elizabeth Mlynarski (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="cross_check_genes" +VIASH_META_FUNCTIONALITY_NAME="cross_check_genes" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:23.09-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scgpt==0.2.1" + +LABEL org.opencontainers.image.authors="Jakub Majercik, Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component scgpt cross_check_genes" +LABEL org.opencontainers.image.created="2025-08-22T07:23:17Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "cross_check_genes main" + echo "" + echo "Cross-check genes with pre-trained scGPT model." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input h5mu file containing of pre-processed data." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " The modality key of the MuData object containing the RNA AnnData object." + echo "" + echo " --vocab_file" + echo " type: file, required parameter, file must exist" + echo " example: resources_test/scgpt/vocab.json" + echo " Model vocabulary file path." + echo "" + echo " --input_var_gene_names" + echo " type: string" + echo " example: gene_name" + echo " The name of the adata.var column containing gene names. By default the" + echo " .var index will be used." + echo "" + echo " --var_input" + echo " type: string" + echo " .var column containing highly variable genes. If provided, will only" + echo " cross-check HVG filtered genes with model vocabulary." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " The output cross-checked anndata file." + echo "" + echo " --output_var_filter" + echo " type: string" + echo " default: id_in_vocab" + echo " In which .var slot to store a boolean array corresponding to which" + echo " observations should be filtered out based on HVG and model vocabulary." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --pad_token" + echo " type: string" + echo " default: " + echo " The padding token used in the model." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "cross_check_genes main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --vocab_file) + [ -n "$VIASH_PAR_VOCAB_FILE" ] && ViashError Bad arguments for option \'--vocab_file\': \'$VIASH_PAR_VOCAB_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VOCAB_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --vocab_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --vocab_file=*) + [ -n "$VIASH_PAR_VOCAB_FILE" ] && ViashError Bad arguments for option \'--vocab_file=*\': \'$VIASH_PAR_VOCAB_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VOCAB_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_var_gene_names) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_var_gene_names=*) + [ -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--input_var_gene_names=*\': \'$VIASH_PAR_INPUT_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_var_filter) + [ -n "$VIASH_PAR_OUTPUT_VAR_FILTER" ] && ViashError Bad arguments for option \'--output_var_filter\': \'$VIASH_PAR_OUTPUT_VAR_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_FILTER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_var_filter. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_var_filter=*) + [ -n "$VIASH_PAR_OUTPUT_VAR_FILTER" ] && ViashError Bad arguments for option \'--output_var_filter=*\': \'$VIASH_PAR_OUTPUT_VAR_FILTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_VAR_FILTER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_token) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_token. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_token=*) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token=*\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/scgpt/cross_check_genes:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_VOCAB_FILE+x} ]; then + ViashError '--vocab_file' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OUTPUT_VAR_FILTER+x} ]; then + VIASH_PAR_OUTPUT_VAR_FILTER="id_in_vocab" +fi +if [ -z ${VIASH_PAR_PAD_TOKEN+x} ]; then + VIASH_PAR_PAD_TOKEN="" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_VOCAB_FILE" ] && [ ! -e "$VIASH_PAR_VOCAB_FILE" ]; then + ViashError "Input file '$VIASH_PAR_VOCAB_FILE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_VOCAB_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_VOCAB_FILE")" ) + VIASH_PAR_VOCAB_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_VOCAB_FILE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-cross_check_genes-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +from scgpt.tokenizer.gene_tokenizer import GeneVocab + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'vocab_file': $( if [ ! -z ${VIASH_PAR_VOCAB_FILE+x} ]; then echo "r'${VIASH_PAR_VOCAB_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_var_filter': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_FILTER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_FILTER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# Read in data +logger.info(f"Reading {par['input']}") +mudata = mu.read_h5mu(par["input"]) +adata = mudata.mod[par["modality"]].copy() + +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +if not par["input_var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +elif par["input_var_gene_names"] not in adata.var.columns: + raise ValueError( + f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." + ) +else: + genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() + +# Cross-check genes with pre-trained model +logger.info(f"Loading model vocab from {par['vocab_file']}") +vocab_file = par["vocab_file"] +vocab = GeneVocab.from_file(vocab_file) +[vocab.append_token(s) for s in special_tokens if s not in vocab] + +if par["var_input"]: + logger.info("Filtering genes based on model vocab and HVG") + filter_with_hvg = adata.var[par["var_input"]].tolist() + gene_filter_mask = [ + 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) + ] + logger.info( + f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" + ) +else: + logger.info("Filtering genes based on model vocab") + gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] + logger.info( + f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" + ) + +logger.info(f"Writing to {par['output']}") +adata.var[par["output_var_filter"]] = gene_filter_mask +adata.var[par["output_var_filter"]] = adata.var[par["output_var_filter"]].astype("bool") +mudata.mod[par["modality"]] = adata +mudata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_VOCAB_FILE" ]; then + VIASH_PAR_VOCAB_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_VOCAB_FILE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/scgpt/cross_check_genes/nextflow_labels.config b/target/executable/scgpt/cross_check_genes/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/scgpt/cross_check_genes/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/scgpt/cross_check_genes/setup_logger.py b/target/executable/scgpt/cross_check_genes/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/scgpt/cross_check_genes/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/scgpt/embedding/.config.vsh.yaml b/target/executable/scgpt/embedding/.config.vsh.yaml new file mode 100644 index 00000000..28c17d4f --- /dev/null +++ b/target/executable/scgpt/embedding/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "embedding" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing tokenized gene and count data. \n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model" + description: "Path to scGPT model file.\n" + info: null + example: + - "best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Path to scGPT model vocabulary file.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "Path to scGPT model config file.\n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + example: + - "values.pt" + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_padding_mask" + description: "The key of the .obsm array containing the padding mask.\n" + info: null + default: + - "padding_mask" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the .var column containing gene names. When no gene_name_layer\ + \ is provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_label" + description: "The name of the adata.obs column containing the batch labels. Must\ + \ be provided when 'dsbn' is set to True.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pretrained checkpoints. Only\ + \ relevant for fine-tuned models.\n" + info: null + example: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Path to output anndata file containing pre-processed data as well\ + \ as scGPT embeddings.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_embeddings" + description: "The name of the adata.obsm array to which scGPT embeddings will\ + \ be written.\n" + info: null + default: + - "X_scGPT" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The token to be used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--dsbn" + description: "Whether to apply domain-specific batch normalization for generating\ + \ embeddings. When set to True, 'obs_batch_labels' must be set as well.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size to be used for inference\n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generation of cell embeddings for the integration of single cell transcriptomic\ + \ count data using scGPT.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "source" +- type: "file" + path: "finetuned_model" +- type: "file" + path: "Kim2020_Lung_subset_tokenized.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/embedding/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/scgpt/embedding" + executable: "target/executable/scgpt/embedding/embedding" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/scgpt/embedding/embedding b/target/executable/scgpt/embedding/embedding new file mode 100755 index 00000000..048f926f --- /dev/null +++ b/target/executable/scgpt/embedding/embedding @@ -0,0 +1,1667 @@ +#!/usr/bin/env bash + +# embedding main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Elizabeth Mlynarski (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="embedding" +VIASH_META_FUNCTIONALITY_NAME="embedding" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:23.09-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scgpt==0.2.1" + +LABEL org.opencontainers.image.authors="Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component scgpt embedding" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "embedding main" + echo "" + echo "Generation of cell embeddings for the integration of single cell transcriptomic" + echo "count data using scGPT." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input h5mu file containing tokenized gene and count data." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --model" + echo " type: file, required parameter, file must exist" + echo " example: best_model.pt" + echo " Path to scGPT model file." + echo "" + echo " --model_vocab" + echo " type: file, required parameter, file must exist" + echo " example: vocab.json" + echo " Path to scGPT model vocabulary file." + echo "" + echo " --model_config" + echo " type: file, required parameter, file must exist" + echo " example: args.json" + echo " Path to scGPT model config file." + echo "" + echo " --obsm_gene_tokens" + echo " type: string" + echo " default: gene_id_tokens" + echo " example: values.pt" + echo " The key of the .obsm array containing the gene token ids" + echo "" + echo " --obsm_tokenized_values" + echo " type: string" + echo " default: values_tokenized" + echo " The key of the .obsm array containing the count values of the tokenized" + echo " genes" + echo "" + echo " --obsm_padding_mask" + echo " type: string" + echo " default: padding_mask" + echo " The key of the .obsm array containing the padding mask." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " The name of the .var column containing gene names. When no" + echo " gene_name_layer is provided, the .var index will be used." + echo "" + echo " --obs_batch_label" + echo " type: string" + echo " The name of the adata.obs column containing the batch labels. Must be" + echo " provided when 'dsbn' is set to True." + echo "" + echo " --finetuned_checkpoints_key" + echo " type: string" + echo " example: model_state_dict" + echo " Key in the model file containing the pretrained checkpoints. Only" + echo " relevant for fine-tuned models." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Path to output anndata file containing pre-processed data as well as" + echo " scGPT embeddings." + echo "" + echo " --obsm_embeddings" + echo " type: string" + echo " default: X_scGPT" + echo " The name of the adata.obsm array to which scGPT embeddings will be" + echo " written." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --pad_token" + echo " type: string" + echo " default: " + echo " The token to be used for padding." + echo "" + echo " --pad_value" + echo " type: integer" + echo " default: -2" + echo " The value of the padding token." + echo "" + echo " --dsbn" + echo " type: boolean" + echo " default: true" + echo " Whether to apply domain-specific batch normalization for generating" + echo " embeddings. When set to True, 'obs_batch_labels' must be set as well." + echo "" + echo " --batch_size" + echo " type: integer" + echo " default: 64" + echo " The batch size to be used for inference" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "embedding main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model=*) + [ -n "$VIASH_PAR_MODEL" ] && ViashError Bad arguments for option \'--model=*\': \'$VIASH_PAR_MODEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_vocab) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_vocab. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_vocab=*) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab=*\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_config) + [ -n "$VIASH_PAR_MODEL_CONFIG" ] && ViashError Bad arguments for option \'--model_config\': \'$VIASH_PAR_MODEL_CONFIG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_CONFIG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_config. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_config=*) + [ -n "$VIASH_PAR_MODEL_CONFIG" ] && ViashError Bad arguments for option \'--model_config=*\': \'$VIASH_PAR_MODEL_CONFIG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_CONFIG=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_gene_tokens) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_gene_tokens. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_gene_tokens=*) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens=*\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_tokenized_values) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_tokenized_values. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_tokenized_values=*) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values=*\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_padding_mask) + [ -n "$VIASH_PAR_OBSM_PADDING_MASK" ] && ViashError Bad arguments for option \'--obsm_padding_mask\': \'$VIASH_PAR_OBSM_PADDING_MASK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PADDING_MASK="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_padding_mask. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_padding_mask=*) + [ -n "$VIASH_PAR_OBSM_PADDING_MASK" ] && ViashError Bad arguments for option \'--obsm_padding_mask=*\': \'$VIASH_PAR_OBSM_PADDING_MASK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PADDING_MASK=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_batch_label) + [ -n "$VIASH_PAR_OBS_BATCH_LABEL" ] && ViashError Bad arguments for option \'--obs_batch_label\': \'$VIASH_PAR_OBS_BATCH_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_LABEL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_batch_label. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_batch_label=*) + [ -n "$VIASH_PAR_OBS_BATCH_LABEL" ] && ViashError Bad arguments for option \'--obs_batch_label=*\': \'$VIASH_PAR_OBS_BATCH_LABEL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBS_BATCH_LABEL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --finetuned_checkpoints_key) + [ -n "$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY" ] && ViashError Bad arguments for option \'--finetuned_checkpoints_key\': \'$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINETUNED_CHECKPOINTS_KEY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --finetuned_checkpoints_key. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --finetuned_checkpoints_key=*) + [ -n "$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY" ] && ViashError Bad arguments for option \'--finetuned_checkpoints_key=*\': \'$VIASH_PAR_FINETUNED_CHECKPOINTS_KEY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FINETUNED_CHECKPOINTS_KEY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_embeddings) + [ -n "$VIASH_PAR_OBSM_EMBEDDINGS" ] && ViashError Bad arguments for option \'--obsm_embeddings\': \'$VIASH_PAR_OBSM_EMBEDDINGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_EMBEDDINGS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_embeddings. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_embeddings=*) + [ -n "$VIASH_PAR_OBSM_EMBEDDINGS" ] && ViashError Bad arguments for option \'--obsm_embeddings=*\': \'$VIASH_PAR_OBSM_EMBEDDINGS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_EMBEDDINGS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_token) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_token. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_token=*) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token=*\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_value) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_value=*) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value=*\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --dsbn) + [ -n "$VIASH_PAR_DSBN" ] && ViashError Bad arguments for option \'--dsbn\': \'$VIASH_PAR_DSBN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DSBN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --dsbn. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --dsbn=*) + [ -n "$VIASH_PAR_DSBN" ] && ViashError Bad arguments for option \'--dsbn=*\': \'$VIASH_PAR_DSBN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_DSBN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --batch_size) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --batch_size. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --batch_size=*) + [ -n "$VIASH_PAR_BATCH_SIZE" ] && ViashError Bad arguments for option \'--batch_size=*\': \'$VIASH_PAR_BATCH_SIZE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BATCH_SIZE=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/scgpt/embedding:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL+x} ]; then + ViashError '--model' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then + ViashError '--model_vocab' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then + ViashError '--model_config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then + VIASH_PAR_OBSM_GENE_TOKENS="gene_id_tokens" +fi +if [ -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then + VIASH_PAR_OBSM_TOKENIZED_VALUES="values_tokenized" +fi +if [ -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then + VIASH_PAR_OBSM_PADDING_MASK="padding_mask" +fi +if [ -z ${VIASH_PAR_OBSM_EMBEDDINGS+x} ]; then + VIASH_PAR_OBSM_EMBEDDINGS="X_scGPT" +fi +if [ -z ${VIASH_PAR_PAD_TOKEN+x} ]; then + VIASH_PAR_PAD_TOKEN="" +fi +if [ -z ${VIASH_PAR_PAD_VALUE+x} ]; then + VIASH_PAR_PAD_VALUE="-2" +fi +if [ -z ${VIASH_PAR_DSBN+x} ]; then + VIASH_PAR_DSBN="true" +fi +if [ -z ${VIASH_PAR_BATCH_SIZE+x} ]; then + VIASH_PAR_BATCH_SIZE="64" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL" ] && [ ! -e "$VIASH_PAR_MODEL" ]; then + ViashError "Input file '$VIASH_PAR_MODEL' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ] && [ ! -e "$VIASH_PAR_MODEL_VOCAB" ]; then + ViashError "Input file '$VIASH_PAR_MODEL_VOCAB' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ] && [ ! -e "$VIASH_PAR_MODEL_CONFIG" ]; then + ViashError "Input file '$VIASH_PAR_MODEL_CONFIG' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_PAD_VALUE" ]]; then + if ! [[ "$VIASH_PAR_PAD_VALUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--pad_value' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_DSBN" ]]; then + if ! [[ "$VIASH_PAR_DSBN" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--dsbn' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_BATCH_SIZE" ]]; then + if ! [[ "$VIASH_PAR_BATCH_SIZE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--batch_size' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL")" ) + VIASH_PAR_MODEL=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL") +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_VOCAB")" ) + VIASH_PAR_MODEL_VOCAB=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_VOCAB") +fi +if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_CONFIG")" ) + VIASH_PAR_MODEL_CONFIG=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_CONFIG") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-embedding-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import numpy as np +import mudata as mu +import json +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.model import TransformerModel +from scgpt.utils.util import load_pretrained +import torch + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_config': $( if [ ! -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then echo "r'${VIASH_PAR_MODEL_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_padding_mask': $( if [ ! -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then echo "r'${VIASH_PAR_OBSM_PADDING_MASK//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_batch_label': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_LABEL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'finetuned_checkpoints_key': $( if [ ! -z ${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY+x} ]; then echo "r'${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_embeddings': $( if [ ! -z ${VIASH_PAR_OBSM_EMBEDDINGS+x} ]; then echo "r'${VIASH_PAR_OBSM_EMBEDDINGS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'dsbn': $( if [ ! -z ${VIASH_PAR_DSBN+x} ]; then echo "r'${VIASH_PAR_DSBN//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + "--obsm_padding_mask": par["obsm_padding_mask"], +}.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + +all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] +all_values = adata.obsm[par["obsm_tokenized_values"]] +padding_mask = adata.obsm[par["obsm_padding_mask"]] + +# Fetch batch ids for domain-specific batch normalization +if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." + ) +elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) +elif not par["dsbn"] and par["obs_batch_label"]: + logger.info( + "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." + ) + +# Set padding specs +logger.info("Setting padding specs") +pad_token = par["pad_token"] +pad_value = par["pad_value"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +logger.info("Fetching gene names") +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Model files +logger.info("Loading model, vocab and configs") +model_config_file = par["model_config"] +model_file = par["model"] +vocab_file = par["model_vocab"] + +# Load vocab +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Load model configs +with open(model_config_file, "r") as f: + model_configs = json.load(f) +embsize = model_configs["embsize"] +nhead = model_configs["nheads"] +d_hid = model_configs["d_hid"] +nlayers = model_configs["nlayers"] + +# Instantiate model +logger.info("Initializing transformer model") +model = TransformerModel( + ntokens, + d_model=embsize, + nhead=nhead, + d_hid=d_hid, + nlayers=nlayers, + vocab=vocab, + dropout=0.5, # scGPT default, only relevant for fine-tuning applications + pad_token=pad_token, + pad_value=pad_value, + nlayers_cls=3, # only applicable for decoder-based operations + n_cls=1, # only applicable for decoder-based operations + do_mvc=False, # only applicable for decoder-based operations + ecs_threshold=0.8, # only applicable for decoder-based operations + do_dab=False, # only applicable for decoder-based operations + use_batch_labels=False, # only applicable for decoder-based operations + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", # scGPT default + explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported + use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported + # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported + pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported +) + + +logger.info("Loading model") +model_file = par["model"] +model_dict = torch.load(model_file, map_location=device) + +# Ensure the provided model has the correct architecture +finetuned_checkpoints_key = par.get("finetuned_checkpoints_key") +if finetuned_checkpoints_key: + try: + model_dict = model_dict[finetuned_checkpoints_key] + except KeyError as e: + raise ValueError( + f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) from e + +# Load model +load_pretrained(model, model_dict, verbose=False) + +# Embed tokenized data +logger.info("Converting tokenized input data to embeddings") +model.to(device) +model.eval() + +cell_embeddings = model.encode_batch( + torch.from_numpy(all_gene_ids), + torch.from_numpy(all_values).float(), + src_key_padding_mask=torch.from_numpy(padding_mask), + batch_size=par["batch_size"], + batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, + output_to_cpu=True, + time_step=0, + return_np=True, +) + +cell_embeddings = cell_embeddings / np.linalg.norm( + cell_embeddings, axis=1, keepdims=True +) + +# Write output +logger.info("Writing output data") +adata.obsm[par["obsm_embeddings"]] = cell_embeddings +mdata.mod[par["modality"]] = adata +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL" ]; then + VIASH_PAR_MODEL=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL") + fi + if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_PAR_MODEL_VOCAB=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_VOCAB") + fi + if [ ! -z "$VIASH_PAR_MODEL_CONFIG" ]; then + VIASH_PAR_MODEL_CONFIG=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_CONFIG") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/scgpt/embedding/nextflow_labels.config b/target/executable/scgpt/embedding/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/scgpt/embedding/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/scgpt/embedding/setup_logger.py b/target/executable/scgpt/embedding/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/scgpt/embedding/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/scgpt/pad_tokenize/.config.vsh.yaml b/target/executable/scgpt/pad_tokenize/.config.vsh.yaml new file mode 100644 index 00000000..8b61bd7e --- /dev/null +++ b/target/executable/scgpt/pad_tokenize/.config.vsh.yaml @@ -0,0 +1,387 @@ +name: "pad_tokenize" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file of pre-processed data.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Path to model vocabulary file.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the .var column containing gene names. When no gene_name_layer\ + \ is provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "The name of the adata.var column containing boolean mask for vocabulary-cross\ + \ checked and/or highly variable genes.\n" + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_binned_counts" + description: "The name of the .obsm field containing the binned counts to be padded\ + \ and tokenized.\n" + info: null + default: + - "binned_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output h5mu file containing obsm arrays for gene tokens, tokenized\ + \ data and padding mask.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + example: + - "values.pt" + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_padding_mask" + description: "The key of the .obsm array containing the padding mask.\n" + info: null + default: + - "padding_mask" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "Token used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_seq_len" + description: "The maximum sequence length of the tokenized data. Defaults to the\ + \ number of features if not provided.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Tokenize and pad a batch of data for scGPT integration zero-shot inference\ + \ or fine-tuning.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scgpt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + - "ipython~=8.5.0" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/pad_tokenize/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/scgpt/pad_tokenize" + executable: "target/executable/scgpt/pad_tokenize/pad_tokenize" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/scgpt/pad_tokenize/nextflow_labels.config b/target/executable/scgpt/pad_tokenize/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/scgpt/pad_tokenize/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/scgpt/pad_tokenize/pad_tokenize b/target/executable/scgpt/pad_tokenize/pad_tokenize new file mode 100755 index 00000000..1edcc820 --- /dev/null +++ b/target/executable/scgpt/pad_tokenize/pad_tokenize @@ -0,0 +1,1484 @@ +#!/usr/bin/env bash + +# pad_tokenize main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Elizabeth Mlynarski (author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="pad_tokenize" +VIASH_META_FUNCTIONALITY_NAME="pad_tokenize" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM nvcr.io/nvidia/pytorch:23.09-py3 +ENTRYPOINT [] +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "scgpt==0.2.1" "ipython~=8.5.0" + +LABEL org.opencontainers.image.authors="Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component scgpt pad_tokenize" +LABEL org.opencontainers.image.created="2025-08-22T07:23:16Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "pad_tokenize main" + echo "" + echo "Tokenize and pad a batch of data for scGPT integration zero-shot inference or" + echo "fine-tuning." + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " The input h5mu file of pre-processed data." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --model_vocab" + echo " type: file, required parameter, file must exist" + echo " example: vocab.json" + echo " Path to model vocabulary file." + echo "" + echo " --var_gene_names" + echo " type: string" + echo " The name of the .var column containing gene names. When no" + echo " gene_name_layer is provided, the .var index will be used." + echo "" + echo " --var_input" + echo " type: string" + echo " default: id_in_vocab" + echo " The name of the adata.var column containing boolean mask for" + echo " vocabulary-cross checked and/or highly variable genes." + echo "" + echo " --input_obsm_binned_counts" + echo " type: string" + echo " default: binned_counts" + echo " The name of the .obsm field containing the binned counts to be padded" + echo " and tokenized." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " The output h5mu file containing obsm arrays for gene tokens, tokenized" + echo " data and padding mask." + echo "" + echo " --obsm_gene_tokens" + echo " type: string" + echo " default: gene_id_tokens" + echo " example: values.pt" + echo " The key of the .obsm array containing the gene token ids" + echo "" + echo " --obsm_tokenized_values" + echo " type: string" + echo " default: values_tokenized" + echo " The key of the .obsm array containing the count values of the tokenized" + echo " genes" + echo "" + echo " --obsm_padding_mask" + echo " type: string" + echo " default: padding_mask" + echo " The key of the .obsm array containing the padding mask." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Arguments:" + echo " --pad_token" + echo " type: string" + echo " default: " + echo " Token used for padding." + echo "" + echo " --pad_value" + echo " type: integer" + echo " default: -2" + echo " The value of the padding token." + echo "" + echo " --max_seq_len" + echo " type: integer" + echo " The maximum sequence length of the tokenized data. Defaults to the" + echo " number of features if not provided." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "pad_tokenize main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --model_vocab) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --model_vocab. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --model_vocab=*) + [ -n "$VIASH_PAR_MODEL_VOCAB" ] && ViashError Bad arguments for option \'--model_vocab=*\': \'$VIASH_PAR_MODEL_VOCAB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODEL_VOCAB=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_gene_names) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_gene_names. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_gene_names=*) + [ -n "$VIASH_PAR_VAR_GENE_NAMES" ] && ViashError Bad arguments for option \'--var_gene_names=*\': \'$VIASH_PAR_VAR_GENE_NAMES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_GENE_NAMES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --var_input) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --var_input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --var_input=*) + [ -n "$VIASH_PAR_VAR_INPUT" ] && ViashError Bad arguments for option \'--var_input=*\': \'$VIASH_PAR_VAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_VAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_obsm_binned_counts) + [ -n "$VIASH_PAR_INPUT_OBSM_BINNED_COUNTS" ] && ViashError Bad arguments for option \'--input_obsm_binned_counts\': \'$VIASH_PAR_INPUT_OBSM_BINNED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_BINNED_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_obsm_binned_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_obsm_binned_counts=*) + [ -n "$VIASH_PAR_INPUT_OBSM_BINNED_COUNTS" ] && ViashError Bad arguments for option \'--input_obsm_binned_counts=*\': \'$VIASH_PAR_INPUT_OBSM_BINNED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_OBSM_BINNED_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_gene_tokens) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_gene_tokens. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_gene_tokens=*) + [ -n "$VIASH_PAR_OBSM_GENE_TOKENS" ] && ViashError Bad arguments for option \'--obsm_gene_tokens=*\': \'$VIASH_PAR_OBSM_GENE_TOKENS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_GENE_TOKENS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_tokenized_values) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_tokenized_values. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_tokenized_values=*) + [ -n "$VIASH_PAR_OBSM_TOKENIZED_VALUES" ] && ViashError Bad arguments for option \'--obsm_tokenized_values=*\': \'$VIASH_PAR_OBSM_TOKENIZED_VALUES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_TOKENIZED_VALUES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obsm_padding_mask) + [ -n "$VIASH_PAR_OBSM_PADDING_MASK" ] && ViashError Bad arguments for option \'--obsm_padding_mask\': \'$VIASH_PAR_OBSM_PADDING_MASK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PADDING_MASK="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obsm_padding_mask. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obsm_padding_mask=*) + [ -n "$VIASH_PAR_OBSM_PADDING_MASK" ] && ViashError Bad arguments for option \'--obsm_padding_mask=*\': \'$VIASH_PAR_OBSM_PADDING_MASK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OBSM_PADDING_MASK=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_token) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_token. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_token=*) + [ -n "$VIASH_PAR_PAD_TOKEN" ] && ViashError Bad arguments for option \'--pad_token=*\': \'$VIASH_PAR_PAD_TOKEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_TOKEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --pad_value) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pad_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pad_value=*) + [ -n "$VIASH_PAR_PAD_VALUE" ] && ViashError Bad arguments for option \'--pad_value=*\': \'$VIASH_PAR_PAD_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PAD_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_seq_len) + [ -n "$VIASH_PAR_MAX_SEQ_LEN" ] && ViashError Bad arguments for option \'--max_seq_len\': \'$VIASH_PAR_MAX_SEQ_LEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_SEQ_LEN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_seq_len. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_seq_len=*) + [ -n "$VIASH_PAR_MAX_SEQ_LEN" ] && ViashError Bad arguments for option \'--max_seq_len=*\': \'$VIASH_PAR_MAX_SEQ_LEN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_SEQ_LEN=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/scgpt/pad_tokenize:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then + ViashError '--model_vocab' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_VAR_INPUT+x} ]; then + VIASH_PAR_VAR_INPUT="id_in_vocab" +fi +if [ -z ${VIASH_PAR_INPUT_OBSM_BINNED_COUNTS+x} ]; then + VIASH_PAR_INPUT_OBSM_BINNED_COUNTS="binned_counts" +fi +if [ -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then + VIASH_PAR_OBSM_GENE_TOKENS="gene_id_tokens" +fi +if [ -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then + VIASH_PAR_OBSM_TOKENIZED_VALUES="values_tokenized" +fi +if [ -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then + VIASH_PAR_OBSM_PADDING_MASK="padding_mask" +fi +if [ -z ${VIASH_PAR_PAD_TOKEN+x} ]; then + VIASH_PAR_PAD_TOKEN="" +fi +if [ -z ${VIASH_PAR_PAD_VALUE+x} ]; then + VIASH_PAR_PAD_VALUE="-2" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ] && [ ! -e "$VIASH_PAR_MODEL_VOCAB" ]; then + ViashError "Input file '$VIASH_PAR_MODEL_VOCAB' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_PAD_VALUE" ]]; then + if ! [[ "$VIASH_PAR_PAD_VALUE" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--pad_value' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MAX_SEQ_LEN" ]]; then + if ! [[ "$VIASH_PAR_MAX_SEQ_LEN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--max_seq_len' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_MODEL_VOCAB")" ) + VIASH_PAR_MODEL_VOCAB=$(ViashDockerAutodetectMount "$VIASH_PAR_MODEL_VOCAB") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-pad_tokenize-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata as mu +import numpy as np +from scipy.sparse import issparse +from scgpt.tokenizer import tokenize_and_pad_batch +from scgpt.tokenizer.gene_tokenizer import GeneVocab + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_obsm_binned_counts': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_BINNED_COUNTS+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_BINNED_COUNTS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obsm_padding_mask': $( if [ ! -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then echo "r'${VIASH_PAR_OBSM_PADDING_MASK//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'max_seq_len': $( if [ ! -z ${VIASH_PAR_MAX_SEQ_LEN+x} ]; then echo "int(r'${VIASH_PAR_MAX_SEQ_LEN//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +adata = subset_vars(adata, par["var_input"]) + +# Set padding specs +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] +pad_value = -2 + +logger.info("Fetching counts and gene names") +# Fetch counts +all_counts = ( + adata.obsm[par["input_obsm_binned_counts"]].toarray() + if issparse(adata.obsm[par["input_obsm_binned_counts"]]) + else adata.obsm[par["input_obsm_binned_counts"]] +) + +# Fetching gene names +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Fetch gene names and look up tokens in vocab +logger.info("Reading in vocab and fetching gene tokens") +vocab_file = par["model_vocab"] +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Fetch max seq len +if not par["max_seq_len"]: + max_seq_len = adata.var.shape[0] + 1 +else: + max_seq_len = par["max_seq_len"] + +# Tokenize and pad data +logger.info( + f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." +) +tokenized_data = tokenize_and_pad_batch( + all_counts, + gene_ids, + max_len=max_seq_len, + vocab=vocab, + pad_token=pad_token, + pad_value=pad_value, + append_cls=True, # append token at the beginning, + include_zero_gene=False, + return_pt=True, + mod_type=None, + vocab_mod=None, +) + +all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] +padding_mask = all_gene_ids.eq(vocab[pad_token]) + +logger.info("Writing output data") +input_adata.obsm[par["obsm_gene_tokens"]] = all_gene_ids.numpy() +input_adata.obsm[par["obsm_tokenized_values"]] = all_values.numpy() +input_adata.obsm[par["obsm_padding_mask"]] = padding_mask.numpy() + +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_MODEL_VOCAB" ]; then + VIASH_PAR_MODEL_VOCAB=$(ViashDockerStripAutomount "$VIASH_PAR_MODEL_VOCAB") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/scgpt/pad_tokenize/setup_logger.py b/target/executable/scgpt/pad_tokenize/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/scgpt/pad_tokenize/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/scgpt/pad_tokenize/subset_vars.py b/target/executable/scgpt/pad_tokenize/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/executable/scgpt/pad_tokenize/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/executable/transform/bpcells_regress_out/.config.vsh.yaml b/target/executable/transform/bpcells_regress_out/.config.vsh.yaml new file mode 100644 index 00000000..6483ca3d --- /dev/null +++ b/target/executable/transform/bpcells_regress_out/.config.vsh.yaml @@ -0,0 +1,327 @@ +name: "bpcells_regress_out" +namespace: "transform" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The modality to run this component on." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_keys" + description: "The .obs keys to regress on." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object to regress on.\nIf not provided, the\ + \ X attribute of the adata object will be used.\n" + info: null + example: + - "X_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "The layer of the adata object containing the regressed count data.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "X_regressed" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Regress out the effects of confounding variables using a linear least\ + \ squares regression model with BPCells.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "r" + cran: + - "anndata" + - "reticulate" + github: + - "bnprks/BPCells/r" + bioc_force_install: false + warnings_as_errors: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/bpcells_regress_out/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/bpcells_regress_out" + executable: "target/executable/transform/bpcells_regress_out/bpcells_regress_out" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/bpcells_regress_out/bpcells_regress_out b/target/executable/transform/bpcells_regress_out/bpcells_regress_out new file mode 100755 index 00000000..38d6a2ee --- /dev/null +++ b/target/executable/transform/bpcells_regress_out/bpcells_regress_out @@ -0,0 +1,1306 @@ +#!/usr/bin/env bash + +# bpcells_regress_out main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dorien Roosen (maintainer, author) +# * Robrecht Cannoodt (contributor, author) +# * Weiwei Schultz (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="bpcells_regress_out" +VIASH_META_FUNCTIONALITY_NAME="bpcells_regress_out" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:22.04 +ENTRYPOINT [] +ENV RETICULATE_PYTHON=/usr/bin/python +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev python3 python3-pip python3-dev python-is-python3 && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("anndata", "reticulate"), repos = "https://cran.rstudio.com")' && \ + Rscript -e 'options(warn = 2); remotes::install_github(c("bnprks/BPCells/r"), repos = "https://cran.rstudio.com")' + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dorien Roosen, Robrecht Cannoodt, Weiwei Schultz" +LABEL org.opencontainers.image.description="Companion container for running component transform bpcells_regress_out" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "bpcells_regress_out main" + echo "" + echo "Regress out the effects of confounding variables using a linear least squares" + echo "regression model with BPCells." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " The modality to run this component on." + echo "" + echo " --obs_keys" + echo " type: string, multiple values allowed" + echo " The .obs keys to regress on." + echo "" + echo " --input_layer" + echo " type: string" + echo " example: X_normalized" + echo " The layer of the adata object to regress on." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --output_layer" + echo " type: string" + echo " example: X_regressed" + echo " The layer of the adata object containing the regressed count data." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "bpcells_regress_out main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_keys) + if [ -z "$VIASH_PAR_OBS_KEYS" ]; then + VIASH_PAR_OBS_KEYS="$2" + else + VIASH_PAR_OBS_KEYS="$VIASH_PAR_OBS_KEYS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_keys. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_keys=*) + if [ -z "$VIASH_PAR_OBS_KEYS" ]; then + VIASH_PAR_OBS_KEYS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_KEYS="$VIASH_PAR_OBS_KEYS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/bpcells_regress_out:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-bpcells_regress_out-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +cat("Loading libraries\\n") +library(glue) +library(BPCells) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "obs_keys" = $( if [ ! -z ${VIASH_PAR_OBS_KEYS+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_OBS_KEYS" | sed "s#['\\]#\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "input_layer" = $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_LAYER" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_layer" = $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_LAYER" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_compression" = $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_COMPRESSION" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +# Read the h5mu file and make var names unique +mdata <- mudata\$read_h5mu(par\$input) + +# Regress out +if (!is.null(par\$obs_keys) && length(par\$obs_keys) > 0) { + cat("Regress out variables ", par\$obs_keys, " on modality ", + par\$modality, "\\n", + sep = "" + ) + + # Fetch modality AnnData and convert to an iterable matrix + adata <- mdata\$mod[[par\$modality]] + + # Fetch the input layer + mat <- + if (is.null(par\$input_layer)) { + cat("Using .X as input layer\\n") + adata\$X + } else { + cat("Using .layers ", par\$input_layer, " as input layer\\n", sep = "") + adata\$layers[[par\$input_layer]] + } + + imat <- as(as(mat, "CsparseMatrix"), "IterableMatrix") + dimnames(imat) <- NULL + + # obs_keys is not NULL and not empty + latent_data <- as.data.frame(adata\$obs[, par\$obs_keys]) + + # Regress out using BPCells + regressed_data <- regress_out(imat, latent_data, prediction_axis = "col") + + # Convert iterable matrix back to C sparse matrix + rmat <- as(regressed_data, "dgCMatrix") + + # Assign regressed out data back to AnnData object + if (is.null(par\$output_layer)) { + cat("Using .X as output layer\\n") + adata\$X <- rmat + } else { + cat("Using .layers ", par\$output_layer, " as output layer\\n", sep = "") + adata\$layers[[par\$output_layer]] <- rmat + } +} else { + cat("No obs_keys provided, skipping regression\\n") +} + +# Write to output h5mu file +mdata\$write(par\$output, compression = par\$output_compression) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/bpcells_regress_out/nextflow_labels.config b/target/executable/transform/bpcells_regress_out/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/bpcells_regress_out/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/clr/.config.vsh.yaml b/target/executable/transform/clr/.config.vsh.yaml new file mode 100644 index 00000000..86582ca6 --- /dev/null +++ b/target/executable/transform/clr/.config.vsh.yaml @@ -0,0 +1,286 @@ +name: "clr" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--axis" + description: "Axis across which CLR is performed. If set to 0, CLR is performed\ + \ across observations (cells).\nIf set to 1, CLR is performed across features\ + \ (genes).\n" + info: null + default: + - 0 + required: false + choices: + - 0 + - 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "muon~=0.1.5" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/clr/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/clr" + executable: "target/executable/transform/clr/clr" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/clr/clr b/target/executable/transform/clr/clr new file mode 100755 index 00000000..67558f8d --- /dev/null +++ b/target/executable/transform/clr/clr @@ -0,0 +1,1294 @@ +#!/usr/bin/env bash + +# clr main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="clr" +VIASH_META_FUNCTIONALITY_NAME="clr" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "muon~=0.1.5" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component transform clr" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "clr main" + echo "" + echo "Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017)." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: prot" + echo " Which modality from the input MuData file to process." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. By default, .X is used." + echo "" + echo " --output_layer" + echo " type: string" + echo " Output layer to use. By default, use X." + echo "" + echo " --axis" + echo " type: integer" + echo " default: 0" + echo " choices: [ 0, 1 ]" + echo " Axis across which CLR is performed. If set to 0, CLR is performed across" + echo " observations (cells)." + echo " If set to 1, CLR is performed across features (genes)." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "clr main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --axis) + [ -n "$VIASH_PAR_AXIS" ] && ViashError Bad arguments for option \'--axis\': \'$VIASH_PAR_AXIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AXIS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --axis. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --axis=*) + [ -n "$VIASH_PAR_AXIS" ] && ViashError Bad arguments for option \'--axis=*\': \'$VIASH_PAR_AXIS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_AXIS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/clr:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="prot" +fi +if [ -z ${VIASH_PAR_AXIS+x} ]; then + VIASH_PAR_AXIS="0" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_AXIS" ]]; then + if ! [[ "$VIASH_PAR_AXIS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--axis' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_AXIS" ]; then + VIASH_PAR_AXIS_CHOICES=("0;1") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_AXIS_CHOICES[*]};" =~ ";$VIASH_PAR_AXIS;" ]]; then + ViashError '--axis' specified value of \'$VIASH_PAR_AXIS\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-clr-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from muon import prot as pt +from mudata import read_h5ad +from anndata import AnnData +from functools import partial +from operator import setitem + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'axis': $( if [ ! -z ${VIASH_PAR_AXIS+x} ]; then echo "int(r'${VIASH_PAR_AXIS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + input_data = read_h5ad(par["input"], mod=par["modality"]) + if par["input_layer"]: + input_data = AnnData(X=input_data.layers[par["input_layer"]]) + # CLR always normalizes the .X layer, so we have to create an AnnData file with + # the input layer at .X + normalized_counts = pt.pp.clr(input_data, axis=par["axis"], inplace=False) + if not normalized_counts: + raise RuntimeError("CLR failed to return the requested output layer") + + output_layer_setter = ( + partial(setattr, input_data, "X") + if not par["output_layer"] + else partial(setitem, input_data.layers, par["output_layer"]) + ) + output_layer_setter(normalized_counts.X) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + input_data, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/clr/compress_h5mu.py b/target/executable/transform/clr/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/clr/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/clr/nextflow_labels.config b/target/executable/transform/clr/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/clr/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/delete_layer/.config.vsh.yaml b/target/executable/transform/delete_layer/.config.vsh.yaml new file mode 100644 index 00000000..50aaec28 --- /dev/null +++ b/target/executable/transform/delete_layer/.config.vsh.yaml @@ -0,0 +1,274 @@ +name: "delete_layer" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to remove" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--missing_ok" + description: "Do not raise an error if the layer does not exist for all modalities." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Delete an anndata layer from one or more modalities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/delete_layer/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/delete_layer" + executable: "target/executable/transform/delete_layer/delete_layer" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/delete_layer/compress_h5mu.py b/target/executable/transform/delete_layer/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/delete_layer/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/delete_layer/delete_layer b/target/executable/transform/delete_layer/delete_layer new file mode 100755 index 00000000..c15bbac2 --- /dev/null +++ b/target/executable/transform/delete_layer/delete_layer @@ -0,0 +1,1266 @@ +#!/usr/bin/env bash + +# delete_layer main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="delete_layer" +VIASH_META_FUNCTIONALITY_NAME="delete_layer" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component transform delete_layer" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "delete_layer main" + echo "" + echo "Delete an anndata layer from one or more modalities." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --layer" + echo " type: string, required parameter, multiple values allowed" + echo " Input layer to remove" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --missing_ok" + echo " type: boolean_true" + echo " Do not raise an error if the layer does not exist for all modalities." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "delete_layer main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer) + if [ -z "$VIASH_PAR_LAYER" ]; then + VIASH_PAR_LAYER="$2" + else + VIASH_PAR_LAYER="$VIASH_PAR_LAYER;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer=*) + if [ -z "$VIASH_PAR_LAYER" ]; then + VIASH_PAR_LAYER=$(ViashRemoveFlags "$1") + else + VIASH_PAR_LAYER="$VIASH_PAR_LAYER;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --missing_ok) + [ -n "$VIASH_PAR_MISSING_OK" ] && ViashError Bad arguments for option \'--missing_ok\': \'$VIASH_PAR_MISSING_OK\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MISSING_OK=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/delete_layer:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_LAYER+x} ]; then + ViashError '--layer' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_MISSING_OK+x} ]; then + VIASH_PAR_MISSING_OK="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MISSING_OK" ]]; then + if ! [[ "$VIASH_PAR_MISSING_OK" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--missing_ok' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-delete_layer-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from mudata import read_h5ad +from pathlib import Path + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'missing_ok': $( if [ ! -z ${VIASH_PAR_MISSING_OK+x} ]; then echo "r'${VIASH_PAR_MISSING_OK//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + mod = read_h5ad(input_file, mod=mod_name) + for layer in par["layer"]: + if layer not in mod.layers: + if par["missing_ok"]: + continue + raise ValueError(f"Layer '{layer}' is not present in modality {mod_name}.") + logger.info("Deleting layer %s from modality %s.", layer, mod_name) + del mod.layers[layer] + + logger.info("Writing output to %s.", par["output"]) + + write_h5ad_to_h5mu_with_compression( + output_file, input_file, mod_name, mod, par["output_compression"] + ) + + logger.info("Finished.") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/delete_layer/nextflow_labels.config b/target/executable/transform/delete_layer/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/delete_layer/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/delete_layer/setup_logger.py b/target/executable/transform/delete_layer/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/delete_layer/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/log1p/.config.vsh.yaml b/target/executable/transform/log1p/.config.vsh.yaml new file mode 100644 index 00000000..74fdceac --- /dev/null +++ b/target/executable/transform/log1p/.config.vsh.yaml @@ -0,0 +1,299 @@ +name: "log1p" +namespace: "transform" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--base" + description: "Base of the logarithm. Natural logarithm is used by default.\n" + info: null + example: + - 2.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Logarithmize the data matrix. Computes X = log(X + 1), where log denotes\ + \ the natural logarithm unless a different base is given.\n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/log1p/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/log1p" + executable: "target/executable/transform/log1p/log1p" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/log1p/compress_h5mu.py b/target/executable/transform/log1p/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/log1p/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/log1p/log1p b/target/executable/transform/log1p/log1p new file mode 100755 index 00000000..4f7273fe --- /dev/null +++ b/target/executable/transform/log1p/log1p @@ -0,0 +1,1278 @@ +#!/usr/bin/env bash + +# log1p main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) +# * Robrecht Cannoodt (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="log1p" +VIASH_META_FUNCTIONALITY_NAME="log1p" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component transform log1p" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "log1p main" + echo "" + echo "Logarithmize the data matrix. Computes X = log(X + 1), where log denotes the" + echo "natural logarithm unless a different base is given." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. If None, X is normalized" + echo "" + echo " --output_layer" + echo " type: string" + echo " Output layer to use. By default, use X." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --base" + echo " type: double" + echo " example: 2.0" + echo " Base of the logarithm. Natural logarithm is used by default." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "log1p main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --base) + [ -n "$VIASH_PAR_BASE" ] && ViashError Bad arguments for option \'--base\': \'$VIASH_PAR_BASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --base. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --base=*) + [ -n "$VIASH_PAR_BASE" ] && ViashError Bad arguments for option \'--base=*\': \'$VIASH_PAR_BASE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BASE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/log1p:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_BASE" ]]; then + if ! [[ "$VIASH_PAR_BASE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--base' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-log1p-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import anndata as ad +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'base': $( if [ ! -z ${VIASH_PAR_BASE+x} ]; then echo "float(r'${VIASH_PAR_BASE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input mudata", par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) +assert data.var_names.is_unique, "Expected var_names to be unique." + +logger.info("Performing log transformation") +# Make our own copy with not a lot of data +# this avoid excessive memory usage and accidental overwrites +input_layer = data.layers[par["input_layer"]] if par["input_layer"] else data.X +data_for_scanpy = ad.AnnData(X=input_layer.copy()) +sc.pp.log1p( + data_for_scanpy, + base=par["base"], + layer=None, # use X + copy=False, +) # allow overwrites in the copy that was made + +# Scanpy will overwrite the input layer. +# So fetch input layer from the copy and use it to populate the output slot +if par["output_layer"]: + data.layers[par["output_layer"]] = data_for_scanpy.X +else: + data.X = data_for_scanpy.X +data.uns["log1p"] = data_for_scanpy.uns["log1p"].copy() + +logger.info("Writing to file %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/log1p/nextflow_labels.config b/target/executable/transform/log1p/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/log1p/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/log1p/setup_logger.py b/target/executable/transform/log1p/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/log1p/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/move_layer/.config.vsh.yaml b/target/executable/transform/move_layer/.config.vsh.yaml new file mode 100644 index 00000000..ae9ab31a --- /dev/null +++ b/target/executable/transform/move_layer/.config.vsh.yaml @@ -0,0 +1,264 @@ +name: "move_layer" +namespace: "transform" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to move to a new output location. If specified, will\ + \ be used to select a key from .layers,\notherwise .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Destination location for the layer. If not provided, .X will be\ + \ used,\nOtherwise, will be the key for the .layers attribute in the output\ + \ MuData file.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Move a data matrix stored at the .layers or .X attributes in a MuData\ + \ object to another layer." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/move_layer/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/move_layer" + executable: "target/executable/transform/move_layer/move_layer" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/move_layer/compress_h5mu.py b/target/executable/transform/move_layer/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/move_layer/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/move_layer/move_layer b/target/executable/transform/move_layer/move_layer new file mode 100755 index 00000000..a6d6f028 --- /dev/null +++ b/target/executable/transform/move_layer/move_layer @@ -0,0 +1,1256 @@ +#!/usr/bin/env bash + +# move_layer main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="move_layer" +VIASH_META_FUNCTIONALITY_NAME="move_layer" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.description="Companion container for running component transform move_layer" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "move_layer main" + echo "" + echo "Move a data matrix stored at the .layers or .X attributes in a MuData object to" + echo "another layer." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to move to a new output location. If specified, will be used" + echo " to select a key from .layers," + echo " otherwise .X is used." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_layer" + echo " type: string" + echo " Destination location for the layer. If not provided, .X will be used," + echo " Otherwise, will be the key for the .layers attribute in the output" + echo " MuData file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "move_layer main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/move_layer:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-move_layer-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from mudata import read_h5ad +from functools import partial +from operator import setitem + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s mudata from file %s", par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + + +logger.info( + "Using input layer '%s'", "X" if not par["input_layer"] else par["input_layer"] +) +if par["input_layer"]: + data_to_write = mod_data.layers[par["input_layer"]].copy() + del mod_data.layers[par["input_layer"]] +else: + data_to_write = mod_data.X + mod_data.X = None + +output_layer_setter = ( + partial(setattr, mod_data, "X") + if not par["output_layer"] + else partial(setitem, mod_data.layers, par["output_layer"]) +) +output_layer_setter(data_to_write) + +logger.info( + "Writing output to file %s with compression %s", + par["output"], + par["output_compression"], +) + +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/move_layer/nextflow_labels.config b/target/executable/transform/move_layer/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/move_layer/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/move_layer/setup_logger.py b/target/executable/transform/move_layer/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/move_layer/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/normalize_total/.config.vsh.yaml b/target/executable/transform/normalize_total/.config.vsh.yaml new file mode 100644 index 00000000..8db4a9a0 --- /dev/null +++ b/target/executable/transform/normalize_total/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "normalize_total" +namespace: "transform" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--target_sum" + description: "If None, after normalization, each observation (cell) has a total\ + \ count equal to the median of total counts for observations (cells) before\ + \ normalization." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_highly_expressed" + description: "Exclude (very) highly expressed genes for the computation of the\ + \ normalization factor (size factor) for each cell. A gene is considered highly\ + \ expressed, if it has more than max_fraction of the total counts in at least\ + \ one cell. The not-excluded genes will sum up to target_sum." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Normalize counts per cell.\n\nNormalize each cell by total counts over\ + \ all genes, so that every cell has the same total count after normalization. If\ + \ choosing target_sum=1e6, this is CPM normalization.\n\nIf exclude_highly_expressed=True,\ + \ very highly expressed genes are excluded from the computation of the normalization\ + \ factor (size factor) for each cell. This is meaningful as these can strongly influence\ + \ the resulting normalized values for all other genes [Weinreb17].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/normalize_total/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/normalize_total" + executable: "target/executable/transform/normalize_total/normalize_total" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/normalize_total/compress_h5mu.py b/target/executable/transform/normalize_total/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/normalize_total/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/normalize_total/nextflow_labels.config b/target/executable/transform/normalize_total/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/normalize_total/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/normalize_total/normalize_total b/target/executable/transform/normalize_total/normalize_total new file mode 100755 index 00000000..b588c189 --- /dev/null +++ b/target/executable/transform/normalize_total/normalize_total @@ -0,0 +1,1312 @@ +#!/usr/bin/env bash + +# normalize_total main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries De Maeyer (maintainer) +# * Robrecht Cannoodt (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="normalize_total" +VIASH_META_FUNCTIONALITY_NAME="normalize_total" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries De Maeyer, Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component transform normalize_total" +LABEL org.opencontainers.image.created="2025-08-22T07:23:04Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "normalize_total main" + echo "" + echo "Normalize counts per cell." + echo "" + echo "Normalize each cell by total counts over all genes, so that every cell has the" + echo "same total count after normalization. If choosing target_sum=1e6, this is CPM" + echo "normalization." + echo "" + echo "If exclude_highly_expressed=True, very highly expressed genes are excluded from" + echo "the computation of the normalization factor (size factor) for each cell. This is" + echo "meaningful as these can strongly influence the resulting normalized values for" + echo "all other genes [Weinreb17]." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. By default, X is normalized" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_layer" + echo " type: string" + echo " Output layer to use. By default, use X." + echo "" + echo " --target_sum" + echo " type: integer" + echo " If None, after normalization, each observation (cell) has a total count" + echo " equal to the median of total counts for observations (cells) before" + echo " normalization." + echo "" + echo " --exclude_highly_expressed" + echo " type: boolean_true" + echo " Exclude (very) highly expressed genes for the computation of the" + echo " normalization factor (size factor) for each cell. A gene is considered" + echo " highly expressed, if it has more than max_fraction of the total counts" + echo " in at least one cell. The not-excluded genes will sum up to target_sum." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "normalize_total main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --target_sum) + [ -n "$VIASH_PAR_TARGET_SUM" ] && ViashError Bad arguments for option \'--target_sum\': \'$VIASH_PAR_TARGET_SUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGET_SUM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --target_sum. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --target_sum=*) + [ -n "$VIASH_PAR_TARGET_SUM" ] && ViashError Bad arguments for option \'--target_sum=*\': \'$VIASH_PAR_TARGET_SUM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TARGET_SUM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --exclude_highly_expressed) + [ -n "$VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED" ] && ViashError Bad arguments for option \'--exclude_highly_expressed\': \'$VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED=true + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/normalize_total:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED+x} ]; then + VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_TARGET_SUM" ]]; then + if ! [[ "$VIASH_PAR_TARGET_SUM" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--target_sum' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED" ]]; then + if ! [[ "$VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--exclude_highly_expressed' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-normalize_total-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import scanpy as sc +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'target_sum': $( if [ ! -z ${VIASH_PAR_TARGET_SUM+x} ]; then echo "int(r'${VIASH_PAR_TARGET_SUM//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'exclude_highly_expressed': $( if [ ! -z ${VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +dat = mu.read_h5ad(par["input"], mod=par["modality"]) +assert dat.var_names.is_unique, "The var_names of the input modality must be be unique." + +logger.info(par) + +logger.info("Performing total normalization.") +if par["input_layer"] and par["input_layer"] not in dat.layers.keys(): + raise ValueError(f"Input layer {par['input_layer']} not found in {par['modality']}") +output_data = sc.pp.normalize_total( + dat, + layer=par["input_layer"], + target_sum=par["target_sum"], + copy=True if par["output_layer"] else False, +) + +if output_data: + result = ( + output_data.X + if not par["input_layer"] + else output_data.layers[par["input_layer"]] + ) + dat.layers[par["output_layer"]] = result + +logger.info( + "Writing to file to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], dat, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/normalize_total/setup_logger.py b/target/executable/transform/normalize_total/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/normalize_total/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/regress_out/.config.vsh.yaml b/target/executable/transform/regress_out/.config.vsh.yaml new file mode 100644 index 00000000..8ce3b179 --- /dev/null +++ b/target/executable/transform/regress_out/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "regress_out" +namespace: "transform" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality (one or more) to run this component on." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_keys" + description: "Which .obs keys to regress on." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object to regress on.\nIf not provided, the\ + \ X attribute of the adata object will be used.\n" + info: null + example: + - "X_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "The layer of the adata object containing the regressed count data.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "X_regressed" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Regress out (mostly) unwanted sources of variation.\nUses simple linear\ + \ regression. This is inspired by Seurat's regressOut function in R [Satija15].\ + \ \nNote that this function tends to overcorrect in certain circumstances as described\ + \ in issue theislab/scanpy#526.\nSee https://github.com/theislab/scanpy/issues/526.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/regress_out/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/regress_out" + executable: "target/executable/transform/regress_out/regress_out" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/regress_out/nextflow_labels.config b/target/executable/transform/regress_out/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/regress_out/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/regress_out/regress_out b/target/executable/transform/regress_out/regress_out new file mode 100755 index 00000000..b7c8362d --- /dev/null +++ b/target/executable/transform/regress_out/regress_out @@ -0,0 +1,1276 @@ +#!/usr/bin/env bash + +# regress_out main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer, contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="regress_out" +VIASH_META_FUNCTIONALITY_NAME="regress_out" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component transform regress_out" +LABEL org.opencontainers.image.created="2025-08-22T07:23:05Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "regress_out main" + echo "" + echo "Regress out (mostly) unwanted sources of variation." + echo "Uses simple linear regression. This is inspired by Seurat's regressOut function" + echo "in R [Satija15]." + echo "Note that this function tends to overcorrect in certain circumstances as" + echo "described in issue theislab/scanpy#526." + echo "See https://github.com/theislab/scanpy/issues/526." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " Which modality (one or more) to run this component on." + echo "" + echo " --obs_keys" + echo " type: string, multiple values allowed" + echo " Which .obs keys to regress on." + echo "" + echo " --input_layer" + echo " type: string" + echo " example: X_normalized" + echo " The layer of the adata object to regress on." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --output_layer" + echo " type: string" + echo " example: X_regressed" + echo " The layer of the adata object containing the regressed count data." + echo " If not provided, the X attribute of the adata object will be used." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "regress_out main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --obs_keys) + if [ -z "$VIASH_PAR_OBS_KEYS" ]; then + VIASH_PAR_OBS_KEYS="$2" + else + VIASH_PAR_OBS_KEYS="$VIASH_PAR_OBS_KEYS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --obs_keys. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --obs_keys=*) + if [ -z "$VIASH_PAR_OBS_KEYS" ]; then + VIASH_PAR_OBS_KEYS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_OBS_KEYS="$VIASH_PAR_OBS_KEYS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/regress_out:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-regress_out-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import scanpy as sc +import mudata as mu +import anndata as ad +import multiprocessing +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'obs_keys': $( if [ ! -z ${VIASH_PAR_OBS_KEYS+x} ]; then echo "r'${VIASH_PAR_OBS_KEYS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) +mdata.var_names_make_unique() + +if par["obs_keys"] is not None and len(par["obs_keys"]) > 0: + mod = par["modality"] + data = mdata.mod[mod] + + # Copy required data from input data to new AnnData object to allow providing input and output layers + input_layer = data.X if not par["input_layer"] else data.layers[par["input_layer"]] + obs = data.obs.loc[:, par["obs_keys"]] + sc_data = ad.AnnData(X=input_layer.copy(), obs=obs) + + logger.info("Regress out variables on modality %s", mod) + sc.pp.regress_out( + sc_data, keys=par["obs_keys"], n_jobs=multiprocessing.cpu_count() - 1 + ) + + # Copy regressed data back to original input data + if par["output_layer"]: + data.layers[par["output_layer"]] = sc_data.X + else: + data.X = sc_data.X + +logger.info("Writing to file") +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/regress_out/setup_logger.py b/target/executable/transform/regress_out/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/regress_out/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/scale/.config.vsh.yaml b/target/executable/transform/scale/.config.vsh.yaml new file mode 100644 index 00000000..8e6ed452 --- /dev/null +++ b/target/executable/transform/scale/.config.vsh.yaml @@ -0,0 +1,293 @@ +name: "scale" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer with data to scale. Uses .X by default" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer where scaled data will be stored. If not specified,\ + \ .X will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_value" + description: "Clip (truncate) to this value after scaling. Does not clip by default." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--zero_center" + description: "If set, omit zero-centering variables, which allows to handle sparse\ + \ input efficiently." + info: null + direction: "input" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Scale data to unit variance and zero mean.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/scale/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/scale" + executable: "target/executable/transform/scale/scale" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/scale/compress_h5mu.py b/target/executable/transform/scale/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/scale/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/scale/nextflow_labels.config b/target/executable/transform/scale/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/scale/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/scale/scale b/target/executable/transform/scale/scale new file mode 100755 index 00000000..a86f6043 --- /dev/null +++ b/target/executable/transform/scale/scale @@ -0,0 +1,1304 @@ +#!/usr/bin/env bash + +# scale main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scale" +VIASH_META_FUNCTIONALITY_NAME="scale" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component transform scale" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scale main" + echo "" + echo "Scale data to unit variance and zero mean." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file." + echo "" + echo " --modality" + echo " type: string" + echo " default: rna" + echo " List of modalities to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer with data to scale. Uses .X by default" + echo "" + echo " --output_layer" + echo " type: string" + echo " Output layer where scaled data will be stored. If not specified, .X will" + echo " be used." + echo "" + echo " --max_value" + echo " type: double" + echo " Clip (truncate) to this value after scaling. Does not clip by default." + echo "" + echo " --zero_center" + echo " type: boolean_false" + echo " If set, omit zero-centering variables, which allows to handle sparse" + echo " input efficiently." + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " default: output.h5mu" + echo " Output h5mu file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scale main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --max_value) + [ -n "$VIASH_PAR_MAX_VALUE" ] && ViashError Bad arguments for option \'--max_value\': \'$VIASH_PAR_MAX_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_VALUE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --max_value. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --max_value=*) + [ -n "$VIASH_PAR_MAX_VALUE" ] && ViashError Bad arguments for option \'--max_value=*\': \'$VIASH_PAR_MAX_VALUE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MAX_VALUE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --zero_center) + [ -n "$VIASH_PAR_ZERO_CENTER" ] && ViashError Bad arguments for option \'--zero_center\': \'$VIASH_PAR_ZERO_CENTER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ZERO_CENTER=false + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/scale:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="rna" +fi +if [ -z ${VIASH_PAR_ZERO_CENTER+x} ]; then + VIASH_PAR_ZERO_CENTER="true" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MAX_VALUE" ]]; then + if ! [[ "$VIASH_PAR_MAX_VALUE" =~ ^[-+]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?$ ]]; then + ViashError '--max_value' has to be a double. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_ZERO_CENTER" ]]; then + if ! [[ "$VIASH_PAR_ZERO_CENTER" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--zero_center' has to be a boolean_false. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scale-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +from mudata import read_h5ad +import scanpy +from functools import partial +from operator import setitem + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'max_value': $( if [ ! -z ${VIASH_PAR_MAX_VALUE+x} ]; then echo "float(r'${VIASH_PAR_MAX_VALUE//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'zero_center': $( if [ ! -z ${VIASH_PAR_ZERO_CENTER+x} ]; then echo "r'${VIASH_PAR_ZERO_CENTER//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info( + "Reading modality %s from .h5mu file: %s", par["modality"], par["input"] + ) + data = read_h5ad(par["input"], mod=par["modality"]) + logger.info("Scaling modality %s.", par["modality"]) + scanpy_output = scanpy.pp.scale( + data, + layer=par["input_layer"], + zero_center=par["zero_center"], + max_value=par["max_value"], + copy=True, + ) + output_layer_setter = ( + partial(setattr, data, "X") + if not par["output_layer"] + else partial(setitem, data.layers, par["output_layer"]) + ) + output_layer_setter( + scanpy_output.X + if not par["input_layer"] + else scanpy_output.layers[par["input_layer"]] + ) + logger.info( + "Writing to %s with compression %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/transform/scale/setup_logger.py b/target/executable/transform/scale/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/scale/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/tfidf/.config.vsh.yaml b/target/executable/transform/tfidf/.config.vsh.yaml new file mode 100644 index 00000000..d08ec109 --- /dev/null +++ b/target/executable/transform/tfidf/.config.vsh.yaml @@ -0,0 +1,324 @@ +name: "tfidf" +namespace: "transform" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use." + info: null + default: + - "tfidf" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scale_factor" + description: "Scale factor to multiply the TF-IDF matrix by." + info: null + default: + - 10000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_idf" + description: "Whether to log-transform IDF term." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_tf" + description: "Whether to log-transform TF term." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_tfidf" + description: "Whether to log-transform TF*IDF term (False by default). Can only\ + \ be used when log_tf and log_idf are False." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform TF-IDF normalization of the data (typically, ATAC).\n\nTF-IDF\ + \ stands for \"term frequency - inverse document frequency\". It is a technique\ + \ from natural language processing analysis.\nIn the context of ATAC data, \"terms\"\ + \ are the features (genes) and \"documents\" are the observations (cells). \nTF-IDF\ + \ normalization is applied to single-cell ATAC-seq data to highlight the importance\ + \ of specific genomic regions (typically peaks)\nacross different cells while down-weighting\ + \ regions that are commonly accessible across many cells. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "counts" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim-bullseye" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + - "pkg-config" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "muon~=0.1.5" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/tfidf/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/transform/tfidf" + executable: "target/executable/transform/tfidf/tfidf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/transform/tfidf/compress_h5mu.py b/target/executable/transform/tfidf/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/transform/tfidf/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/transform/tfidf/nextflow_labels.config b/target/executable/transform/tfidf/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/transform/tfidf/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/transform/tfidf/setup_logger.py b/target/executable/transform/tfidf/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/transform/tfidf/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/transform/tfidf/tfidf b/target/executable/transform/tfidf/tfidf new file mode 100755 index 00000000..f703382f --- /dev/null +++ b/target/executable/transform/tfidf/tfidf @@ -0,0 +1,1372 @@ +#!/usr/bin/env bash + +# tfidf main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Vladimir Shitov (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="tfidf" +VIASH_META_FUNCTIONALITY_NAME="tfidf" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim-bullseye +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y libhdf5-dev procps pkg-config gcc && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "muon~=0.1.5" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Vladimir Shitov" +LABEL org.opencontainers.image.description="Companion container for running component transform tfidf" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "tfidf main" + echo "" + echo "Perform TF-IDF normalization of the data (typically, ATAC)." + echo "" + echo "TF-IDF stands for \"term frequency - inverse document frequency\". It is a" + echo "technique from natural language processing analysis." + echo "In the context of ATAC data, \"terms\" are the features (genes) and \"documents\"" + echo "are the observations (cells)." + echo "TF-IDF normalization is applied to single-cell ATAC-seq data to highlight the" + echo "importance of specific genomic regions (typically peaks)" + echo "across different cells while down-weighting regions that are commonly accessible" + echo "across many cells." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " example: input.h5mu" + echo " Input h5mu file" + echo "" + echo " --modality" + echo " type: string" + echo " default: atac" + echo " Which modality from the input MuData file to process." + echo "" + echo " --input_layer" + echo " type: string" + echo " Input layer to use. By default, X is normalized" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Output h5mu file." + echo "" + echo " --output_layer" + echo " type: string" + echo " default: tfidf" + echo " Output layer to use." + echo "" + echo " --scale_factor" + echo " type: integer" + echo " default: 10000" + echo " min: 1" + echo " Scale factor to multiply the TF-IDF matrix by." + echo "" + echo " --log_idf" + echo " type: boolean" + echo " default: true" + echo " Whether to log-transform IDF term." + echo "" + echo " --log_tf" + echo " type: boolean" + echo " default: true" + echo " Whether to log-transform TF term." + echo "" + echo " --log_tfidf" + echo " type: boolean" + echo " default: false" + echo " Whether to log-transform TF*IDF term (False by default). Can only be" + echo " used when log_tf and log_idf are False." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "tfidf main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --input_layer) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_layer=*) + [ -n "$VIASH_PAR_INPUT_LAYER" ] && ViashError Bad arguments for option \'--input_layer=*\': \'$VIASH_PAR_INPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_layer=*) + [ -n "$VIASH_PAR_OUTPUT_LAYER" ] && ViashError Bad arguments for option \'--output_layer=*\': \'$VIASH_PAR_OUTPUT_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --scale_factor) + [ -n "$VIASH_PAR_SCALE_FACTOR" ] && ViashError Bad arguments for option \'--scale_factor\': \'$VIASH_PAR_SCALE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_FACTOR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --scale_factor. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --scale_factor=*) + [ -n "$VIASH_PAR_SCALE_FACTOR" ] && ViashError Bad arguments for option \'--scale_factor=*\': \'$VIASH_PAR_SCALE_FACTOR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_SCALE_FACTOR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --log_idf) + [ -n "$VIASH_PAR_LOG_IDF" ] && ViashError Bad arguments for option \'--log_idf\': \'$VIASH_PAR_LOG_IDF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_IDF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --log_idf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --log_idf=*) + [ -n "$VIASH_PAR_LOG_IDF" ] && ViashError Bad arguments for option \'--log_idf=*\': \'$VIASH_PAR_LOG_IDF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_IDF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --log_tf) + [ -n "$VIASH_PAR_LOG_TF" ] && ViashError Bad arguments for option \'--log_tf\': \'$VIASH_PAR_LOG_TF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --log_tf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --log_tf=*) + [ -n "$VIASH_PAR_LOG_TF" ] && ViashError Bad arguments for option \'--log_tf=*\': \'$VIASH_PAR_LOG_TF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --log_tfidf) + [ -n "$VIASH_PAR_LOG_TFIDF" ] && ViashError Bad arguments for option \'--log_tfidf\': \'$VIASH_PAR_LOG_TFIDF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TFIDF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --log_tfidf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --log_tfidf=*) + [ -n "$VIASH_PAR_LOG_TFIDF" ] && ViashError Bad arguments for option \'--log_tfidf=*\': \'$VIASH_PAR_LOG_TFIDF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TFIDF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/transform/tfidf:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + VIASH_PAR_MODALITY="atac" +fi +if [ -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then + VIASH_PAR_OUTPUT_LAYER="tfidf" +fi +if [ -z ${VIASH_PAR_SCALE_FACTOR+x} ]; then + VIASH_PAR_SCALE_FACTOR="10000" +fi +if [ -z ${VIASH_PAR_LOG_IDF+x} ]; then + VIASH_PAR_LOG_IDF="true" +fi +if [ -z ${VIASH_PAR_LOG_TF+x} ]; then + VIASH_PAR_LOG_TF="true" +fi +if [ -z ${VIASH_PAR_LOG_TFIDF+x} ]; then + VIASH_PAR_LOG_TFIDF="false" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_SCALE_FACTOR" ]]; then + if ! [[ "$VIASH_PAR_SCALE_FACTOR" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--scale_factor' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_SCALE_FACTOR -lt 1 ]]; then + ViashError '--scale_factor' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOG_IDF" ]]; then + if ! [[ "$VIASH_PAR_LOG_IDF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--log_idf' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOG_TF" ]]; then + if ! [[ "$VIASH_PAR_LOG_TF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--log_tf' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOG_TFIDF" ]]; then + if ! [[ "$VIASH_PAR_LOG_TFIDF" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--log_tfidf' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-tfidf-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata +import muon + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'scale_factor': $( if [ ! -z ${VIASH_PAR_SCALE_FACTOR+x} ]; then echo "int(r'${VIASH_PAR_SCALE_FACTOR//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'log_idf': $( if [ ! -z ${VIASH_PAR_LOG_IDF+x} ]; then echo "r'${VIASH_PAR_LOG_IDF//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'log_tf': $( if [ ! -z ${VIASH_PAR_LOG_TF+x} ]; then echo "r'${VIASH_PAR_LOG_TF//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'log_tfidf': $( if [ ! -z ${VIASH_PAR_LOG_TFIDF+x} ]; then echo "r'${VIASH_PAR_LOG_TFIDF//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mudata.read_h5ad(par["input"], mod=par["modality"]) + +logger.info(par) + +logger.info("Performing TF-IDF normalization") +input_data = adata.copy() + +muon.atac.pp.tfidf( + input_data, + log_tf=par["log_tf"], + log_idf=par["log_idf"], + log_tfidf=par["log_tfidf"], + scale_factor=par["scale_factor"], + inplace=True, + copy=False, + from_layer=par["input_layer"], + to_layer=par["output_layer"], +) + +adata.layers[par["output_layer"]] = input_data.layers[par["output_layer"]] + +logger.info("Writing to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/velocity/scvelo/.config.vsh.yaml b/target/executable/velocity/scvelo/.config.vsh.yaml new file mode 100644 index 00000000..515ac8d1 --- /dev/null +++ b/target/executable/velocity/scvelo/.config.vsh.yaml @@ -0,0 +1,403 @@ +name: "scvelo" +namespace: "velocity" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input MuData file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--counts_layer" + description: "Name of the counts layer, if not specified, X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Input modality" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_spliced" + description: "Name of the layer to store the spliced abundances in. Will be used\ + \ as key in the .layer attribute of the\noutput MuData object.\n" + info: null + default: + - "spliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_unspliced" + description: "Name of the layer to store the unspliced abundances in.\nWill be\ + \ used as key in the .layer attribute of the output MuData object.\n" + info: null + default: + - "unspliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_ambiguous" + description: "Name of the layer to store the abundances in for which no fate was\ + \ determined.\nWill be used as key in the .layer attribute of the output MuData\ + \ object.\n" + info: null + default: + - "ambiguous" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output directory. If it does not exist, will be created." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_h5mu" + description: "Output mudata file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Filtering and normalization" + description: "Arguments for filtering, normalization an log transform (see scvelo.pp.filter_and_normalize\ + \ function)" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts required for a gene to pass filtering (spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts_u" + description: "Minimum number of counts required for a gene to pass filtering (unspliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "Minimum number of cells expressed required to pass filtering (spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_u" + description: "Minimum number of cells expressed required to pass filtering (unspliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_shared_counts" + description: "Minimum number of counts (both unspliced and spliced) required for\ + \ a gene." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_shared_cells" + description: "Minimum number of cells required to be expressed (both unspliced\ + \ and spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_top_genes" + description: "Number of genes to keep." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_transform" + description: "Do not log transform counts." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Fitting parameters" + description: "Arguments for fitting the data" + arguments: + - type: "integer" + name: "--n_principal_components" + description: "Number of principal components to use for calculating moments." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "Number of neighbors to use. First/second-order moments are computed\ + \ for each\ncell across its nearest neighbors, where the neighbor graph is obtained\ + \ from\neuclidean distances in PCA space.\n" + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "velocyto.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scvelo~=0.3.3" + - "scipy~=1.14.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/scvelo/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/velocity/scvelo" + executable: "target/executable/velocity/scvelo/scvelo" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/velocity/scvelo/compress_h5mu.py b/target/executable/velocity/scvelo/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/executable/velocity/scvelo/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/executable/velocity/scvelo/nextflow_labels.config b/target/executable/velocity/scvelo/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/velocity/scvelo/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/velocity/scvelo/scvelo b/target/executable/velocity/scvelo/scvelo new file mode 100755 index 00000000..0a476682 --- /dev/null +++ b/target/executable/velocity/scvelo/scvelo @@ -0,0 +1,1678 @@ +#!/usr/bin/env bash + +# scvelo main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="scvelo" +VIASH_META_FUNCTIONALITY_NAME="scvelo" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "anndata~=0.11.1" "mudata~=0.3.1" "scanpy~=1.10.4" "scvelo~=0.3.3" "scipy~=1.14.1" && \ + python -c 'exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component velocity scvelo" +LABEL org.opencontainers.image.created="2025-08-22T07:23:06Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "scvelo main" + echo "" + echo "Inputs:" + echo " --input" + echo " type: file, required parameter, file must exist" + echo " Input MuData file" + echo "" + echo " --counts_layer" + echo " type: string" + echo " Name of the counts layer, if not specified, X is used." + echo "" + echo " --modality" + echo " type: string, required parameter" + echo " Input modality" + echo "" + echo " --layer_spliced" + echo " type: string" + echo " default: spliced" + echo " Name of the layer to store the spliced abundances in. Will be used as" + echo " key in the .layer attribute of the" + echo " output MuData object." + echo "" + echo " --layer_unspliced" + echo " type: string" + echo " default: unspliced" + echo " Name of the layer to store the unspliced abundances in." + echo " Will be used as key in the .layer attribute of the output MuData object." + echo "" + echo " --layer_ambiguous" + echo " type: string" + echo " default: ambiguous" + echo " Name of the layer to store the abundances in for which no fate was" + echo " determined." + echo " Will be used as key in the .layer attribute of the output MuData object." + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " Output directory. If it does not exist, will be created." + echo "" + echo " --output_h5mu" + echo " type: file, required parameter, output, file must exist" + echo " Output mudata file." + echo "" + echo " --output_compression" + echo " type: string" + echo " example: gzip" + echo " choices: [ gzip, lzf ]" + echo " Compression format to use for the output AnnData and/or Mudata objects." + echo " By default no compression is applied." + echo "" + echo "Filtering and normalization:" + echo " Arguments for filtering, normalization an log transform (see" + echo " scvelo.pp.filter_and_normalize function)" + echo "" + echo " --min_counts" + echo " type: integer" + echo " Minimum number of counts required for a gene to pass filtering" + echo " (spliced)." + echo "" + echo " --min_counts_u" + echo " type: integer" + echo " Minimum number of counts required for a gene to pass filtering" + echo " (unspliced)." + echo "" + echo " --min_cells" + echo " type: integer" + echo " Minimum number of cells expressed required to pass filtering (spliced)." + echo "" + echo " --min_cells_u" + echo " type: integer" + echo " Minimum number of cells expressed required to pass filtering" + echo " (unspliced)." + echo "" + echo " --min_shared_counts" + echo " type: integer" + echo " Minimum number of counts (both unspliced and spliced) required for a" + echo " gene." + echo "" + echo " --min_shared_cells" + echo " type: integer" + echo " Minimum number of cells required to be expressed (both unspliced and" + echo " spliced)." + echo "" + echo " --n_top_genes" + echo " type: integer" + echo " Number of genes to keep." + echo "" + echo " --log_transform" + echo " type: boolean" + echo " default: true" + echo " Do not log transform counts." + echo "" + echo "Fitting parameters:" + echo " Arguments for fitting the data" + echo "" + echo " --n_principal_components" + echo " type: integer" + echo " Number of principal components to use for calculating moments." + echo "" + echo " --n_neighbors" + echo " type: integer" + echo " default: 30" + echo " Number of neighbors to use. First/second-order moments are computed for" + echo " each" + echo " cell across its nearest neighbors, where the neighbor graph is obtained" + echo " from" + echo " euclidean distances in PCA space." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "scvelo main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --counts_layer) + [ -n "$VIASH_PAR_COUNTS_LAYER" ] && ViashError Bad arguments for option \'--counts_layer\': \'$VIASH_PAR_COUNTS_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COUNTS_LAYER="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --counts_layer. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --counts_layer=*) + [ -n "$VIASH_PAR_COUNTS_LAYER" ] && ViashError Bad arguments for option \'--counts_layer=*\': \'$VIASH_PAR_COUNTS_LAYER\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_COUNTS_LAYER=$(ViashRemoveFlags "$1") + shift 1 + ;; + --modality) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --modality. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --modality=*) + [ -n "$VIASH_PAR_MODALITY" ] && ViashError Bad arguments for option \'--modality=*\': \'$VIASH_PAR_MODALITY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MODALITY=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_spliced) + [ -n "$VIASH_PAR_LAYER_SPLICED" ] && ViashError Bad arguments for option \'--layer_spliced\': \'$VIASH_PAR_LAYER_SPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_SPLICED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_spliced. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_spliced=*) + [ -n "$VIASH_PAR_LAYER_SPLICED" ] && ViashError Bad arguments for option \'--layer_spliced=*\': \'$VIASH_PAR_LAYER_SPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_SPLICED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_unspliced) + [ -n "$VIASH_PAR_LAYER_UNSPLICED" ] && ViashError Bad arguments for option \'--layer_unspliced\': \'$VIASH_PAR_LAYER_UNSPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_UNSPLICED="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_unspliced. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_unspliced=*) + [ -n "$VIASH_PAR_LAYER_UNSPLICED" ] && ViashError Bad arguments for option \'--layer_unspliced=*\': \'$VIASH_PAR_LAYER_UNSPLICED\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_UNSPLICED=$(ViashRemoveFlags "$1") + shift 1 + ;; + --layer_ambiguous) + [ -n "$VIASH_PAR_LAYER_AMBIGUOUS" ] && ViashError Bad arguments for option \'--layer_ambiguous\': \'$VIASH_PAR_LAYER_AMBIGUOUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_AMBIGUOUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --layer_ambiguous. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --layer_ambiguous=*) + [ -n "$VIASH_PAR_LAYER_AMBIGUOUS" ] && ViashError Bad arguments for option \'--layer_ambiguous=*\': \'$VIASH_PAR_LAYER_AMBIGUOUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LAYER_AMBIGUOUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_h5mu) + [ -n "$VIASH_PAR_OUTPUT_H5MU" ] && ViashError Bad arguments for option \'--output_h5mu\': \'$VIASH_PAR_OUTPUT_H5MU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_H5MU="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_h5mu. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_h5mu=*) + [ -n "$VIASH_PAR_OUTPUT_H5MU" ] && ViashError Bad arguments for option \'--output_h5mu=*\': \'$VIASH_PAR_OUTPUT_H5MU\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_H5MU=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output_compression) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_compression. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_compression=*) + [ -n "$VIASH_PAR_OUTPUT_COMPRESSION" ] && ViashError Bad arguments for option \'--output_compression=*\': \'$VIASH_PAR_OUTPUT_COMPRESSION\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_COMPRESSION=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_counts) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_counts=*) + [ -n "$VIASH_PAR_MIN_COUNTS" ] && ViashError Bad arguments for option \'--min_counts=*\': \'$VIASH_PAR_MIN_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_counts_u) + [ -n "$VIASH_PAR_MIN_COUNTS_U" ] && ViashError Bad arguments for option \'--min_counts_u\': \'$VIASH_PAR_MIN_COUNTS_U\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS_U="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_counts_u. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_counts_u=*) + [ -n "$VIASH_PAR_MIN_COUNTS_U" ] && ViashError Bad arguments for option \'--min_counts_u=*\': \'$VIASH_PAR_MIN_COUNTS_U\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_COUNTS_U=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells=*) + [ -n "$VIASH_PAR_MIN_CELLS" ] && ViashError Bad arguments for option \'--min_cells=*\': \'$VIASH_PAR_MIN_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_cells_u) + [ -n "$VIASH_PAR_MIN_CELLS_U" ] && ViashError Bad arguments for option \'--min_cells_u\': \'$VIASH_PAR_MIN_CELLS_U\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_U="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_cells_u. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_cells_u=*) + [ -n "$VIASH_PAR_MIN_CELLS_U" ] && ViashError Bad arguments for option \'--min_cells_u=*\': \'$VIASH_PAR_MIN_CELLS_U\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_CELLS_U=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_shared_counts) + [ -n "$VIASH_PAR_MIN_SHARED_COUNTS" ] && ViashError Bad arguments for option \'--min_shared_counts\': \'$VIASH_PAR_MIN_SHARED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SHARED_COUNTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_shared_counts. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_shared_counts=*) + [ -n "$VIASH_PAR_MIN_SHARED_COUNTS" ] && ViashError Bad arguments for option \'--min_shared_counts=*\': \'$VIASH_PAR_MIN_SHARED_COUNTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SHARED_COUNTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --min_shared_cells) + [ -n "$VIASH_PAR_MIN_SHARED_CELLS" ] && ViashError Bad arguments for option \'--min_shared_cells\': \'$VIASH_PAR_MIN_SHARED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SHARED_CELLS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --min_shared_cells. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --min_shared_cells=*) + [ -n "$VIASH_PAR_MIN_SHARED_CELLS" ] && ViashError Bad arguments for option \'--min_shared_cells=*\': \'$VIASH_PAR_MIN_SHARED_CELLS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_MIN_SHARED_CELLS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_top_genes) + [ -n "$VIASH_PAR_N_TOP_GENES" ] && ViashError Bad arguments for option \'--n_top_genes\': \'$VIASH_PAR_N_TOP_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TOP_GENES="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_top_genes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_top_genes=*) + [ -n "$VIASH_PAR_N_TOP_GENES" ] && ViashError Bad arguments for option \'--n_top_genes=*\': \'$VIASH_PAR_N_TOP_GENES\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_TOP_GENES=$(ViashRemoveFlags "$1") + shift 1 + ;; + --log_transform) + [ -n "$VIASH_PAR_LOG_TRANSFORM" ] && ViashError Bad arguments for option \'--log_transform\': \'$VIASH_PAR_LOG_TRANSFORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TRANSFORM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --log_transform. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --log_transform=*) + [ -n "$VIASH_PAR_LOG_TRANSFORM" ] && ViashError Bad arguments for option \'--log_transform=*\': \'$VIASH_PAR_LOG_TRANSFORM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOG_TRANSFORM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_principal_components) + [ -n "$VIASH_PAR_N_PRINCIPAL_COMPONENTS" ] && ViashError Bad arguments for option \'--n_principal_components\': \'$VIASH_PAR_N_PRINCIPAL_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PRINCIPAL_COMPONENTS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_principal_components. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_principal_components=*) + [ -n "$VIASH_PAR_N_PRINCIPAL_COMPONENTS" ] && ViashError Bad arguments for option \'--n_principal_components=*\': \'$VIASH_PAR_N_PRINCIPAL_COMPONENTS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_PRINCIPAL_COMPONENTS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --n_neighbors) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --n_neighbors. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --n_neighbors=*) + [ -n "$VIASH_PAR_N_NEIGHBORS" ] && ViashError Bad arguments for option \'--n_neighbors=*\': \'$VIASH_PAR_N_NEIGHBORS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_N_NEIGHBORS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/velocity/scvelo:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MODALITY+x} ]; then + ViashError '--modality' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_H5MU+x} ]; then + ViashError '--output_h5mu' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then + VIASH_PAR_LAYER_SPLICED="spliced" +fi +if [ -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then + VIASH_PAR_LAYER_UNSPLICED="unspliced" +fi +if [ -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then + VIASH_PAR_LAYER_AMBIGUOUS="ambiguous" +fi +if [ -z ${VIASH_PAR_LOG_TRANSFORM+x} ]; then + VIASH_PAR_LOG_TRANSFORM="true" +fi +if [ -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then + VIASH_PAR_N_NEIGHBORS="30" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_MIN_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_COUNTS_U" ]]; then + if ! [[ "$VIASH_PAR_MIN_COUNTS_U" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_counts_u' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_CELLS_U" ]]; then + if ! [[ "$VIASH_PAR_MIN_CELLS_U" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_cells_u' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SHARED_COUNTS" ]]; then + if ! [[ "$VIASH_PAR_MIN_SHARED_COUNTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_shared_counts' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_MIN_SHARED_CELLS" ]]; then + if ! [[ "$VIASH_PAR_MIN_SHARED_CELLS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--min_shared_cells' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_TOP_GENES" ]]; then + if ! [[ "$VIASH_PAR_N_TOP_GENES" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_top_genes' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_LOG_TRANSFORM" ]]; then + if ! [[ "$VIASH_PAR_LOG_TRANSFORM" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--log_transform' has to be a boolean. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_PRINCIPAL_COMPONENTS" ]]; then + if ! [[ "$VIASH_PAR_N_PRINCIPAL_COMPONENTS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_principal_components' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_N_NEIGHBORS" ]]; then + if ! [[ "$VIASH_PAR_N_NEIGHBORS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--n_neighbors' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_OUTPUT_COMPRESSION" ]; then + VIASH_PAR_OUTPUT_COMPRESSION_CHOICES=("gzip;lzf") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_OUTPUT_COMPRESSION_CHOICES[*]};" =~ ";$VIASH_PAR_OUTPUT_COMPRESSION;" ]]; then + ViashError '--output_compression' specified value of \'$VIASH_PAR_OUTPUT_COMPRESSION\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_OUTPUT_H5MU" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_H5MU")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_H5MU")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_OUTPUT_H5MU" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_H5MU")" ) + VIASH_PAR_OUTPUT_H5MU=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_H5MU") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_H5MU" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-scvelo-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import sys +import mudata +import anndata +import tempfile +import shutil +from contextlib import redirect_stdout +from pathlib import Path +import matplotlib as mpl +import scvelo + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'counts_layer': $( if [ ! -z ${VIASH_PAR_COUNTS_LAYER+x} ]; then echo "r'${VIASH_PAR_COUNTS_LAYER//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_spliced': $( if [ ! -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_SPLICED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_unspliced': $( if [ ! -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_UNSPLICED//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'layer_ambiguous': $( if [ ! -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then echo "r'${VIASH_PAR_LAYER_AMBIGUOUS//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_h5mu': $( if [ ! -z ${VIASH_PAR_OUTPUT_H5MU+x} ]; then echo "r'${VIASH_PAR_OUTPUT_H5MU//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_counts_u': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS_U+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS_U//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_cells_u': $( if [ ! -z ${VIASH_PAR_MIN_CELLS_U+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS_U//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_shared_counts': $( if [ ! -z ${VIASH_PAR_MIN_SHARED_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_SHARED_COUNTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'min_shared_cells': $( if [ ! -z ${VIASH_PAR_MIN_SHARED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_SHARED_CELLS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_top_genes': $( if [ ! -z ${VIASH_PAR_N_TOP_GENES+x} ]; then echo "int(r'${VIASH_PAR_N_TOP_GENES//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'log_transform': $( if [ ! -z ${VIASH_PAR_LOG_TRANSFORM+x} ]; then echo "r'${VIASH_PAR_LOG_TRANSFORM//\'/\'\"\'\"r\'}'.lower() == 'true'"; else echo None; fi ), + 'n_principal_components': $( if [ ! -z ${VIASH_PAR_N_PRINCIPAL_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_N_PRINCIPAL_COMPONENTS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + +mpl.rcParams["savefig.dpi"] = 150 + + +# Script must be wrapped into a main function because scvelo spawn subprocesses +# and this fails when the functions are not wrapped. +def main(): + # Create output directory + output_dir = Path(par["output"]) + output_dir.mkdir(parents=True, exist_ok=True) + scvelo.settings.figdir = str(output_dir) + + # Load the input data + adata_in = mudata.read_h5ad(par["input"], mod=par["modality"]) + + # Create a copy of the data as input + # as many scvelo functions do not take input layer arguments + layers_mapping = { + "spliced": par["layer_spliced"], + "unspliced": par["layer_unspliced"], + "ambiguous": par["layer_ambiguous"], + } + layer_data = { + default: (adata_in.layers.get(arg_val) if arg_val else adata_in.layers[default]) + for default, arg_val in layers_mapping.items() + } + adata = anndata.AnnData( + X=adata_in.X + if not par["counts_layer"] + else adata_in.layers[par["counts_layer"]], + layers=layer_data, + ) + + # Calculate the sample name + sample_name = par["output"].removesuffix(".h5mu") + sample_name = Path(sample_name).name + + # Read the input data + + # Save spliced vs unspliced proportions to file + with (output_dir / "proportions.txt").open("w") as target: + with redirect_stdout(target): + scvelo.utils.show_proportions(adata) + + # Plot piecharts of spliced vs unspliced proportions + scvelo.pl.proportions(adata, save=True, show=False) + + # Perform preprocessing + scvelo.pp.filter_and_normalize( + adata, + min_counts=par["min_counts"], + min_counts_u=par["min_counts_u"], + min_cells=par["min_cells"], + min_cells_u=par["min_cells_u"], + min_shared_counts=par["min_shared_counts"], + min_shared_cells=par["min_shared_cells"], + n_top_genes=par["n_top_genes"], + log=par["log_transform"], + ) + + # Fitting + scvelo.pp.moments( + adata, n_pcs=par["n_principal_components"], n_neighbors=par["n_neighbors"] + ) + + # Second step in velocyto calculations + # Velocity calculation and visualization + # From the scvelo manual: + # The solution to the full dynamical model is obtained by setting mode='dynamical', + # which requires to run scv.tl.recover_dynamics(adata) beforehand + scvelo.tl.recover_dynamics(adata) + scvelo.tl.velocity(adata, mode="dynamical") + scvelo.tl.velocity_graph(adata) + scvelo.pl.velocity_graph( + adata, save=str(output_dir / "scvelo_graph.pdf"), show=False + ) + + # Plotting + # TODO: add more here. + scvelo.pl.velocity_embedding_stream( + adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False + ) + + # Copy over slots to output + for slot in ("obs", "var"): + setattr( + adata_in, + slot, + getattr(adata_in, slot) + .assign(**getattr(adata, slot).to_dict()) + .convert_dtypes(), + ) + items_per_slot = { + "uns": ( + "recover_dynamics", + "velocity_params", + "velocity_graph", + "velocity_graph_neg", + ), + "varm": ("loss",), + "obsm": ("velocity_pca",), + "layers": ( + "Ms", + "Mu", + "fit_t", + "fit_tau", + "fit_tau_", + "velocity", + "velocity_u", + ), + } + for dict_slot, dict_items in items_per_slot.items(): + setattr( + adata_in, + dict_slot, + dict( + getattr(adata_in, dict_slot), + **{key_: getattr(adata, dict_slot)[key_] for key_ in dict_items}, + ), + ) + with tempfile.NamedTemporaryFile( + suffix=".h5mu", delete_on_close=False + ) as temp_h5mu: + shutil.copyfile(par["input"], temp_h5mu.name) + # Create output + mudata.write_h5ad(temp_h5mu.name, mod=par["modality"], data=adata_in) + compression = par["output_compression"] + + if compression: + compress_h5mu(temp_h5mu.name, par["output_h5mu"], compression=compression) + else: + shutil.move(temp_h5mu.name, par["output_h5mu"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_OUTPUT_H5MU" ]; then + VIASH_PAR_OUTPUT_H5MU=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_H5MU") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_OUTPUT_H5MU" ] && [ ! -e "$VIASH_PAR_OUTPUT_H5MU" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_H5MU' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/velocity/scvelo/setup_logger.py b/target/executable/velocity/scvelo/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/executable/velocity/scvelo/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/executable/velocity/velocyto/.config.vsh.yaml b/target/executable/velocity/velocyto/.config.vsh.yaml new file mode 100644 index 00000000..dfb6cd62 --- /dev/null +++ b/target/executable/velocity/velocyto/.config.vsh.yaml @@ -0,0 +1,290 @@ +name: "velocyto" +namespace: "velocity" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to BAM file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome" + alternatives: + - "-t" + description: "Path to GTF file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode" + alternatives: + - "-b" + description: "Valid barcodes file, to filter the bam. If --bcfile is not specified\ + \ all the cell barcodes will be included.\nCell barcodes should be specified\ + \ in the bcfile as the 'CB' tag for each read\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--without_umi" + description: "foo" + info: null + direction: "input" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Velocyto loom file" + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--logic" + alternatives: + - "-l" + description: "The logic to use for the filtering." + info: null + default: + - "Default" + required: false + choices: + - "Default" + - "Permissive10X" + - "Intermediate10X" + - "ValidatedIntrons10X" + - "Stricter10X" + - "ObservedSpanning10X" + - "Discordant10X" + - "SmartSeq2" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Runs the velocity analysis on a BAM file, outputting a loom file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +- type: "file" + path: "rna_velocity" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim-bookworm" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + - "file" + interactive: false + - type: "python" + user: false + pip: + - "numpy<2" + - "Cython" + upgrade: true + - type: "python" + user: false + pip: + - "velocyto" + upgrade: true + - type: "apt" + packages: + - "samtools" + interactive: false + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/velocyto/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/velocity/velocyto" + executable: "target/executable/velocity/velocyto/velocyto" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/executable/velocity/velocyto/nextflow_labels.config b/target/executable/velocity/velocyto/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/executable/velocity/velocyto/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/executable/velocity/velocyto/velocyto b/target/executable/velocity/velocyto/velocyto new file mode 100755 index 00000000..a4592f33 --- /dev/null +++ b/target/executable/velocity/velocyto/velocyto @@ -0,0 +1,1300 @@ +#!/usr/bin/env bash + +# velocyto main +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Robrecht Cannoodt (maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="velocyto" +VIASH_META_FUNCTIONALITY_NAME="velocyto" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.10-slim-bookworm +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps build-essential file && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "numpy<2" "Cython" + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "velocyto" + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y samtools && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.authors="Robrecht Cannoodt" +LABEL org.opencontainers.image.description="Companion container for running component velocity velocyto" +LABEL org.opencontainers.image.created="2025-08-22T07:23:07Z" +LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline" +LABEL org.opencontainers.image.revision="173327cc5670aa8bd5cf473827de80b602c90092" +LABEL org.opencontainers.image.version="main" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "velocyto main" + echo "" + echo "Runs the velocity analysis on a BAM file, outputting a loom file." + echo "" + echo "Arguments:" + echo " -i, --input" + echo " type: file, required parameter, file must exist" + echo " Path to BAM file" + echo "" + echo " -t, --transcriptome" + echo " type: file, required parameter, file must exist" + echo " Path to GTF file" + echo "" + echo " -b, --barcode" + echo " type: file, file must exist" + echo " Valid barcodes file, to filter the bam. If --bcfile is not specified all" + echo " the cell barcodes will be included." + echo " Cell barcodes should be specified in the bcfile as the 'CB' tag for each" + echo " read" + echo "" + echo " --without_umi" + echo " type: boolean_true" + echo " foo" + echo "" + echo " -o, --output" + echo " type: file, required parameter, output, file must exist" + echo " Velocyto loom file" + echo "" + echo " -l, --logic" + echo " type: string" + echo " default: Default" + echo " choices: [ Default, Permissive10X, Intermediate10X, ValidatedIntrons10X," + echo "Stricter10X, ObservedSpanning10X, Discordant10X, SmartSeq2 ]" + echo " The logic to use for the filtering." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "velocyto main" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -i) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'-i\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -i. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome) + [ -n "$VIASH_PAR_TRANSCRIPTOME" ] && ViashError Bad arguments for option \'--transcriptome\': \'$VIASH_PAR_TRANSCRIPTOME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --transcriptome. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --transcriptome=*) + [ -n "$VIASH_PAR_TRANSCRIPTOME" ] && ViashError Bad arguments for option \'--transcriptome=*\': \'$VIASH_PAR_TRANSCRIPTOME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME=$(ViashRemoveFlags "$1") + shift 1 + ;; + -t) + [ -n "$VIASH_PAR_TRANSCRIPTOME" ] && ViashError Bad arguments for option \'-t\': \'$VIASH_PAR_TRANSCRIPTOME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_TRANSCRIPTOME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -t. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode) + [ -n "$VIASH_PAR_BARCODE" ] && ViashError Bad arguments for option \'--barcode\': \'$VIASH_PAR_BARCODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode=*) + [ -n "$VIASH_PAR_BARCODE" ] && ViashError Bad arguments for option \'--barcode=*\': \'$VIASH_PAR_BARCODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + -b) + [ -n "$VIASH_PAR_BARCODE" ] && ViashError Bad arguments for option \'-b\': \'$VIASH_PAR_BARCODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -b. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --without_umi) + [ -n "$VIASH_PAR_WITHOUT_UMI" ] && ViashError Bad arguments for option \'--without_umi\': \'$VIASH_PAR_WITHOUT_UMI\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WITHOUT_UMI=true + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + -o) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'-o\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -o. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --logic) + [ -n "$VIASH_PAR_LOGIC" ] && ViashError Bad arguments for option \'--logic\': \'$VIASH_PAR_LOGIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOGIC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --logic. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --logic=*) + [ -n "$VIASH_PAR_LOGIC" ] && ViashError Bad arguments for option \'--logic=*\': \'$VIASH_PAR_LOGIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOGIC=$(ViashRemoveFlags "$1") + shift 1 + ;; + -l) + [ -n "$VIASH_PAR_LOGIC" ] && ViashError Bad arguments for option \'-l\': \'$VIASH_PAR_LOGIC\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LOGIC="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to -l. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/openpipeline/velocity/velocyto:main' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_TRANSCRIPTOME+x} ]; then + ViashError '--transcriptome' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_WITHOUT_UMI+x} ]; then + VIASH_PAR_WITHOUT_UMI="false" +fi +if [ -z ${VIASH_PAR_LOGIC+x} ]; then + VIASH_PAR_LOGIC="Default" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME" ] && [ ! -e "$VIASH_PAR_TRANSCRIPTOME" ]; then + ViashError "Input file '$VIASH_PAR_TRANSCRIPTOME' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BARCODE" ] && [ ! -e "$VIASH_PAR_BARCODE" ]; then + ViashError "Input file '$VIASH_PAR_BARCODE' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_WITHOUT_UMI" ]]; then + if ! [[ "$VIASH_PAR_WITHOUT_UMI" =~ ^(true|True|TRUE|false|False|FALSE|yes|Yes|YES|no|No|NO)$ ]]; then + ViashError '--without_umi' has to be a boolean_true. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# check whether value is belongs to a set of choices +if [ ! -z "$VIASH_PAR_LOGIC" ]; then + VIASH_PAR_LOGIC_CHOICES=("Default;Permissive10X;Intermediate10X;ValidatedIntrons10X;Stricter10X;ObservedSpanning10X;Discordant10X;SmartSeq2") + IFS=';' + set -f + if ! [[ ";${VIASH_PAR_LOGIC_CHOICES[*]};" =~ ";$VIASH_PAR_LOGIC;" ]]; then + ViashError '--logic' specified value of \'$VIASH_PAR_LOGIC\' is not in the list of allowed values. Use "--help" to get more information on the parameters. + exit 1 + fi + set +f + unset IFS +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_TRANSCRIPTOME" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_TRANSCRIPTOME")" ) + VIASH_PAR_TRANSCRIPTOME=$(ViashDockerAutodetectMount "$VIASH_PAR_TRANSCRIPTOME") +fi +if [ ! -z "$VIASH_PAR_BARCODE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODE")" ) + VIASH_PAR_BARCODE=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODE") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-velocyto-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME}" | sed "s#'#'\"'\"'#g;s#.*#par_transcriptome='&'#" ; else echo "# par_transcriptome="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE+x} ]; then echo "${VIASH_PAR_BARCODE}" | sed "s#'#'\"'\"'#g;s#.*#par_barcode='&'#" ; else echo "# par_barcode="; fi ) +$( if [ ! -z ${VIASH_PAR_WITHOUT_UMI+x} ]; then echo "${VIASH_PAR_WITHOUT_UMI}" | sed "s#'#'\"'\"'#g;s#.*#par_without_umi='&'#" ; else echo "# par_without_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_LOGIC+x} ]; then echo "${VIASH_PAR_LOGIC}" | sed "s#'#'\"'\"'#g;s#.*#par_logic='&'#" ; else echo "# par_logic="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=( ) + +if [ ! -z "\$par_barcode" ]; then + extra_params+=( "--bcfile=\$par_barcode" ) +fi +if [ "\$par_without_umi" == "true" ]; then + extra_params+=( "--without-umi" ) +fi +if [ ! -z "\$meta_cpus" ]; then + extra_params+=( "--samtools-threads" "\$meta_cpus" ) +fi +if [ ! -z "\$meta_memory_mb" ]; then + extra_params+=( "--samtools-memory" "\$meta_memory_mb" ) +fi + +output_dir=\`dirname "\$par_output"\` +sample_id=\`basename "\$par_output" .loom\` + +if (file \`readlink -f "\$par_transcriptome"\` | grep -q compressed ) ; then + # create temporary directory + tmpdir=\$(mktemp -d "\$meta_temp_dir/\$meta_name-XXXXXXXX") + function clean_up { + rm -rf "\$tmpdir" + } + trap clean_up EXIT + + zcat "\$par_transcriptome" > "\$tmpdir/genes.gtf" + par_transcriptome="\$tmpdir/genes.gtf" +fi + +velocyto run \\ + "\$par_input" \\ + "\$par_transcriptome" \\ + "\${extra_params[@]}" \\ + --outputfolder "\$output_dir" \\ + --sampleid "\$sample_id" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_TRANSCRIPTOME" ]; then + VIASH_PAR_TRANSCRIPTOME=$(ViashDockerStripAutomount "$VIASH_PAR_TRANSCRIPTOME") + fi + if [ ! -z "$VIASH_PAR_BARCODE" ]; then + VIASH_PAR_BARCODE=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODE") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/nextflow/annotate/celltypist/.config.vsh.yaml b/target/nextflow/annotate/celltypist/.config.vsh.yaml new file mode 100644 index 00000000..67d659b0 --- /dev/null +++ b/target/nextflow/annotate/celltypist/.config.vsh.yaml @@ -0,0 +1,471 @@ +name: "celltypist" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data containing log normalized counts to\ + \ be used for cell type annotation if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + default: + - "cell_ontology_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--feature_selection" + description: "Whether to perform feature selection." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--majority_voting" + description: "Whether to refine the predicted labels by running the majority voting\ + \ classifier after over-clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--C" + description: "Inverse of regularization strength in logistic regression." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations before reaching the minimum of the\ + \ cost function." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--use_SGD" + description: "Whether to use the stochastic gradient descent algorithm." + info: null + direction: "input" + - type: "double" + name: "--min_prop" + description: "\"For the dominant cell type within a subcluster, the minimum proportion\ + \ of cells required to \nsupport naming of the subcluster by this cell type.\ + \ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\ + \ this proportion threshold will be assigned 'Heterogeneous'.\"\n" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "celltypist_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "celltypist_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of logistic regression classifiers optimised by the stochastic gradient descent\ + \ algorithm." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + upgrade: true + - type: "python" + user: false + packages: + - "celltypist==1.6.3" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/celltypist/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/celltypist" + executable: "target/nextflow/annotate/celltypist/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/celltypist/cross_check_genes.py b/target/nextflow/annotate/celltypist/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/nextflow/annotate/celltypist/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/nextflow/annotate/celltypist/main.nf b/target/nextflow/annotate/celltypist/main.nf new file mode 100644 index 00000000..eeefb4e8 --- /dev/null +++ b/target/nextflow/annotate/celltypist/main.nf @@ -0,0 +1,4298 @@ +// celltypist main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "celltypist", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The name of the adata obs column in the reference data containing cell type annotations.", + "default" : [ + "cell_ontology_class" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model arguments", + "description" : "Model arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided.", + "example" : [ + "pretrained_model.pkl" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--feature_selection", + "description" : "Whether to perform feature selection.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--majority_voting", + "description" : "Whether to refine the predicted labels by running the majority voting classifier after over-clustering.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--C", + "description" : "Inverse of regularization strength in logistic regression.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_iter", + "description" : "Maximum number of iterations before reaching the minimum of the cost function.", + "default" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--use_SGD", + "description" : "Whether to use the stochastic gradient descent algorithm.", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--min_prop", + "description" : "\\"For the dominant cell type within a subcluster, the minimum proportion of cells required to \nsupport naming of the subcluster by this cell type. Ignored if majority_voting is set to False. \nSubcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.\\"\n", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted information.\n", + "default" : [ + "celltypist_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\n", + "default" : [ + "celltypist_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/cross_check_genes.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy~=1.10.4" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "celltypist==1.6.3" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/celltypist/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/celltypist", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import celltypist +import mudata as mu +import anndata as ad +import pandas as pd +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'feature_selection': $( if [ ! -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then echo "r'${VIASH_PAR_FEATURE_SELECTION//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'majority_voting': $( if [ ! -z ${VIASH_PAR_MAJORITY_VOTING+x} ]; then echo "r'${VIASH_PAR_MAJORITY_VOTING//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'C': $( if [ ! -z ${VIASH_PAR_C+x} ]; then echo "float(r'${VIASH_PAR_C//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'use_SGD': $( if [ ! -z ${VIASH_PAR_USE_SGD+x} ]; then echo "r'${VIASH_PAR_USE_SGD//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'min_prop': $( if [ ! -z ${VIASH_PAR_MIN_PROP+x} ]; then echo "float(r'${VIASH_PAR_MIN_PROP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def check_celltypist_format(indata): + if np.abs(np.expm1(indata[0]).sum() - 10000) > 1: + return False + return True + + +def main(par): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Provide correct format of query data for celltypist annotation + ## Sanitize gene names and set as index + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + ## Fetch lognormalized counts + lognorm_counts = ( + input_modality.layers[par["input_layer"]].copy() + if par["input_layer"] + else input_modality.X.copy() + ) + ## Create AnnData object + input_modality = ad.AnnData( + X=lognorm_counts, var=pd.DataFrame(index=input_modality.var.index) + ) + + if par["model"]: + logger.info("Loading CellTypist model") + model = celltypist.models.Model.load(par["model"]) + cross_check_genes( + input_modality.var.index, + model.features, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + elif par["reference"]: + reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]] + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + # CellTypist requires query gene names to be in index + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # Ensure enough overlap between genes in query and reference + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + + labels = reference_modality.obs[par["reference_obs_target"]] + + logger.info("Training CellTypist model on reference") + model = celltypist.train( + reference_matrix, + labels=labels, + genes=reference_modality.var.index, + C=par["C"], + max_iter=par["max_iter"], + use_SGD=par["use_SGD"], + feature_selection=par["feature_selection"], + check_expression=True, + ) + + logger.info("Predicting CellTypist annotations") + predictions = celltypist.annotate( + input_modality, model, majority_voting=par["majority_voting"] + ) + + input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[ + "predicted_labels" + ].values + input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max( + axis=1 + ).values + + # copy observations back to input data (with full set of features) + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/celltypist", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/celltypist/nextflow.config b/target/nextflow/annotate/celltypist/nextflow.config new file mode 100644 index 00000000..52b0c603 --- /dev/null +++ b/target/nextflow/annotate/celltypist/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/celltypist' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.' + author = 'Jakub Majercik, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/celltypist/nextflow_labels.config b/target/nextflow/annotate/celltypist/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/celltypist/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/celltypist/nextflow_schema.json b/target/nextflow/annotate/celltypist/nextflow_schema.json new file mode 100644 index 00000000..39845870 --- /dev/null +++ b/target/nextflow/annotate/celltypist/nextflow_schema.json @@ -0,0 +1,191 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "celltypist", + "description": "Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "In which `.obs` slots to store the predicted information.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"celltypist_pred\"`. ", + "default": "celltypist_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which `.obs` slots to store the probability of the predictions.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"celltypist_probability\"`. ", + "default": "celltypist_probability" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data to be used for cell type annotation if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The name of the adata obs column in the reference data containing cell type annotations.", + "help_text": "Type: `string`, multiple: `False`, default: `\"cell_ontology_class\"`. ", + "default": "cell_ontology_class" + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "model arguments": { + "title": "Model arguments", + "type": "object", + "description": "Model arguments.", + "properties": { + "model": { + "type": "string", + "format": "path", + "description": "Pretrained model in pkl format", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"pretrained_model.pkl\"`. " + }, + "feature_selection": { + "type": "boolean", + "description": "Whether to perform feature selection.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "majority_voting": { + "type": "boolean", + "description": "Whether to refine the predicted labels by running the majority voting classifier after over-clustering.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "C": { + "type": "number", + "description": "Inverse of regularization strength in logistic regression.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "max_iter": { + "type": "integer", + "description": "Maximum number of iterations before reaching the minimum of the cost function.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000`. ", + "default": 1000 + }, + "use_SGD": { + "type": "boolean", + "description": "Whether to use the stochastic gradient descent algorithm.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "min_prop": { + "type": "number", + "description": "\"For the dominant cell type within a subcluster, the minimum proportion of cells required to \nsupport naming of the subcluster by this cell type", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/model arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/celltypist/set_var_index.py b/target/nextflow/annotate/celltypist/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/annotate/celltypist/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/annotate/celltypist/setup_logger.py b/target/nextflow/annotate/celltypist/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/celltypist/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/celltypist/subset_vars.py b/target/nextflow/annotate/celltypist/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/annotate/celltypist/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/annotate/onclass/.config.vsh.yaml b/target/nextflow/annotate/onclass/.config.vsh.yaml new file mode 100644 index 00000000..b8057103 --- /dev/null +++ b/target/nextflow/annotate/onclass/.config.vsh.yaml @@ -0,0 +1,442 @@ +name: "onclass" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Ontology" + description: "Ontology input files" + arguments: + - type: "file" + name: "--cl_nlp_emb_file" + description: "The .nlp.emb file with the cell type embeddings." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cl_ontology_file" + description: "The .ontology file with the cell type ontology." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cl_obo_file" + description: "The .obo file with the cell type ontology." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + example: + - "cell_ontology_class" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unknown_celltype" + description: "Label for unknown cell types.\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "onclass_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "onclass_prob" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments" + arguments: + - type: "string" + name: "--model" + description: "\"Pretrained model path without a file extension. If not provided,\ + \ the model will be trained \non the reference data and --reference should be\ + \ provided. The path namespace should contain:\n - a .npz or .pkl file\n -\ + \ a .data file\n - a .meta file\n - a .index file\ne.g. /path/to/model/pretrained_model_target1\ + \ as saved by OnClass.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations for training the model." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "OnClass is a python package for single-cell cell type annotation. It\ + \ uses the Cell Ontology to capture the cell type similarity. \nThese similarities\ + \ enable OnClass to annotate cell types that are never seen in the training data.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "OnClass~=1.3" + - "tensorflow" + - "obonet" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/onclass/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/onclass" + executable: "target/nextflow/annotate/onclass/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/onclass/cross_check_genes.py b/target/nextflow/annotate/onclass/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/nextflow/annotate/onclass/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/nextflow/annotate/onclass/main.nf b/target/nextflow/annotate/onclass/main.nf new file mode 100644 index 00000000..772343f1 --- /dev/null +++ b/target/nextflow/annotate/onclass/main.nf @@ -0,0 +1,4321 @@ +// onclass main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "onclass", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Ontology", + "description" : "Ontology input files", + "arguments" : [ + { + "type" : "file", + "name" : "--cl_nlp_emb_file", + "description" : "The .nlp.emb file with the cell type embeddings.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cl_ontology_file", + "description" : "The .ontology file with the cell type ontology.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cl_obo_file", + "description" : "The .obo file with the cell type ontology.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The name of the adata obs column in the reference data containing cell type annotations.", + "example" : [ + "cell_ontology_class" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--unknown_celltype", + "description" : "Label for unknown cell types.\n", + "default" : [ + "Unknown" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted information.\n", + "default" : [ + "onclass_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\n", + "default" : [ + "onclass_prob" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model arguments", + "description" : "Model arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--model", + "description" : "\\"Pretrained model path without a file extension. If not provided, the model will be trained \non the reference data and --reference should be provided. The path namespace should contain:\n - a .npz or .pkl file\n - a .data file\n - a .meta file\n - a .index file\ne.g. /path/to/model/pretrained_model_target1 as saved by OnClass.\\"\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_iter", + "description" : "Maximum number of iterations for training the model.", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/cross_check_genes.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. \nThese similarities enable OnClass to annotate cell types that are never seen in the training data.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "OnClass~=1.3", + "tensorflow", + "obonet" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/onclass/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/onclass", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import numpy as np +from OnClass.OnClassModel import OnClassModel +import obonet +from typing import Dict, List, Tuple + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cl_nlp_emb_file': $( if [ ! -z ${VIASH_PAR_CL_NLP_EMB_FILE+x} ]; then echo "r'${VIASH_PAR_CL_NLP_EMB_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cl_ontology_file': $( if [ ! -z ${VIASH_PAR_CL_ONTOLOGY_FILE+x} ]; then echo "r'${VIASH_PAR_CL_ONTOLOGY_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cl_obo_file': $( if [ ! -z ${VIASH_PAR_CL_OBO_FILE+x} ]; then echo "r'${VIASH_PAR_CL_OBO_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'unknown_celltype': $( if [ ! -z ${VIASH_PAR_UNKNOWN_CELLTYPE+x} ]; then echo "r'${VIASH_PAR_UNKNOWN_CELLTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from set_var_index import set_var_index +from subset_vars import subset_vars + +logger = setup_logger() + + +def map_celltype_to_ontology_id( + cl_obo_file: str, +) -> Tuple[Dict[str, str], Dict[str, str]]: + """ + Map cell type names to ontology IDs and vice versa. + + Parameters + ---------- + cl_obo_file : str + Path to the cell ontology file. + + Returns + ------- + Tuple[Dict[str, str], Dict[str, str]] + A tuple of two dictionaries. The first dictionary maps cell ontology IDs to cell type names. + The second dictionary maps cell type names to cell ontology IDs. + """ + graph = obonet.read_obo(cl_obo_file) + cl_id_to_name = {id_: data.get("name") for id_, data in graph.nodes(data=True)} + cl_id_to_name = {k: v for k, v in cl_id_to_name.items() if v is not None} + name_to_cl_id = {v: k for k, v in cl_id_to_name.items()} + return cl_id_to_name, name_to_cl_id + + +def cell_type_prediction( + model: OnClassModel, + input_matrix: np.array, + input_features: List[str], + id_to_name: dict, +) -> Tuple[List[str], List[float]]: + """ + Predict cell types for input data and save results to Anndata obj. + + Parameters + ---------- + model : OnClassModel + The OnClass model. + input_matrix : np.array + The input data matrix. + input_modality : ad.AnnData + The input data Anndata object. + id_to_name : dict + Dictionary mapping cell ontology IDs to cell type names. + + Returns + ------- + predictions: List[str] + The predicted cell types. + probabilities: List[float] + Probabilities of the predicted cell types. + """ + corr_test_feature = model.ProcessTestFeature( + test_feature=input_matrix, + test_genes=input_features, + log_transform=False, + ) + onclass_pred = model.Predict( + corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0 + ) + pred_label = [model.i2co[ind] for ind in onclass_pred[2]] + pred_cell_type_label = [id_to_name[id] for id in pred_label] + prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1) + + return pred_cell_type_label, prob_cell_type_label + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + + # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + # Onclass needs dense matrix format + input_matrix = input_matrix.toarray() + + id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"]) + + if par["model"]: + logger.info("Predicting cell types using pre-trained model") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + model.BuildModel(use_pretrain=par["model"], ngene=None) + cross_check_genes( + model.genes, input_modality.var.index, par["input_reference_gene_overlap"] + ) + + elif par["reference"]: + logger.info("Reading reference data") + model = OnClassModel( + cell_type_nlp_emb_file=par["cl_nlp_emb_file"], + cell_type_network_file=par["cl_ontology_file"], + ) + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + # Onclass needs dense matrix format + reference_matrix = reference_matrix.toarray() + + logger.info("Training a model from reference...") + + labels = reference_modality.obs[par["reference_obs_target"]].tolist() + labels_cl = [ + name_to_id[label] if label in name_to_id else par["unknown_celltype"] + for label in labels + ] + + _ = model.EmbedCellTypes(labels_cl) + corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature( + train_feature=reference_matrix, + train_label=labels_cl, + train_genes=reference_modality.var.index, + test_feature=input_matrix, + test_genes=input_modality.var.index, + log_transform=False, + ) + model.BuildModel(ngene=len(corr_train_genes)) + model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"]) + + logger.info("Predicting cell types") + predictions, probabilities = cell_type_prediction( + model, input_matrix, input_modality.var.index, id_to_name + ) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/onclass", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/onclass/nextflow.config b/target/nextflow/annotate/onclass/nextflow.config new file mode 100644 index 00000000..ba2cc530 --- /dev/null +++ b/target/nextflow/annotate/onclass/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/onclass' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. \nThese similarities enable OnClass to annotate cell types that are never seen in the training data.\n' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/onclass/nextflow_labels.config b/target/nextflow/annotate/onclass/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/onclass/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/onclass/nextflow_schema.json b/target/nextflow/annotate/onclass/nextflow_schema.json new file mode 100644 index 00000000..9055ffe9 --- /dev/null +++ b/target/nextflow/annotate/onclass/nextflow_schema.json @@ -0,0 +1,196 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "onclass", + "description": "OnClass is a python package for single-cell cell type annotation. It uses the Cell Ontology to capture the cell type similarity. \nThese similarities enable OnClass to annotate cell types that are never seen in the training data.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "In which `.obs` slots to store the predicted information.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"onclass_pred\"`. ", + "default": "onclass_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which `.obs` slots to store the probability of the predictions.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"onclass_prob\"`. ", + "default": "onclass_prob" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "ontology": { + "title": "Ontology", + "type": "object", + "description": "Ontology input files", + "properties": { + "cl_nlp_emb_file": { + "type": "string", + "format": "path", + "exists": true, + "description": "The .nlp.emb file with the cell type embeddings.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "cl_ontology_file": { + "type": "string", + "format": "path", + "exists": true, + "description": "The .ontology file with the cell type ontology.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "cl_obo_file": { + "type": "string", + "format": "path", + "exists": true, + "description": "The .obo file with the cell type ontology.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The name of the adata obs column in the reference data containing cell type annotations.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"cell_ontology_class\"`. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "unknown_celltype": { + "type": "string", + "description": "Label for unknown cell types.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"Unknown\"`. ", + "default": "Unknown" + } + } + }, + "model arguments": { + "title": "Model arguments", + "type": "object", + "description": "Model arguments", + "properties": { + "model": { + "type": "string", + "description": "\"Pretrained model path without a file extension", + "help_text": "Type: `string`, multiple: `False`. " + }, + "max_iter": { + "type": "integer", + "description": "Maximum number of iterations for training the model.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/ontology" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/model arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/onclass/set_var_index.py b/target/nextflow/annotate/onclass/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/annotate/onclass/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/annotate/onclass/setup_logger.py b/target/nextflow/annotate/onclass/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/onclass/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/onclass/subset_vars.py b/target/nextflow/annotate/onclass/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/annotate/onclass/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/annotate/popv/.config.vsh.yaml b/target/nextflow/annotate/popv/.config.vsh.yaml new file mode 100644 index 00000000..26b042df --- /dev/null +++ b/target/nextflow/annotate/popv/.config.vsh.yaml @@ -0,0 +1,408 @@ +name: "popv" +namespace: "annotate" +version: "main" +authors: +- name: "Matthias Beyens" + roles: + - "author" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input (aka query) dataset." + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Which layer to use. If no value is provided, the counts are assumed\ + \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "Key in obs field of input adata for batch information. If no value\ + \ is provided, batch label is assumed to be unknown." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_subset" + description: "Subset the input object with this column." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "Key in obs field of input adata for label information. This is only\ + \ used for training scANVI. Unlabelled cells should be set to `\"unknown_celltype_label\"\ + `." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unknown_celltype_label" + description: "If `input_obs_label` is specified, cells with this value will be\ + \ treated as unknown and will be predicted by the model." + info: null + default: + - "unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "User-provided reference tissue. The data that will be used as reference\ + \ to call cell types." + info: null + example: + - "TS_Bladder_filtered.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "Which layer to use. If no value is provided, the counts are assumed\ + \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_label" + description: "Key in obs field of reference AnnData with cell-type information." + info: null + default: + - "cell_ontology_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch" + description: "Key in obs field of input adata for batch information." + info: null + default: + - "donor_assay" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Other arguments." + arguments: + - type: "string" + name: "--methods" + description: "Methods to call cell types. By default, runs to knn_on_scvi and\ + \ scanvi." + info: null + example: + - "knn_on_scvi" + - "scanvi" + required: true + choices: + - "celltypist" + - "knn_on_bbknn" + - "knn_on_scanorama" + - "knn_on_scvi" + - "onclass" + - "rf" + - "scanvi" + - "svm" + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs popular major vote cell typing on single cell sequence data\ + \ using multiple algorithms. Note that this is a one-shot version of PopV." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "CFLAGS=\"-mno-avx512f -mno-avx2\"" + - "CPPFLAGS=\"-mno-avx512f -mno-avx2\"" + - type: "apt" + packages: + - "procps" + - "git" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "popv~=0.4.2" + - "numpy<2" + - "setuptools" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "docker" + run: + - "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/popv/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/popv" + executable: "target/nextflow/annotate/popv/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/popv/main.nf b/target/nextflow/annotate/popv/main.nf new file mode 100644 index 00000000..c38ffb81 --- /dev/null +++ b/target/nextflow/annotate/popv/main.nf @@ -0,0 +1,4299 @@ +// popv main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Matthias Beyens (author) +// * Robrecht Cannoodt (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "popv", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Matthias Beyens", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "MatthiasBeyens", + "orcid" : "0000-0003-3304-0706", + "email" : "matthias.beyens@gmail.com", + "linkedin" : "mbeyens" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Arguments related to the input (aka query) dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch", + "description" : "Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_subset", + "description" : "Subset the input object with this column.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_label", + "description" : "Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `\\"unknown_celltype_label\\"`.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--unknown_celltype_label", + "description" : "If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.", + "default" : [ + "unknown" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "User-provided reference tissue. The data that will be used as reference to call cell types.", + "example" : [ + "TS_Bladder_filtered.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_label", + "description" : "Key in obs field of reference AnnData with cell-type information.", + "default" : [ + "cell_ontology_class" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_batch", + "description" : "Key in obs field of input adata for batch information.", + "default" : [ + "donor_assay" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "description" : "Other arguments.", + "arguments" : [ + { + "type" : "string", + "name" : "--methods", + "description" : "Methods to call cell types. By default, runs to knn_on_scvi and scanvi.", + "example" : [ + "knn_on_scvi", + "scanvi" + ], + "required" : true, + "choices" : [ + "celltypist", + "knn_on_bbknn", + "knn_on_scanorama", + "knn_on_scvi", + "onclass", + "rf", + "scanvi", + "svm" + ], + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "env" : [ + "CFLAGS=\\"-mno-avx512f -mno-avx2\\"", + "CPPFLAGS=\\"-mno-avx512f -mno-avx2\\"" + ] + }, + { + "type" : "apt", + "packages" : [ + "procps", + "git", + "build-essential" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "popv~=0.4.2", + "numpy<2", + "setuptools" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "docker", + "run" : [ + "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git\n" + ] + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/popv/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/popv", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import re +import tempfile +import typing +import numpy as np +import mudata as mu +import anndata as ad +import popv + +# todo: is this still needed? +from torch.cuda import is_available as cuda_is_available + +try: + from torch.backends.mps import is_available as mps_is_available +except ModuleNotFoundError: + # Older pytorch versions + # MacOS GPUs + def mps_is_available(): + return False + + +# where to find the obo files +cl_obo_folder = "/opt/PopV/resources/ontology/" + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_subset': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_SUBSET+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_SUBSET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'unknown_celltype_label': $( if [ ! -z ${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL+x} ]; then echo "r'${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_label': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_batch': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'methods': $( if [ ! -z ${VIASH_PAR_METHODS+x} ]; then echo "r'${VIASH_PAR_METHODS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +use_gpu = cuda_is_available() +logger.info("GPU enabled? %s", use_gpu) + + +# Helper functions +def get_X( + adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str] +): + """Fetch the counts data from X or a layer. Subset columns by var_index if so desired.""" + if var_index: + adata = adata[:, var_index] + if layer: + return adata.layers[layer] + else: + return adata.X + + +def get_obs(adata: ad.AnnData, obs_par_names): + """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch.""" + obs_columns = [par[x] for x in obs_par_names if par[x]] + return adata.obs[obs_columns] + + +def get_var(adata: ad.AnnData, var_index: list[str]): + """Fetch the var dataframe. Subset rows by var_index if so desired.""" + return adata.var.loc[var_index] + + +def main(par, meta): + assert len(par["methods"]) >= 1, ( + "Please, specify at least one method for cell typing." + ) + logger.info("Cell typing methods: {}".format(par["methods"])) + + ### PREPROCESSING REFERENCE ### + logger.info("### PREPROCESSING REFERENCE ###") + + # take a look at reference data + logger.info("Reading reference data '%s'", par["reference"]) + reference = ad.read_h5ad(par["reference"]) + + logger.info("Setting reference var index to Ensembl IDs") + reference.var["gene_symbol"] = list(reference.var.index) + reference.var.index = [ + re.sub("\\\\\\\\.[0-9]+\\$", "", s) for s in reference.var["ensemblid"] + ] + + logger.info("Detect number of samples per label") + min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size()) + n_samples_per_label = np.max((min_celltype_size, 100)) + + ### PREPROCESSING INPUT ### + logger.info("### PREPROCESSING INPUT ###") + logger.info("Reading '%s'", par["input"]) + input = mu.read_h5mu(par["input"]) + input_modality = input.mod[par["modality"]] + + # subset with var column + if par["input_var_subset"]: + logger.info("Subset input with .var['%s']", par["input_var_subset"]) + assert par["input_var_subset"] in input_modality.var, ( + f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var" + ) + input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]] + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + logger.info("Detecting common vars based on ensembl ids") + common_ens_ids = list( + set(reference.var.index).intersection(set(input_modality.var.index)) + ) + + logger.info(" reference n_vars: %i", reference.n_vars) + logger.info(" input n_vars: %i", input_modality.n_vars) + logger.info(" intersect n_vars: %i", len(common_ens_ids)) + assert len(common_ens_ids) >= 100, "The intersection of genes is too small." + + # subset input objects to make sure popv is using the data we expect + input_modality = ad.AnnData( + X=get_X(input_modality, par["input_layer"], common_ens_ids), + obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]), + var=get_var(input_modality, common_ens_ids), + ) + reference = ad.AnnData( + X=get_X(reference, par["reference_layer"], common_ens_ids), + obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]), + var=get_var(reference, common_ens_ids), + ) + + # remove layers that + + ### ALIGN REFERENCE AND INPUT ### + logger.info("### ALIGN REFERENCE AND INPUT ###") + + with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir: + logger.info("Run PopV processing") + pq = popv.preprocessing.Process_Query( + # input + query_adata=input_modality, + query_labels_key=par["input_obs_label"], + query_batch_key=par["input_obs_batch"], + query_layers_key=None, # this is taken care of by subset + # reference + ref_adata=reference, + ref_labels_key=par["reference_obs_label"], + ref_batch_key=par["reference_obs_batch"], + # options + unknown_celltype_label=par["unknown_celltype_label"], + n_samples_per_label=n_samples_per_label, + # pretrained model + # Might need to be parameterized at some point + prediction_mode="retrain", + pretrained_scvi_path=None, + # outputs + # Might need to be parameterized at some point + save_path_trained_models=temp_dir, + # hardcoded values + cl_obo_folder=cl_obo_folder, + accelerator="cuda" if use_gpu else "cpu", + ) + method_kwargs = {} + if "scanorama" in par["methods"]: + method_kwargs["scanorama"] = {"approx": False} + logger.info("Annotate data") + popv.annotation.annotate_data( + adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs + ) + + popv_input = pq.adata[input_modality.obs_names] + + # select columns starting with "popv_" + popv_obs_cols = popv_input.obs.columns[ + popv_input.obs.columns.str.startswith("popv_") + ] + + # create new data frame with selected columns + df_popv = popv_input.obs[popv_obs_cols] + + # remove prefix from column names + df_popv.columns = df_popv.columns.str.replace("popv_", "") + + # store output in mudata .obsm + input.mod[par["modality"]].obsm["popv_output"] = df_popv + + # copy important output in mudata .obs + for col in ["popv_prediction"]: + if col in popv_input.obs.columns: + input.mod[par["modality"]].obs[col] = popv_input.obs[col] + + # code to explore how the output differs from the original + # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]: + # old_keys = set(getattr(pq_adata_orig, attr).keys()) + # new_keys = set(getattr(pq.adata, attr).keys()) + # diff_keys = list(new_keys.difference(old_keys)) + # diff_keys.sort() + # print(f"{attr}:", flush=True) + # for key in diff_keys: + # print(f" {key}", flush=True) + + # write output + logger.info("Writing %s", par["output"]) + input.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/popv", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/popv/nextflow.config b/target/nextflow/annotate/popv/nextflow.config new file mode 100644 index 00000000..0780f728 --- /dev/null +++ b/target/nextflow/annotate/popv/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/popv' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV.' + author = 'Matthias Beyens, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/popv/nextflow_labels.config b/target/nextflow/annotate/popv/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/popv/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/popv/nextflow_schema.json b/target/nextflow/annotate/popv/nextflow_schema.json new file mode 100644 index 00000000..b52aee96 --- /dev/null +++ b/target/nextflow/annotate/popv/nextflow_schema.json @@ -0,0 +1,152 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "popv", + "description": "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Arguments related to the input (aka query) dataset.", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Which layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_batch": { + "type": "string", + "description": "Key in obs field of input adata for batch information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_subset": { + "type": "string", + "description": "Subset the input object with this column.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_label": { + "type": "string", + "description": "Key in obs field of input adata for label information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "unknown_celltype_label": { + "type": "string", + "description": "If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.", + "help_text": "Type: `string`, multiple: `False`, default: `\"unknown\"`. ", + "default": "unknown" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "Other arguments.", + "properties": { + "methods": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Methods to call cell types", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"knn_on_scvi\";\"scanvi\"]`, choices: ``celltypist`, `knn_on_bbknn`, `knn_on_scanorama`, `knn_on_scvi`, `onclass`, `rf`, `scanvi`, `svm``. " + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "User-provided reference tissue", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"TS_Bladder_filtered.h5ad\"`. " + }, + "reference_layer": { + "type": "string", + "description": "Which layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_label": { + "type": "string", + "description": "Key in obs field of reference AnnData with cell-type information.", + "help_text": "Type: `string`, multiple: `False`, default: `\"cell_ontology_class\"`. ", + "default": "cell_ontology_class" + }, + "reference_obs_batch": { + "type": "string", + "description": "Key in obs field of input adata for batch information.", + "help_text": "Type: `string`, multiple: `False`, default: `\"donor_assay\"`. ", + "default": "donor_assay" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/popv/setup_logger.py b/target/nextflow/annotate/popv/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/popv/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/random_forest_annotation/.config.vsh.yaml b/target/nextflow/annotate/random_forest_annotation/.config.vsh.yaml new file mode 100644 index 00000000..acfa3eae --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/.config.vsh.yaml @@ -0,0 +1,457 @@ +name: "random_forest_annotation" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "Key in obs field of reference modality with cell-type information." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "random_forest_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "random_forest_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_estimators" + description: "Number of trees in the random forest." + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_depth" + description: "Maximum depth of the trees in the random forest. \nIf not provided,\ + \ the nodes are expanded until all leaves only contain a single sample.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--criterion" + description: "The function to measure the quality of a split." + info: null + default: + - "gini" + required: false + choices: + - "gini" + - "entropy" + - "log_loss" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--class_weight" + description: "Weights associated with classes.\nThe `balanced` mode uses the values\ + \ of y to automatically adjust weights inversely proportional to class frequencies\ + \ in the input data.\nThe `balanced_subsample` mode is the same as `balanced`\ + \ except that weights are computed based on the bootstrap sample for every tree\ + \ grown.\nThe `uniform` mode gives all classes a weight of one.\n" + info: null + default: + - "balanced_subsample" + required: false + choices: + - "balanced" + - "balanced_subsample" + - "uniform" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--max_features" + description: "The number of features to consider when looking for the best split.\ + \ The value can either be a positive integer or one of `sqrt`, `log2` or `all`.\n\ + If integer: consider max_features features at each split.\nIf `sqrt`: max_features\ + \ is the squareroot of all input features.\nIf `log2`: max_features is the log2\ + \ of all input features.\nIf `all`: max features equals all input features.\n" + info: null + default: + - "200" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of random forest." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scikit-learn==1.4.2" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/random_forest_annotation/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/random_forest_annotation" + executable: "target/nextflow/annotate/random_forest_annotation/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/random_forest_annotation/cross_check_genes.py b/target/nextflow/annotate/random_forest_annotation/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/nextflow/annotate/random_forest_annotation/main.nf b/target/nextflow/annotate/random_forest_annotation/main.nf new file mode 100644 index 00000000..5cddd38b --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/main.nf @@ -0,0 +1,4281 @@ +// random_forest_annotation main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "random_forest_annotation", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "Key in obs field of reference modality with cell-type information.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted information.\n", + "default" : [ + "random_forest_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\n", + "default" : [ + "random_forest_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model arguments", + "description" : "Model arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided.", + "example" : [ + "pretrained_model.pkl" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_estimators", + "description" : "Number of trees in the random forest.", + "default" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_depth", + "description" : "Maximum depth of the trees in the random forest. \nIf not provided, the nodes are expanded until all leaves only contain a single sample.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--criterion", + "description" : "The function to measure the quality of a split.", + "default" : [ + "gini" + ], + "required" : false, + "choices" : [ + "gini", + "entropy", + "log_loss" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--class_weight", + "description" : "Weights associated with classes.\nThe `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.\nThe `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.\nThe `uniform` mode gives all classes a weight of one.\n", + "default" : [ + "balanced_subsample" + ], + "required" : false, + "choices" : [ + "balanced", + "balanced_subsample", + "uniform" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--max_features", + "description" : "The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.\nIf integer: consider max_features features at each split.\nIf `sqrt`: max_features is the squareroot of all input features.\nIf `log2`: max_features is the log2 of all input features.\nIf `all`: max features equals all input features.\n", + "default" : [ + "200" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/cross_check_genes.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scikit-learn==1.4.2" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/random_forest_annotation/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/random_forest_annotation", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import numpy as np +from sklearn.ensemble import RandomForestClassifier +import pickle + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_estimators': $( if [ ! -z ${VIASH_PAR_N_ESTIMATORS+x} ]; then echo "int(r'${VIASH_PAR_N_ESTIMATORS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_depth': $( if [ ! -z ${VIASH_PAR_MAX_DEPTH+x} ]; then echo "int(r'${VIASH_PAR_MAX_DEPTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'criterion': $( if [ ! -z ${VIASH_PAR_CRITERION+x} ]; then echo "r'${VIASH_PAR_CRITERION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'class_weight': $( if [ ! -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then echo "r'${VIASH_PAR_CLASS_WEIGHT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'max_features': $( if [ ! -z ${VIASH_PAR_MAX_FEATURES+x} ]; then echo "r'${VIASH_PAR_MAX_FEATURES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + # Handle max_features parameter + max_features_conversion = { + "all": None, + "sqrt": "sqrt", + "log2": "log2", + } + try: + max_features = max_features_conversion.get( + par["max_features"], int(par["max_features"]) + ) + except ValueError: + raise ValueError( + f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'" + ) + + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = RandomForestClassifier( + n_estimators=par["n_estimators"], + criterion=par["criterion"], + max_depth=par["max_depth"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + max_features=max_features, + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + input_adata.obs[par["output_obs_predictions"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + + logger.info("Writing output data") + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/random_forest_annotation", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/random_forest_annotation/nextflow.config b/target/nextflow/annotate/random_forest_annotation/nextflow.config new file mode 100644 index 00000000..d98e24c7 --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/random_forest_annotation' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/random_forest_annotation/nextflow_labels.config b/target/nextflow/annotate/random_forest_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/random_forest_annotation/nextflow_schema.json b/target/nextflow/annotate/random_forest_annotation/nextflow_schema.json new file mode 100644 index 00000000..f3d6c41d --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/nextflow_schema.json @@ -0,0 +1,193 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "random_forest_annotation", + "description": "Automated cell type annotation tool for scRNA-seq datasets on the basis of random forest.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "In which `.obs` slots to store the predicted information.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"random_forest_pred\"`. ", + "default": "random_forest_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which `.obs` slots to store the probability of the predictions.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"random_forest_probability\"`. ", + "default": "random_forest_probability" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data to be used for cell type annotation if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "Key in obs field of reference modality with cell-type information.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "model arguments": { + "title": "Model arguments", + "type": "object", + "description": "Model arguments.", + "properties": { + "model": { + "type": "string", + "format": "path", + "description": "Pretrained model in pkl format", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"pretrained_model.pkl\"`. " + }, + "n_estimators": { + "type": "integer", + "description": "Number of trees in the random forest.", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "max_depth": { + "type": "integer", + "description": "Maximum depth of the trees in the random forest", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "criterion": { + "type": "string", + "description": "The function to measure the quality of a split.", + "help_text": "Type: `string`, multiple: `False`, default: `\"gini\"`, choices: ``gini`, `entropy`, `log_loss``. ", + "enum": [ + "gini", + "entropy", + "log_loss" + ], + "default": "gini" + }, + "class_weight": { + "type": "string", + "description": "Weights associated with classes.\nThe `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.\nThe `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.\nThe `uniform` mode gives all classes a weight of one.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"balanced_subsample\"`, choices: ``balanced`, `balanced_subsample`, `uniform``. ", + "enum": [ + "balanced", + "balanced_subsample", + "uniform" + ], + "default": "balanced_subsample" + }, + "max_features": { + "type": "string", + "description": "The number of features to consider when looking for the best split", + "help_text": "Type: `string`, multiple: `False`, default: `\"200\"`. ", + "default": "200" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/model arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/random_forest_annotation/set_var_index.py b/target/nextflow/annotate/random_forest_annotation/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/annotate/random_forest_annotation/setup_logger.py b/target/nextflow/annotate/random_forest_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/random_forest_annotation/subset_vars.py b/target/nextflow/annotate/random_forest_annotation/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/annotate/random_forest_annotation/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/annotate/scanvi/.config.vsh.yaml b/target/nextflow/annotate/scanvi/.config.vsh.yaml new file mode 100644 index 00000000..53a7adac --- /dev/null +++ b/target/nextflow/annotate/scanvi/.config.vsh.yaml @@ -0,0 +1,480 @@ +name: "scanvi" +namespace: "annotate" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file. Note that this needs to be the exact same dataset\ + \ as the --scvi_model was trained on." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes that were used to train\ + \ the scVi model. By default, do not subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_labels" + description: ".obs field containing the labels" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unlabeled_category" + description: "Value in the --obs_labels field that indicates unlabeled observations\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scVI Model" + arguments: + - type: "file" + name: "--scvi_model" + description: "Pretrained SCVI reference model to initialize the SCANVI model with." + info: null + example: + - "scvi_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Folder where the state of the trained model will be saved to." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scanvi_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_predictions" + description: "In which .obs slot to store the predicted labels." + info: null + default: + - "scanvi_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_probabilities" + description: "In which. obs slot to store the probabilities of the predicted labels." + info: null + default: + - "scanvi_proba" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scANVI training arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "scANVI () is a semi-supervised model for single-cell transcriptomics\ + \ data. scANVI is an scVI extension that can leverage the cell type knowledge for\ + \ a subset of the cells present in the data sets to infer the states of the rest\ + \ of the cells.\nThis component will instantiate a scANVI model from a pre-trained\ + \ scVI model, integrate the data and perform label prediction.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scvi_model" +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "jax[cuda]" + - "scvi-tools~=1.3.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/scanvi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/scanvi" + executable: "target/nextflow/annotate/scanvi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/scanvi/compress_h5mu.py b/target/nextflow/annotate/scanvi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/annotate/scanvi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/annotate/scanvi/main.nf b/target/nextflow/annotate/scanvi/main.nf new file mode 100644 index 00000000..ed2f1c0f --- /dev/null +++ b/target/nextflow/annotate/scanvi/main.nf @@ -0,0 +1,4284 @@ +// scanvi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer) +// * Jakub Majercik (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scanvi", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file. Note that this needs to be the exact same dataset as the --scvi_model was trained on.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. If None, X is used", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : ".var column containing highly variable genes that were used to train the scVi model. By default, do not subset genes.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : ".var column containing gene names. By default, use the index.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_labels", + "description" : ".obs field containing the labels", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--unlabeled_category", + "description" : "Value in the --obs_labels field that indicates unlabeled observations\n", + "default" : [ + "Unknown" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "scVI Model", + "arguments" : [ + { + "type" : "file", + "name" : "--scvi_model", + "description" : "Pretrained SCVI reference model to initialize the SCANVI model with.", + "example" : [ + "scvi_model.pt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_model", + "description" : "Folder where the state of the trained model will be saved to.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_scanvi_integrated" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_output_predictions", + "description" : "In which .obs slot to store the predicted labels.", + "default" : [ + "scanvi_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_output_probabilities", + "description" : "In which. obs slot to store the probabilities of the predicted labels.", + "default" : [ + "scanvi_proba" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "scANVI training arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.\nThis component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/scvi_model/" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "midmem", + "gpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:25.05-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "jax[cuda]", + "scvi-tools~=1.3.1" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/scanvi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/scanvi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata +import scvi +import numpy as np + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_labels': $( if [ ! -z ${VIASH_PAR_OBS_LABELS+x} ]; then echo "r'${VIASH_PAR_OBS_LABELS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'unlabeled_category': $( if [ ! -z ${VIASH_PAR_UNLABELED_CATEGORY+x} ]; then echo "r'${VIASH_PAR_UNLABELED_CATEGORY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'scvi_model': $( if [ ! -z ${VIASH_PAR_SCVI_MODEL+x} ]; then echo "r'${VIASH_PAR_SCVI_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_model': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_output_predictions': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_output_probabilities': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PROBABILITIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input data...") + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}") + scvi_model = scvi.model.SCVI.load( + par["scvi_model"], + adata_subset, + accelerator="auto", + device="auto", + ) + + logger.info("Instantiating scANVI model from scVI model...") + vae_uns = scvi.model.SCANVI.from_scvi_model( + scvi_model, + unlabeled_category=par["unlabeled_category"], + labels_key=par["obs_labels"], + adata=adata_subset, + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + logger.info("Training scANVI model...") + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + logger.info("Performing scANVI integration...") + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + logger.info("Performing scANVI prediction...") + adata.obs[par["obs_output_predictions"]] = vae_uns.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_uns.predict(soft=True), axis=1 + ) + + logger.info("Writing output data...") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/scanvi", + "tag" : "main" + }, + "label" : [ + "midcpu", + "midmem", + "gpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/scanvi/nextflow.config b/target/nextflow/annotate/scanvi/nextflow.config new file mode 100644 index 00000000..33b32b33 --- /dev/null +++ b/target/nextflow/annotate/scanvi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/scanvi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.\nThis component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.\n' + author = 'Dorien Roosen, Jakub Majercik, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/scanvi/nextflow_labels.config b/target/nextflow/annotate/scanvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/scanvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/scanvi/nextflow_schema.json b/target/nextflow/annotate/scanvi/nextflow_schema.json new file mode 100644 index 00000000..3f3266a3 --- /dev/null +++ b/target/nextflow/annotate/scanvi/nextflow_schema.json @@ -0,0 +1,203 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scanvi", + "description": "scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.\nThis component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": ".var column containing highly variable genes that were used to train the scVi model", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_gene_names": { + "type": "string", + "description": ".var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_labels": { + "type": "string", + "description": ".obs field containing the labels", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "unlabeled_category": { + "type": "string", + "description": "Value in the --obs_labels field that indicates unlabeled observations\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"Unknown\"`. ", + "default": "Unknown" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_model": { + "type": "string", + "format": "path", + "description": "Folder where the state of the trained model will be saved to.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_model\"`, direction: `output`. ", + "default": "$id.$key.output_model" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scanvi_integrated\"`. ", + "default": "X_scanvi_integrated" + }, + "obs_output_predictions": { + "type": "string", + "description": "In which .obs slot to store the predicted labels.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_pred\"`. ", + "default": "scanvi_pred" + }, + "obs_output_probabilities": { + "type": "string", + "description": "In which", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_proba\"`. ", + "default": "scanvi_proba" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "scvi model": { + "title": "scVI Model", + "type": "object", + "description": "No description", + "properties": { + "scvi_model": { + "type": "string", + "format": "path", + "exists": true, + "description": "Pretrained SCVI reference model to initialize the SCANVI model with.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"scvi_model.pt\"`. " + } + } + }, + "scanvi training arguments": { + "title": "scANVI training arguments", + "type": "object", + "description": "No description", + "properties": { + "early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/scvi model" + }, + { + "$ref": "#/$defs/scanvi training arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/scanvi/set_var_index.py b/target/nextflow/annotate/scanvi/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/annotate/scanvi/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/annotate/scanvi/setup_logger.py b/target/nextflow/annotate/scanvi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/scanvi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/scanvi/subset_vars.py b/target/nextflow/annotate/scanvi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/annotate/scanvi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/annotate/singler/.config.vsh.yaml b/target/nextflow/annotate/singler/.config.vsh.yaml new file mode 100644 index 00000000..5875c7d0 --- /dev/null +++ b/target/nextflow/annotate/singler/.config.vsh.yaml @@ -0,0 +1,519 @@ +name: "singler" +namespace: "annotate" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data containing log normalized counts to\ + \ be used for cell type annotation if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata .var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_clusters" + description: "The name of the adata .obs column containing cluster identities\ + \ of the observations. \nIf provided, annoation is performed on the aggregated\ + \ cluster profiles, \notherwise it defaults to annotation per observation.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data containing lognormalized couns to\ + \ be used for cell type annotation if .X is not to be used. Data are expected\ + \ to be processed in the same way as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The name of the adata obs column in the reference data containing\ + \ cell type annotations." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing a boolean mask corresponding to genes to\ + \ be used for marker selection. By default, do not subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Arguments related to the training of and classification with the SingleR\ + \ model" + arguments: + - type: "integer" + name: "--de_n_genes" + description: "The number of differentially expressed genes across labels to be\ + \ calculated from the reference.\nDefaults to 500 * (2/3) ^ log2(N) where N\ + \ is the number of unique labels when if `--de_method` is set to `classic`,\n\ + otherwise, defaults to 10.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--de_method" + description: "Method to detect differentially expressed genes between pairs of\ + \ labels." + info: null + default: + - "classic" + required: false + choices: + - "classic" + - "t" + - "wilcox" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--quantile" + description: "The quantile of the correlation distribution to use to compute the\ + \ score per label." + info: null + default: + - 0.8 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--fine_tune" + description: "Whether finetuning should be performed to improve the resolution.\ + \ \nIf set to True, an additional finetuning step is performed after initial\ + \ classification, \nnew marker genes are calculated based on all cells with\ + \ a score higher then the maximum score minus `--fine_tuning_thershold`,\nand\ + \ the calculation of the scores is repeated.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fine_tuning_threshold" + description: "The maximum difference from the maximum correlation to use in fine-tuning\n" + info: null + default: + - 0.05 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--prune" + description: "Whether label pruning should be performed. If set to True, an additional\ + \ output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,\ + \ containing labels where 'low-quality' labels are replaced with NA's. Labels\ + \ are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`)\ + \ fall more than 3 median absolute deviations below the median for that label\ + \ type." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slot to store the predicted labels. If `--fine_tune\ + \ False`, this is based only on the maximum entry in `--output_obsm_scores`.\n" + info: null + default: + - "singler_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predicted\ + \ labels.\n" + info: null + default: + - "singler_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_delta_next" + description: "In which `.obs` slot to store the delta between the best and next-best\ + \ score. If `--fine_tune True`, this is reported for scores after fine-tuning.\n" + info: null + default: + - "singler_delta_next" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_pruned_predictions" + description: "In which `.obs` slot to store the pruned labels, where low-quality\ + \ labels are replaced with NA's. Only added if `--prune True`.\n" + info: null + default: + - "singler_pruned_labels" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_scores" + description: "In which `.obsm` slot to store the matrix of prediction correlations\ + \ at the specified quantile for each label (column) in each cell (row).\n" + info: null + default: + - "singler_scores" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "SingleR performs reference-based cell type annotation for single-cell\ + \ RNA-seq data \nby computing Spearman correlations between test cells and reference\ + \ samples with known labels, \nusing marker genes to assign the most similar cell\ + \ type label to each new cell.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "r" + cran: + - "anndata" + - "reticulate" + - "SingleR" + bioc: + - "scrapper" + - "bit64" + bioc_force_install: false + warnings_as_errors: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/singler/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/singler" + executable: "target/nextflow/annotate/singler/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/singler/main.nf b/target/nextflow/annotate/singler/main.nf new file mode 100644 index 00000000..28ce9c4b --- /dev/null +++ b/target/nextflow/annotate/singler/main.nf @@ -0,0 +1,4379 @@ +// singler main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "singler", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata .var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_clusters", + "description" : "The name of the adata .obs column containing cluster identities of the observations. \nIf provided, annoation is performed on the aggregated cluster profiles, \notherwise it defaults to annotation per observation.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data containing lognormalized couns to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The name of the adata obs column in the reference data containing cell type annotations.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_input", + "description" : ".var column containing a boolean mask corresponding to genes to be used for marker selection. By default, do not subset genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "description" : "Arguments related to the training of and classification with the SingleR model", + "arguments" : [ + { + "type" : "integer", + "name" : "--de_n_genes", + "description" : "The number of differentially expressed genes across labels to be calculated from the reference.\nDefaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels when if `--de_method` is set to `classic`,\notherwise, defaults to 10.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--de_method", + "description" : "Method to detect differentially expressed genes between pairs of labels.", + "default" : [ + "classic" + ], + "required" : false, + "choices" : [ + "classic", + "t", + "wilcox" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--quantile", + "description" : "The quantile of the correlation distribution to use to compute the score per label.", + "default" : [ + 0.8 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--fine_tune", + "description" : "Whether finetuning should be performed to improve the resolution. \nIf set to True, an additional finetuning step is performed after initial classification, \nnew marker genes are calculated based on all cells with a score higher then the maximum score minus `--fine_tuning_thershold`,\nand the calculation of the scores is repeated.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--fine_tuning_threshold", + "description" : "The maximum difference from the maximum correlation to use in fine-tuning\n", + "default" : [ + 0.05 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--prune", + "description" : "Whether label pruning should be performed. If set to True, an additional output .obs field `--output_obs_pruned_predictions` will be added to the `--output`, containing labels where 'low-quality' labels are replaced with NA's. Labels are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`) fall more than 3 median absolute deviations below the median for that label type.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slot to store the predicted labels. If `--fine_tune False`, this is based only on the maximum entry in `--output_obsm_scores`.\n", + "default" : [ + "singler_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predicted labels.\n", + "default" : [ + "singler_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_delta_next", + "description" : "In which `.obs` slot to store the delta between the best and next-best score. If `--fine_tune True`, this is reported for scores after fine-tuning.\n", + "default" : [ + "singler_delta_next" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_pruned_predictions", + "description" : "In which `.obs` slot to store the pruned labels, where low-quality labels are replaced with NA's. Only added if `--prune True`.\n", + "default" : [ + "singler_pruned_labels" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_scores", + "description" : "In which `.obsm` slot to store the matrix of prediction correlations at the specified quantile for each label (column) in each cell (row).\n", + "default" : [ + "singler_scores" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "SingleR performs reference-based cell type annotation for single-cell RNA-seq data \nby computing Spearman correlations between test cells and reference samples with known labels, \nusing marker genes to assign the most similar cell type label to each new cell.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "env" : [ + "RETICULATE_PYTHON=/usr/bin/python" + ] + }, + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "python3", + "python3-pip", + "python3-dev", + "python-is-python3" + ], + "interactive" : false + }, + { + "type" : "r", + "cran" : [ + "anndata", + "reticulate", + "SingleR" + ], + "bioc" : [ + "scrapper", + "bit64" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/singler/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/singler", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(SingleR) +library(Matrix) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_layer" = $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_LAYER" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_var_gene_names" = $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_VAR_GENE_NAMES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_obs_clusters" = $( if [ ! -z ${VIASH_PAR_INPUT_OBS_CLUSTERS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_OBS_CLUSTERS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "input_reference_gene_overlap" = $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "reference" = $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_layer" = $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_LAYER" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_obs_target" = $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_OBS_TARGET" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_var_gene_names" = $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_VAR_GENE_NAMES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "reference_var_input" = $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_REFERENCE_VAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "de_n_genes" = $( if [ ! -z ${VIASH_PAR_DE_N_GENES+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_DE_N_GENES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "de_method" = $( if [ ! -z ${VIASH_PAR_DE_METHOD+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_DE_METHOD" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "quantile" = $( if [ ! -z ${VIASH_PAR_QUANTILE+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_QUANTILE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "fine_tune" = $( if [ ! -z ${VIASH_PAR_FINE_TUNE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_FINE_TUNE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ), + "fine_tuning_threshold" = $( if [ ! -z ${VIASH_PAR_FINE_TUNING_THRESHOLD+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_FINE_TUNING_THRESHOLD" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "prune" = $( if [ ! -z ${VIASH_PAR_PRUNE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_PRUNE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_predictions" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PREDICTIONS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_probability" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PROBABILITY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_delta_next" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_DELTA_NEXT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_DELTA_NEXT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obs_pruned_predictions" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBS_PRUNED_PREDICTIONS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_obsm_scores" = $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_SCORES+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_OBSM_SCORES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_compression" = $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_COMPRESSION" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + +get_layer <- function(adata, layer, var_gene_names) { + # find data + data <- + if (is.null(layer)) { + adata\\$X + } else { + adata\\$layers[[layer]] + } + + # check if data is available + if (is.null(data)) { + if (is.null(layer)) { + stop("No layer specified and no .X slot available in the AnnData object.") + } else { + stop( + "Requested layer '", + layer, + "' is not available in the AnnData object. Available layers: ", + paste(names(adata\\$layers), collapse = ", ") + ) + } + } + + # Set matrix dimnames + input_gene_names <- sanitize_gene_names(adata, var_gene_names) + dimnames(data) <- list(adata\\$obs_names, input_gene_names) + + # return output + data +} + +sanitize_gene_names <- function(adata, gene_symbol = NULL) { + if (is.null(gene_symbol)) { + gene_names <- adata\\$var_names + } else { + gene_names <- adata\\$var[[gene_symbol]] + } + # Remove version numbers (dot followed by digits at end of string) + sanitized <- gsub("\\\\\\\\.[0-9]+\\$", "", gene_names) + sanitized +} + +subset_vars <- function(adata, subset_col) { + # Check if column exists + if (!subset_col %in% colnames(adata\\$var)) { + stop( + "Requested to use .var column '", + subset_col, + "' as a selection of genes, but the column is not available." + ) + } + + # Get the column + subset_values <- adata\\$var[[subset_col]] + + # Check for NA values + if (any(is.na(subset_values))) { + stop( + "The .var column \\`", + subset_col, + "\\` contains NA values. Can not subset data." + ) + } + + # Check if it's logical/boolean + if (!is.logical(subset_values)) { + stop( + "Expected dtype of .var column '", + subset_col, + "' to be \\`logical\\`, but found ", + class(subset_values)[1], + ". Can not subset data." + ) + } + + # Subset and return copy + adata_subset <- adata[, subset_values]\\$copy() + adata_subset +} + +# Read input data +cat("Reading input file\\\\n") +input_mdata <- mudata\\$read_h5mu(par\\$input) +input_adata <- input_mdata\\$mod[[par\\$modality]] +input_matrix <- Matrix::t( + get_layer(input_adata, par\\$input_layer, par\\$input_var_gene_names) +) + +# Read reference +cat("Reading reference file\\\\n") +ref_adata <- mudata\\$read_h5ad(par\\$reference, mod = "rna") +if (!is.null(par\\$reference_var_input)) { + ref_adata <- subset_vars(ref_adata, par\\$reference_var_input) +} +ref_matrix <- Matrix::t( + get_layer(ref_adata, par\\$reference_layer, par\\$reference_var_gene_names) +) + +# Check overlap genes +cat("Checking overlap of genes between input and reference file\\\\n") +common_ens_ids <- intersect(rownames(ref_matrix), rownames(input_matrix)) +if (length(common_ens_ids) < par\\$input_reference_gene_overlap) { + stop( + "The intersection of genes between the query and reference is too small.\\\\n", + paste0("Expected overlap: ", par\\$input_reference_gene_overlap), + paste0("Detected overlap: ", length(common_ens_ids)) + ) +} + +# Calculate CPU cores +n_workers <- meta\\$cpus %||% max(1, parallel::detectCores() - 1) + +# Assign target labels +if (!par\\$reference_obs_target %in% colnames(ref_adata\\$obs)) { + stop( + "Requested to use .obs column '", + par\\$reference_obs_target, + "' as target labels, but the column is not available." + ) +} else { + target_labels <- ref_adata\\$obs[[par\\$reference_obs_target]] +} + +cat("Performing SingleR cell type prediction\\\\n") +predictions <- SingleR( + test = input_matrix, + ref = ref_matrix, + labels = target_labels, + clusters = par\\$input_obs_clusters, + genes = "de", + de.method = par\\$de_method, + de.n = par\\$de_n_genes, + quantile = par\\$quantile, + fine.tune = par\\$fine_tune, + tune.thresh = par\\$fine_tuning_threshold, + prune = par\\$prune, + num.threads = n_workers +) + +cat("Writing output data\\\\n") +# Writing output slots +input_adata\\$obs[[par\\$output_obs_predictions]] <- + predictions\\$labels +input_adata\\$obs[[par\\$output_obs_probability]] <- + apply(predictions\\$scores, 1, max) +input_adata\\$obs[[par\\$output_obs_delta_next]] <- + predictions\\$delta.next +input_adata\\$obs[[par\\$output_obs_pruned_predictions]] <- + predictions\\$pruned.labels +input_adata\\$obsm[[par\\$output_obsm_scores]] <- + predictions\\$scores +# Writing output H5MU +input_mdata\\$write(par\\$output, compression = par\\$output_compression) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/singler", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/singler/nextflow.config b/target/nextflow/annotate/singler/nextflow.config new file mode 100644 index 00000000..e34cea30 --- /dev/null +++ b/target/nextflow/annotate/singler/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/singler' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'SingleR performs reference-based cell type annotation for single-cell RNA-seq data \nby computing Spearman correlations between test cells and reference samples with known labels, \nusing marker genes to assign the most similar cell type label to each new cell.\n' + author = 'Dorien Roosen, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/singler/nextflow_labels.config b/target/nextflow/annotate/singler/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/singler/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/singler/nextflow_schema.json b/target/nextflow/annotate/singler/nextflow_schema.json new file mode 100644 index 00000000..0be08c8f --- /dev/null +++ b/target/nextflow/annotate/singler/nextflow_schema.json @@ -0,0 +1,212 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "singler", + "description": "SingleR performs reference-based cell type annotation for single-cell RNA-seq data \nby computing Spearman correlations between test cells and reference samples with known labels, \nusing marker genes to assign the most similar cell type label to each new cell.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata .var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_clusters": { + "type": "string", + "description": "The name of the adata .obs column containing cluster identities of the observations", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "In which `.obs` slot to store the predicted labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"singler_pred\"`. ", + "default": "singler_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which `.obs` slots to store the probability of the predicted labels.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"singler_probability\"`. ", + "default": "singler_probability" + }, + "output_obs_delta_next": { + "type": "string", + "description": "In which `.obs` slot to store the delta between the best and next-best score", + "help_text": "Type: `string`, multiple: `False`, default: `\"singler_delta_next\"`. ", + "default": "singler_delta_next" + }, + "output_obs_pruned_predictions": { + "type": "string", + "description": "In which `.obs` slot to store the pruned labels, where low-quality labels are replaced with NA's", + "help_text": "Type: `string`, multiple: `False`, default: `\"singler_pruned_labels\"`. ", + "default": "singler_pruned_labels" + }, + "output_obsm_scores": { + "type": "string", + "description": "In which `.obsm` slot to store the matrix of prediction correlations at the specified quantile for each label (column) in each cell (row).\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"singler_scores\"`. ", + "default": "singler_scores" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "Arguments related to the training of and classification with the SingleR model", + "properties": { + "de_n_genes": { + "type": "integer", + "description": "The number of differentially expressed genes across labels to be calculated from the reference.\nDefaults to 500 * (2/3) ^ log2(N) where N is the number of unique labels when if `--de_method` is set to `classic`,\notherwise, defaults to 10.\n", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "de_method": { + "type": "string", + "description": "Method to detect differentially expressed genes between pairs of labels.", + "help_text": "Type: `string`, multiple: `False`, default: `\"classic\"`, choices: ``classic`, `t`, `wilcox``. ", + "enum": [ + "classic", + "t", + "wilcox" + ], + "default": "classic" + }, + "quantile": { + "type": "number", + "description": "The quantile of the correlation distribution to use to compute the score per label.", + "help_text": "Type: `double`, multiple: `False`, default: `0.8`. ", + "default": 0.8 + }, + "fine_tune": { + "type": "boolean", + "description": "Whether finetuning should be performed to improve the resolution", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "fine_tuning_threshold": { + "type": "number", + "description": "The maximum difference from the maximum correlation to use in fine-tuning\n", + "help_text": "Type: `double`, multiple: `False`, default: `0.05`. ", + "default": 0.05 + }, + "prune": { + "type": "boolean", + "description": "Whether label pruning should be performed", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data containing lognormalized couns to be used for cell type annotation if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The name of the adata obs column in the reference data containing cell type annotations.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_input": { + "type": "string", + "description": ".var column containing a boolean mask corresponding to genes to be used for marker selection", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/svm_annotation/.config.vsh.yaml b/target/nextflow/annotate/svm_annotation/.config.vsh.yaml new file mode 100644 index 00000000..10407cb1 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/.config.vsh.yaml @@ -0,0 +1,440 @@ +name: "svm_annotation" +namespace: "annotate" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input data to be used for cell type annotation\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata var column in the input data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data to be used for cell type annotation\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "Key in .obs attribute of reference modality with cell-type information.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the adata var column in the reference data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_prediction" + description: "In which `.obs` slots to store the predicted information.\n" + info: null + default: + - "svm_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n" + info: null + default: + - "svm_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model arguments" + description: "Model arguments." + arguments: + - type: "file" + name: "--model" + description: "Pretrained model in pkl format. If not provided, the model will\ + \ be trained on the reference data and --reference should be provided." + info: null + example: + - "pretrained_model.pkl" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--feature_selection" + description: "Whether to perform feature selection." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "Maximum number of iterations for the SVM." + info: null + default: + - 5000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--c_reg" + description: "Regularization parameter for the SVM." + info: null + default: + - 1.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--class_weight" + description: "\"Class weights for the SVM. The `uniform` mode gives all classes\ + \ a weight of one. \nThe `balanced` mode (default) uses the values of y to\ + \ automatically adjust weights inversely \nproportional to class frequencies\ + \ in the input data as n_samples / (n_classes * np.bincount(y))\"\n" + info: null + default: + - "balanced" + required: false + choices: + - "balanced" + - "uniform" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\ + \ of SVMs." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "scikit-learn==1.5.2" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/annotate/svm_annotation/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/annotate/svm_annotation" + executable: "target/nextflow/annotate/svm_annotation/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/annotate/svm_annotation/cross_check_genes.py b/target/nextflow/annotate/svm_annotation/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/nextflow/annotate/svm_annotation/main.nf b/target/nextflow/annotate/svm_annotation/main.nf new file mode 100644 index 00000000..4c5689b8 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/main.nf @@ -0,0 +1,4251 @@ +// svm_annotation main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "svm_annotation", + "namespace" : "annotate", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "Key in .obs attribute of reference modality with cell-type information.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_prediction", + "description" : "In which `.obs` slots to store the predicted information.\n", + "default" : [ + "svm_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\n", + "default" : [ + "svm_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model arguments", + "description" : "Model arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided.", + "example" : [ + "pretrained_model.pkl" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--feature_selection", + "description" : "Whether to perform feature selection.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_iter", + "description" : "Maximum number of iterations for the SVM.", + "default" : [ + 5000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--c_reg", + "description" : "Regularization parameter for the SVM.", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--class_weight", + "description" : "\\"Class weights for the SVM. The `uniform` mode gives all classes a weight of one. \nThe `balanced` mode (default) uses the values of y to automatically adjust weights inversely \nproportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))\\"\n", + "default" : [ + "balanced" + ], + "required" : false, + "choices" : [ + "balanced", + "uniform" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/cross_check_genes.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scikit-learn==1.5.2" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/annotate/svm_annotation/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/annotate/svm_annotation", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import numpy as np +from sklearn.calibration import CalibratedClassifierCV +from sklearn import svm +import pickle + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_target': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGET+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_input': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_prediction': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'feature_selection': $( if [ ! -z ${VIASH_PAR_FEATURE_SELECTION+x} ]; then echo "r'${VIASH_PAR_FEATURE_SELECTION//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'c_reg': $( if [ ! -z ${VIASH_PAR_C_REG+x} ]; then echo "float(r'${VIASH_PAR_C_REG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'class_weight': $( if [ ! -z ${VIASH_PAR_CLASS_WEIGHT+x} ]; then echo "r'${VIASH_PAR_CLASS_WEIGHT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from cross_check_genes import cross_check_genes +from subset_vars import subset_vars +from set_var_index import set_var_index + +logger = setup_logger() + + +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) + logger.info("Reading input data") + input_mudata = mu.read_h5mu(par["input"]) + input_adata = input_mudata.mod[par["modality"]] + input_modality = input_adata.copy() + input_modality = set_var_index(input_modality, par["input_var_gene_names"]) + + if par["model"]: + logger.info("Loading a pre-trained model") + model = pickle.load(open(par["model"], "rb")) + if hasattr(model, "_feature_names_in"): + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) + if not len(common_genes) == len(model._feature_names_in): + raise ValueError("Input dataset does not contain all model features.") + input_modality = input_modality[:, common_genes] + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + else: + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) + + elif par["reference"]: + logger.info("Reading reference data") + + reference_mudata = mu.read_h5mu(par["reference"]) + reference_modality = reference_mudata.mod[par["modality"]].copy() + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) + + # subset to HVG if required + if par["reference_var_input"]: + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) + + # Query and input require the exact same features + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) + reference_modality = reference_modality[:, common_genes] + input_modality = input_modality[:, common_genes] + + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) + + logger.info("Training a model...") + labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() + model = CalibratedClassifierCV( + svm.LinearSVC( + C=par["c_reg"], + max_iter=par["max_iter"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + dual="auto", + ) + ) + model.fit(reference_matrix, labels) + model._feature_names_in = reference_modality.var.index + + logger.info("Running predictions...") + predictions = model.predict(input_matrix) + probabilities = np.max(model.predict_proba(input_matrix), axis=1) + + logger.info("Writing output data") + input_adata.obs[par["output_obs_prediction"]] = predictions + input_adata.obs[par["output_obs_probability"]] = probabilities + input_mudata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/annotate/svm_annotation", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/annotate/svm_annotation/nextflow.config b/target/nextflow/annotate/svm_annotation/nextflow.config new file mode 100644 index 00000000..fd8896e1 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'annotate/svm_annotation' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/annotate/svm_annotation/nextflow_labels.config b/target/nextflow/annotate/svm_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/annotate/svm_annotation/nextflow_schema.json b/target/nextflow/annotate/svm_annotation/nextflow_schema.json new file mode 100644 index 00000000..99d795b9 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/nextflow_schema.json @@ -0,0 +1,182 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "svm_annotation", + "description": "Automated cell type annotation tool for scRNA-seq datasets on the basis of SVMs.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input data to be used for cell type annotation if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_prediction": { + "type": "string", + "description": "In which `.obs` slots to store the predicted information.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"svm_pred\"`. ", + "default": "svm_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which `.obs` slots to store the probability of the predictions.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"svm_probability\"`. ", + "default": "svm_probability" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data to be used for cell type annotation if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "Key in .obs attribute of reference modality with cell-type information.\n", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "model arguments": { + "title": "Model arguments", + "type": "object", + "description": "Model arguments.", + "properties": { + "model": { + "type": "string", + "format": "path", + "description": "Pretrained model in pkl format", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"pretrained_model.pkl\"`. " + }, + "feature_selection": { + "type": "boolean", + "description": "Whether to perform feature selection.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "max_iter": { + "type": "integer", + "description": "Maximum number of iterations for the SVM.", + "help_text": "Type: `integer`, multiple: `False`, default: `5000`. ", + "default": 5000 + }, + "c_reg": { + "type": "number", + "description": "Regularization parameter for the SVM.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "class_weight": { + "type": "string", + "description": "\"Class weights for the SVM", + "help_text": "Type: `string`, multiple: `False`, default: `\"balanced\"`, choices: ``balanced`, `uniform``. ", + "enum": [ + "balanced", + "uniform" + ], + "default": "balanced" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/model arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/annotate/svm_annotation/set_var_index.py b/target/nextflow/annotate/svm_annotation/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/annotate/svm_annotation/setup_logger.py b/target/nextflow/annotate/svm_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/annotate/svm_annotation/subset_vars.py b/target/nextflow/annotate/svm_annotation/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/annotate/svm_annotation/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/cluster/leiden/.config.vsh.yaml b/target/nextflow/cluster/leiden/.config.vsh.yaml new file mode 100644 index 00000000..cc38529c --- /dev/null +++ b/target/nextflow/cluster/leiden/.config.vsh.yaml @@ -0,0 +1,305 @@ +name: "leiden" +namespace: "cluster" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot the neighbor connectivities can be found." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_name" + description: "Name of the .obsm key under which to add the cluster labels.\nThe\ + \ name of the columns in the matrix will correspond to the resolutions.\n" + info: null + default: + - "leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--resolution" + description: "A parameter value controlling the coarseness of the clustering.\ + \ Higher values lead to more clusters.\nMultiple values will result in clustering\ + \ being performed multiple times.\n" + info: null + default: + - 1.0 + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cluster cells using the [Leiden algorithm] [Traag18] implemented in\ + \ the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain\ + \ algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15]\ + \ [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn`\ + \ first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in\ + \ large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven\ + \ Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with\ + \ Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing\ + \ well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale\ + \ single-cell gene expression data analysis, Genome Biology. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.13-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "leidenalg~=0.10.0" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/cluster/leiden/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/cluster/leiden" + executable: "target/nextflow/cluster/leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/cluster/leiden/compress_h5mu.py b/target/nextflow/cluster/leiden/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/cluster/leiden/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/cluster/leiden/main.nf b/target/nextflow/cluster/leiden/main.nf new file mode 100644 index 00000000..f554876d --- /dev/null +++ b/target/nextflow/cluster/leiden/main.nf @@ -0,0 +1,4270 @@ +// leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "leiden", + "namespace" : "cluster", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_connectivities", + "description" : "In which .obsp slot the neighbor connectivities can be found.", + "default" : [ + "connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_name", + "description" : "Name of the .obsm key under which to add the cluster labels.\nThe name of the columns in the matrix will correspond to the resolutions.\n", + "default" : [ + "leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--resolution", + "description" : "A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters.\nMultiple values will result in clustering being performed multiple times.\n", + "default" : [ + 1.0 + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.13-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "leidenalg~=0.10.0" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/cluster/leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/cluster/leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import signal +import os +import time +import logging +import logging.handlers +import warnings +import mudata as mu +import pandas as pd +import scanpy as sc +import numpy as np +import numpy.typing as npt +import anndata as ad +from multiprocessing import managers, shared_memory, get_context +from concurrent.futures import ProcessPoolExecutor, process, as_completed +from scipy.sparse import csr_matrix +from pathlib import Path +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_name': $( if [ ! -z ${VIASH_PAR_OBSM_NAME+x} ]; then echo "r'${VIASH_PAR_OBSM_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resolution': $( if [ ! -z ${VIASH_PAR_RESOLUTION+x} ]; then echo "list(map(float, r'${VIASH_PAR_RESOLUTION//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) + +from compress_h5mu import compress_h5mu + +_shared_logger_name = "leiden" + + +# Function to check available space in /dev/shm +def get_available_shared_memory(): + shm_path = "/dev/shm" + shm_stats = os.statvfs(shm_path) + return shm_stats.f_bsize * shm_stats.f_bavail + + +class SharedNumpyMatrix: + def __init__( + self, + shared_memory: shared_memory.SharedMemory, + dtype: npt.DTypeLike, + shape: tuple[int, int], + ) -> None: + self._memory = shared_memory + self._dtype = dtype + self._shape = shape + + @classmethod + def from_numpy( + cls, memory_manager: managers.SharedMemoryManager, array: npt.ArrayLike + ): + available_shared_memory = get_available_shared_memory() + n_bytes_required = array.nbytes + if available_shared_memory < n_bytes_required: + raise ValueError( + "Not enough shared memory (/dev/shm) is available to load the data. " + f"Required amount: {n_bytes_required}, available: {available_shared_memory}." + ) + shm = memory_manager.SharedMemory(size=array.nbytes) + array_in_shared_memory = np.ndarray( + array.shape, dtype=array.dtype, buffer=shm.buf + ) + # Copy the data into shared memory + array_in_shared_memory[:] = array[:] + return cls(shm, array.dtype, array.shape) + + def to_numpy(self): + return np.ndarray(self._shape, dtype=self._dtype, buffer=self._memory.buf) + + def close(self): + self._memory.close() + + +class SharedCsrMatrix: + def __init__( + self, + data: SharedNumpyMatrix, + indices: SharedNumpyMatrix, + indptr: SharedNumpyMatrix, + shape: npt.DTypeLike, + ): + self._data = data + self._indices = indices + self._indptr = indptr + self._shape = shape + + @classmethod + def from_csr_matrix( + cls, memory_manager: managers.SharedMemoryManager, csr_matrix_obj: csr_matrix + ): + return cls( + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.data), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indices), + SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indptr), + csr_matrix_obj.shape, + ) + + def to_csr_matrix(self): + return csr_matrix( + (self._data.to_numpy(), self._indices.to_numpy(), self._indptr.to_numpy()), + shape=self._shape, + copy=False, + ) + + def close(self): + self._data.close() + self._indices.close() + self._indptr.close() + + +def create_empty_anndata_with_connectivities(connectivities, obs_names): + empty_anndata = ad.AnnData( + np.zeros((connectivities.shape[0], 1)), obs=pd.DataFrame(index=list(obs_names)) + ) + empty_anndata.obsp["connectivities"] = connectivities + return empty_anndata + + +def run_single_resolution(shared_csr_matrix, obs_names, resolution): + logger = logging.getLogger(_shared_logger_name) + logger.info( + "Process with PID '%s' for resolution '%s' started", os.getpid(), resolution + ) + try: + connectivities = shared_csr_matrix.to_csr_matrix() + adata = create_empty_anndata_with_connectivities(connectivities, obs_names) + with warnings.catch_warnings(): + # In the future, the default backend for leiden will be igraph instead of leidenalg. + warnings.simplefilter(action="ignore", category=FutureWarning) + adata_out = sc.tl.leiden( + adata, + resolution=resolution, + key_added=str(resolution), + obsp="connectivities", + copy=True, + ) + logger.info(f"Returning result for resolution {resolution}") + return adata_out.obs[str(resolution)] + finally: + obs_names.shm.close() + shared_csr_matrix.close() + + +def init_worker(parent_process_id, exit_event, log_queue, log_level): + import os + import threading + import time + + pid = os.getpid() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Initializing process %s", pid) + + def exit_if_orphaned(): + logger.info( + "Starting orphanned process checker for process %s, parent process %s.", + pid, + parent_process_id, + ) + while True: + # Check if parent process is gone + try: + # If sig is 0, then no signal is sent, but error checking is still performed; + # this can be used to check for the existence of a process ID + os.kill(parent_process_id, 0) + except ProcessLookupError: + logger.info("Parent process is gone, shutting down %s", pid) + # Kill self + os.kill(pid, signal.SIGTERM) + time.sleep(0.2) + # Parent process requested exit + try: + exit_event_set = exit_event.wait(timeout=1) + except BrokenPipeError: + logger.info( + "Checking for shutdown resulted in BrokenPipeError, " + "parent process is most likely gone. Shutting down %s", + pid, + ) + os.kill(pid, signal.SIGTERM) + else: + if exit_event_set: + logger.info("Exit event set, shutting down %s", pid) + os.kill(pid, signal.SIGTERM) + time.sleep(1) + + threading.Thread(target=exit_if_orphaned, daemon=True).start() + logger.info( + "Initialization of process %s is complete, process is now waiting for work.", + pid, + ) + + +def main(): + with managers.SyncManager() as syncm: + log_level = logging.INFO + log_format = "%(name)s:%(levelname)s:%(asctime)s: %(message)s" + formatter = logging.Formatter(log_format) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + log_queue = syncm.Queue() + log_listener = logging.handlers.QueueListener(log_queue, console_handler) + log_listener.start() + + logger = logging.getLogger(_shared_logger_name) + logger.setLevel(log_level) + handler = logging.handlers.QueueHandler(log_queue) + logger.addHandler(handler) + + logger.info("Reading %s.", par["input"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"], backed="r") + logger.info("Processing modality '%s'.", par["modality"]) + try: + connectivities = adata.obsp[par["obsp_connectivities"]] + except KeyError: + raise ValueError( + f'Could not find .obsp key "{par["obsp_connectivities"]}" ' + "in modality {par['modality']}" + ) + + # An event that, when triggered, will kill the child processes that are still running + exit_early_event = syncm.Event() + with managers.SharedMemoryManager() as smm: + # anndata converts the index to strings, so no worries that it cannot be stored in ShareableList + # because it has an unsupported dtype. It should always be string... + index_contents = adata.obs.index.to_list() + assert all([isinstance(item, str) for item in index_contents]) + obs_names = smm.ShareableList(index_contents) + + shared_csr_matrix = SharedCsrMatrix.from_csr_matrix(smm, connectivities) + results = {} + n_workers = ( + meta["cpus"] - 2 if (meta["cpus"] and (meta["cpus"] - 2) > 0) else 1 + ) + logger.info(f"Requesting {n_workers} workers") + executor = ProcessPoolExecutor( + max_workers=n_workers, + max_tasks_per_child=1, + mp_context=get_context("spawn"), + initializer=init_worker, + initargs=((os.getpid(), exit_early_event, log_queue, log_level)), + ) + pending_futures = { + executor.submit( + run_single_resolution, shared_csr_matrix, obs_names, resolution + ): resolution + for resolution in par["resolution"] + } + try: + logger.info("All futures sheduled") + for done_future in as_completed(pending_futures): + resolution = pending_futures[done_future] + data = done_future.result() + logger.info(f"Processed resolution '{resolution}'") + results[str(resolution)] = data + except process.BrokenProcessPool: + # This assumes that one of the child processses was killed by the kernel + # because the oom killer was activated. This the is the most likely scenario, + # other causes could be: + # * Subprocess terminates without raising a proper exception. + # * The code of the process handling the communication is broke (i.e. a python bug) + # * The return data could not be pickled. + logger.error("BrokenProcessPool is raised") + executor.shutdown(wait=False, cancel_futures=True) + time.sleep(3) + exit_early_event.set() + time.sleep(3) + sys.exit(137) + finally: + logger.info("Closing shared resources in main process") + shared_csr_matrix.close() + obs_names.shm.close() + logger.info("Shared resources closed") + logger.info( + "Shutting down logging queue and re-enabling logging from main thread." + ) + log_listener.enqueue_sentinel() + log_listener.stop() + stdout_handler = logging.StreamHandler() + stdout_handler.setFormatter(formatter) + logger.addHandler(stdout_handler) + logger.info("Re-enabled logging in main thread.") + logger.info("Waiting for shutdown of processes") + executor.shutdown() + logger.info("Executor shut down.") + adata.obsm[par["obsm_name"]] = pd.DataFrame(results) + + output_file = Path(par["output"]) + logger.info("Writing output to %s.", par["output"]) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if par["output_compression"] + else output_file + ) + shutil.copyfile(par["input"], output_file_uncompressed) + mu.write_h5ad( + filename=output_file_uncompressed, mod=par["modality"], data=adata + ) + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, + output_file, + compression=par["output_compression"], + ) + output_file_uncompressed.unlink() + logger.info("Finished.") + log_listener.enqueue_sentinel() + time.sleep(3) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/cluster/leiden", + "tag" : "main" + }, + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/cluster/leiden/nextflow.config b/target/nextflow/cluster/leiden/nextflow.config new file mode 100644 index 00000000..75cada35 --- /dev/null +++ b/target/nextflow/cluster/leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'cluster/leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n' + author = 'Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/cluster/leiden/nextflow_labels.config b/target/nextflow/cluster/leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/cluster/leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/cluster/leiden/nextflow_schema.json b/target/nextflow/cluster/leiden/nextflow_schema.json new file mode 100644 index 00000000..cb2d0249 --- /dev/null +++ b/target/nextflow/cluster/leiden/nextflow_schema.json @@ -0,0 +1,87 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "leiden", + "description": "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obsp_connectivities": { + "type": "string", + "description": "In which .obsp slot the neighbor connectivities can be found.", + "help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ", + "default": "connectivities" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_name": { + "type": "string", + "description": "Name of the .obsm key under which to add the cluster labels.\nThe name of the columns in the matrix will correspond to the resolutions.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"leiden\"`. ", + "default": "leiden" + }, + "resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "A parameter value controlling the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, required, default: `[1.0]`. ", + "default": [ + 1.0 + ] + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/cluster/leiden/setup_logger.py b/target/nextflow/cluster/leiden/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/cluster/leiden/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/compression/compress_h5mu/.config.vsh.yaml b/target/nextflow/compression/compress_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..bc4f3612 --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "compress_h5mu" +namespace: "compression" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "location of output file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Compress a MuData file. \n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/compression/compress_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/compression/compress_h5mu" + executable: "target/nextflow/compression/compress_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/compression/compress_h5mu/compress_h5mu.py b/target/nextflow/compression/compress_h5mu/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/compression/compress_h5mu/main.nf b/target/nextflow/compression/compress_h5mu/main.nf new file mode 100644 index 00000000..b6faefe9 --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/main.nf @@ -0,0 +1,3898 @@ +// compress_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "compress_h5mu", + "namespace" : "compression", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the input .h5mu.", + "example" : [ + "sample_path" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "location of output file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Compress a MuData file. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "run_test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/compression/compress_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/compression/compress_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu + +if __name__ == "__main__": + compress_h5mu(par["input"], par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/compression/compress_h5mu", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/compression/compress_h5mu/nextflow.config b/target/nextflow/compression/compress_h5mu/nextflow.config new file mode 100644 index 00000000..8e879b4f --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'compression/compress_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Compress a MuData file. \n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/compression/compress_h5mu/nextflow_labels.config b/target/nextflow/compression/compress_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/compression/compress_h5mu/nextflow_schema.json b/target/nextflow/compression/compress_h5mu/nextflow_schema.json new file mode 100644 index 00000000..e9466bd5 --- /dev/null +++ b/target/nextflow/compression/compress_h5mu/nextflow_schema.json @@ -0,0 +1,58 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "compress_h5mu", + "description": "Compress a MuData file. \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input .h5mu.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"sample_path\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "location of output file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/compression/tar_extract/.config.vsh.yaml b/target/nextflow/compression/tar_extract/.config.vsh.yaml new file mode 100644 index 00000000..d793bac1 --- /dev/null +++ b/target/nextflow/compression/tar_extract/.config.vsh.yaml @@ -0,0 +1,219 @@ +name: "tar_extract" +namespace: "compression" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input file" + info: null + example: + - "input.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Folder to restore file(s) to." + info: null + example: + - "output_folder" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--strip_components" + alternatives: + - "-s" + description: "Strip this amount of leading components from file names on extraction.\ + \ For example, to extract only 'myfile.txt' from an archive containing the structure\ + \ `this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--exclude" + alternatives: + - "-e" + description: "Prevents any file or member whose name matches the shell wildcard\ + \ (pattern) from being extracted." + info: null + example: + - "docs/figures" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Extract files from a tar archive" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "LICENSE" +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/compression/tar_extract/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/compression/tar_extract" + executable: "target/nextflow/compression/tar_extract/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/compression/tar_extract/main.nf b/target/nextflow/compression/tar_extract/main.nf new file mode 100644 index 00000000..9e541725 --- /dev/null +++ b/target/nextflow/compression/tar_extract/main.nf @@ -0,0 +1,3849 @@ +// tar_extract main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "tar_extract", + "namespace" : "compression", + "version" : "main", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input file", + "example" : [ + "input.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Folder to restore file(s) to.", + "example" : [ + "output_folder" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--strip_components", + "alternatives" : [ + "-s" + ], + "description" : "Strip this amount of leading components from file names on extraction. For example, to extract only 'myfile.txt' from an archive containing the structure `this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'.", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--exclude", + "alternatives" : [ + "-e" + ], + "description" : "Prevents any file or member whose name matches the shell wildcard (pattern) from being extracted.", + "example" : [ + "docs/figures" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Extract files from a tar archive", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../LICENSE" + } + ], + "status" : "deprecated", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/compression/tar_extract/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/compression/tar_extract", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/usr/bin/env bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_STRIP_COMPONENTS+x} ]; then echo "${VIASH_PAR_STRIP_COMPONENTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_strip_components='&'#" ; else echo "# par_strip_components="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE+x} ]; then echo "${VIASH_PAR_EXCLUDE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_exclude='&'#" ; else echo "# par_exclude="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=() +mkdir -p \\$par_output # Create output directory if it doesn't exist already + +if [ "\\$par_strip_components" != "" ]; then + extra_params+=("--strip-components=\\$par_strip_components") +fi + +if [ "\\$par_exclude" != "" ]; then + extra_params+=("--exclude=\\$par_exclude") +fi + +echo "Extracting \\$par_input to \\$par_output..." +echo "" +tar "\\${extra_params[@]}" -xvf "\\$par_input" -C "\\$par_output" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/compression/tar_extract", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/compression/tar_extract/nextflow.config b/target/nextflow/compression/tar_extract/nextflow.config new file mode 100644 index 00000000..41948b89 --- /dev/null +++ b/target/nextflow/compression/tar_extract/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'compression/tar_extract' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Extract files from a tar archive' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/compression/tar_extract/nextflow_labels.config b/target/nextflow/compression/tar_extract/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/compression/tar_extract/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/compression/tar_extract/nextflow_schema.json b/target/nextflow/compression/tar_extract/nextflow_schema.json new file mode 100644 index 00000000..78539f31 --- /dev/null +++ b/target/nextflow/compression/tar_extract/nextflow_schema.json @@ -0,0 +1,59 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "tar_extract", + "description": "Extract files from a tar archive", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.tar.gz\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Folder to restore file(s) to.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"output_folder\"`. ", + "default": "$id.$key.output" + }, + "strip_components": { + "type": "integer", + "description": "Strip this amount of leading components from file names on extraction", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "exclude": { + "type": "string", + "description": "Prevents any file or member whose name matches the shell wildcard (pattern) from being extracted.", + "help_text": "Type: `string`, multiple: `False`, example: `\"docs/figures\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/from_10xh5_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..10ade1ae --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/.config.vsh.yaml @@ -0,0 +1,352 @@ +name: "from_10xh5_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "A 10x h5 file as generated by Cell Ranger." + info: null + example: + - "raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_metrics_summary" + description: "A metrics summary csv file as generated by Cell Ranger." + info: null + example: + - "metrics_cellranger.h5" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: + slots: + mod: + - name: "rna" + required: true + description: "Gene expression counts." + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + - name: "prot" + required: false + description: "Protein abundancy" + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + - name: "vdj" + required: false + description: "VDJ transcript counts" + slots: + var: + - name: "gene_symbol" + type: "string" + description: "Identification of the gene." + required: true + - name: "feature_types" + type: "string" + description: "The full name of the modality." + required: true + - name: "genome" + type: "string" + description: "Reference that was used to generate the data." + required: true + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_cellranger" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--min_genes" + description: "Minimum number of counts required for a cell to pass filtering." + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts" + description: "Minimum number of genes expressed required for a cell to pass filtering." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a 10x h5 into an h5mu file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_10xh5_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_10xh5_to_h5mu" + executable: "target/nextflow/convert/from_10xh5_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/main.nf b/target/nextflow/convert/from_10xh5_to_h5mu/main.nf new file mode 100644 index 00000000..c7218017 --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/main.nf @@ -0,0 +1,4114 @@ +// from_10xh5_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_10xh5_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "A 10x h5 file as generated by Cell Ranger.", + "example" : [ + "raw_feature_bc_matrix.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_metrics_summary", + "description" : "A metrics summary csv file as generated by Cell Ranger.", + "example" : [ + "metrics_cellranger.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "info" : { + "slots" : { + "mod" : [ + { + "name" : "rna", + "required" : true, + "description" : "Gene expression counts.", + "slots" : { + "var" : [ + { + "name" : "gene_symbol", + "type" : "string", + "description" : "Identification of the gene.", + "required" : true + }, + { + "name" : "feature_types", + "type" : "string", + "description" : "The full name of the modality.", + "required" : true + }, + { + "name" : "genome", + "type" : "string", + "description" : "Reference that was used to generate the data.", + "required" : true + } + ] + } + }, + { + "name" : "prot", + "required" : false, + "description" : "Protein abundancy", + "slots" : { + "var" : [ + { + "name" : "gene_symbol", + "type" : "string", + "description" : "Identification of the gene.", + "required" : true + }, + { + "name" : "feature_types", + "type" : "string", + "description" : "The full name of the modality.", + "required" : true + }, + { + "name" : "genome", + "type" : "string", + "description" : "Reference that was used to generate the data.", + "required" : true + } + ] + } + }, + { + "name" : "vdj", + "required" : false, + "description" : "VDJ transcript counts", + "slots" : { + "var" : [ + { + "name" : "gene_symbol", + "type" : "string", + "description" : "Identification of the gene.", + "required" : true + }, + { + "name" : "feature_types", + "type" : "string", + "description" : "The full name of the modality.", + "required" : true + }, + { + "name" : "genome", + "type" : "string", + "description" : "Reference that was used to generate the data.", + "required" : true + } + ] + } + } + ] + } + }, + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_metrics", + "description" : "Name of the .uns slot under which to QC metrics (if any).", + "default" : [ + "metrics_cellranger" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_genes", + "description" : "Minimum number of counts required for a cell to pass filtering.", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of genes expressed required for a cell to pass filtering.", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts a 10x h5 into an h5mu file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_10xh5_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_10xh5_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata +import scanpy as sc +import sys +import pandas as pd + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_metrics_summary': $( if [ ! -z ${VIASH_PAR_INPUT_METRICS_SUMMARY+x} ]; then echo "r'${VIASH_PAR_INPUT_METRICS_SUMMARY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_metrics': $( if [ ! -z ${VIASH_PAR_UNS_METRICS+x} ]; then echo "r'${VIASH_PAR_UNS_METRICS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_genes': $( if [ ! -z ${VIASH_PAR_MIN_GENES+x} ]; then echo "int(r'${VIASH_PAR_MIN_GENES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_h5(par["input"], gex_only=False) + +# set the gene ids as var_names +logger.info("Renaming var columns") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# parse metrics summary file and store in .uns +if par["input_metrics_summary"] and par["uns_metrics"]: + logger.info("Reading metrics summary file '%s'", par["input_metrics_summary"]) + + def read_percentage(val): + try: + return float(val.strip("%")) / 100 + except AttributeError: + return val + + metrics_summary = pd.read_csv( + par["input_metrics_summary"], decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + logger.info("Storing metrics summary in .uns['%s']", par["uns_metrics"]) + adata.uns[par["uns_metrics"]] = metrics_summary +else: + is_none = ( + "input_metrics_summary" if not par["input_metrics_summary"] else "uns_metrics" + ) + logger.info("Not storing metrics summary because par['%s'] is None", is_none) + +# might perform basic filtering to get rid of some data +# applicable when starting from the raw counts +if par["min_genes"]: + logger.info("Filtering with min_genes=%d", par["min_genes"]) + sc.pp.filter_cells(adata, min_genes=par["min_genes"]) + +if par["min_counts"]: + logger.info("Filtering with min_counts=%d", par["min_counts"]) + sc.pp.filter_cells(adata, min_counts=par["min_counts"]) + +# generate output +logger.info("Convert to mudata") +mdata = mudata.MuData(adata) + +# override root .obs and .uns +mdata.obs = adata.obs +mdata.uns = adata.uns + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_10xh5_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/nextflow.config b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow.config new file mode 100644 index 00000000..6cbb5cd5 --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_10xh5_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts a 10x h5 into an h5mu file.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_labels.config b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_schema.json b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..088a002b --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/nextflow_schema.json @@ -0,0 +1,100 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_10xh5_to_h5mu", + "description": "Converts a 10x h5 into an h5mu file.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "A 10x h5 file as generated by Cell Ranger.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"raw_feature_bc_matrix.h5\"`. " + }, + "input_metrics_summary": { + "type": "string", + "format": "path", + "description": "A metrics summary csv file as generated by Cell Ranger.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"metrics_cellranger.h5\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "uns_metrics": { + "type": "string", + "description": "Name of the .uns slot under which to QC metrics (if any).", + "help_text": "Type: `string`, multiple: `False`, default: `\"metrics_cellranger\"`. ", + "default": "metrics_cellranger" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "min_genes": { + "type": "integer", + "description": "Minimum number of counts required for a cell to pass filtering.", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "min_counts": { + "type": "integer", + "description": "Minimum number of genes expressed required for a cell to pass filtering.", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_10xh5_to_h5mu/setup_logger.py b/target/nextflow/convert/from_10xh5_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_10xh5_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/from_10xmtx_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..7712f72a --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/.config.vsh.yaml @@ -0,0 +1,253 @@ +name: "from_10xmtx_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input mtx folder" + info: null + example: + - "input_dir_containing_gz_files" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a 10x mtx into an h5mu file.\n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_10xmtx_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_10xmtx_to_h5mu" + executable: "target/nextflow/convert/from_10xmtx_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/main.nf b/target/nextflow/convert/from_10xmtx_to_h5mu/main.nf new file mode 100644 index 00000000..111325cf --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/main.nf @@ -0,0 +1,3928 @@ +// from_10xmtx_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_10xmtx_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input mtx folder", + "example" : [ + "input_dir_containing_gz_files" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts a 10x mtx into an h5mu file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "run_test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_10xmtx_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_10xmtx_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading %s.", par["input"]) +adata = sc.read_10x_mtx(par["input"], gex_only=False) + +logger.info("Renaming keys.") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + +# generate output +logger.info("Convert to mudata") +mdata = mu.MuData(adata) + +# override root .obs +mdata.obs = adata.obs + +# write output +logger.info("Writing %s", par["output"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_10xmtx_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow.config b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow.config new file mode 100644 index 00000000..fee03198 --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_10xmtx_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts a 10x mtx into an h5mu file.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_labels.config b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_schema.json b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..74f97d72 --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/nextflow_schema.json @@ -0,0 +1,58 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_10xmtx_to_h5mu", + "description": "Converts a 10x mtx into an h5mu file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input mtx folder", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input_dir_containing_gz_files\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_10xmtx_to_h5mu/setup_logger.py b/target/nextflow/convert/from_10xmtx_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_10xmtx_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml new file mode 100644 index 00000000..02ce116f --- /dev/null +++ b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/.config.vsh.yaml @@ -0,0 +1,228 @@ +name: "from_bd_to_10x_molecular_barcode_tags" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input SAM or BAM file." + info: null + example: + - "input.bam" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output alignment file." + info: null + example: + - "output.sam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--bam" + description: "Output a BAM file." + info: null + direction: "input" + - type: "integer" + name: "--threads" + alternatives: + - "-t" + description: "Number of threads" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the molecular barcode sequence SAM tag from BD format (MA) to\ + \ 10X format (UB).\n" +test_resources: +- type: "bash_script" + path: "run_test.sh" + is_executable: true +- type: "file" + path: "output_raw" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "samtools" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags" + executable: "target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/main.nf b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/main.nf new file mode 100644 index 00000000..abb7869e --- /dev/null +++ b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/main.nf @@ -0,0 +1,3927 @@ +// from_bd_to_10x_molecular_barcode_tags main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_bd_to_10x_molecular_barcode_tags", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input SAM or BAM file.", + "example" : [ + "input.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output alignment file.", + "example" : [ + "output.sam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--bam", + "description" : "Output a BAM file.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--threads", + "alternatives" : [ + "-t" + ], + "description" : "Number of threads", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert the molecular barcode sequence SAM tag from BD format (MA) to 10X format (UB).\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "run_test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/bdrhap_5kjrt/processed/output_raw" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "samtools" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "${VIASH_PAR_THREADS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_threads='&'#" ; else echo "# par_threads="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Sam tags added by BD Rhapsody Pipeline +# From: https://www.bd.com/documents/guides/user-guides/GMX_BD-Rhapsody-genomics-informatics_UG_EN.pdf +# +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | A number between 1 and 96 3 (884,736) representing a unique cell label sequence | +# | | (CB = 0 when no cell label sequence is detected) | +# ----------------------------------------------------------------------------------------- +# | MR | Raw molecular identifier sequence | +# ----------------------------------------------------------------------------------------- +# | MA | RSEC-adjusted molecular identifier sequence. If not a true cell, the raw UMI is | +# | | repeated in this tag. | +# ----------------------------------------------------------------------------------------- +# | PT | T if a poly(T) tail was found in the expected position on R1, or F if poly(T) | +# | | was not found | +# ----------------------------------------------------------------------------------------- +# | CN | Indicates if a sequence is derived from a putative cell, as determined by the | +# | | cell label filtering algorithm (T: putative cell; x: invalid cell label or noise | +# | | cell) Note: You can distinguish between an invalid cell label and a noise cell | +# | | with the CB tag (invalid cell labels are 0). | +# ----------------------------------------------------------------------------------------- +# | ST | The value is 1-12, indicating the Sample Tag of the called putative cell, or M | +# | | for multiplet, or x for undetermined. | +# ========================================================================================= + + +# SAM tags added by 10X +# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam +# ========================================================================================= +# | | Definition | +# ========================================================================================= +# | CB | Chromium cellular barcode sequence that is error-corrected and confirmed against | +# | | a list of known-good barcode sequences. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. | +# ----------------------------------------------------------------------------------------- +# | CR | Chromium cellular barcode sequence as reported by the sequencer. For multiplex | +# | | Fixed RNA Profiling, the cellular barcode is a combination of the 10x GEM | +# | | Barcode and Probe Barcode sequences. | +# ----------------------------------------------------------------------------------------- +# | CY | Chromium cellular barcode read quality. For multiplex Fixed RNA Profiling, the | +# | | cellular barcode is a combination of the 10x GEM Barcode and Probe Barcode | +# | | sequences. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | UB | Chromium molecular barcode sequence that is error-corrected among other | +# | | molecular barcodes with the same cellular barcode and gene alignment. | +# ----------------------------------------------------------------------------------------- +# | UR | Chromium molecular barcode sequence as reported by the sequencer. | +# ----------------------------------------------------------------------------------------- +# | UY | Chromium molecular barcode read quality. Phred scores as reported by sequencer. | +# ----------------------------------------------------------------------------------------- +# | TR | Trimmed sequence. For the Single Cell 3' v1 chemistry, this is trailing sequence | +# | | following the UMI on Read 2. For the Single Cell 3' v2 chemistry, this is | +# | | trailing sequence following the cell and molecular barcodes on Read 1. | +# ========================================================================================= + +extra_params=() + +if [ "\\$par_bam" == "true" ]; then + extra_params+=("--bam") +fi + +cat \\\\ + <(samtools view -SH "\\$par_input") \\\\ + <(samtools view "\\$par_input" | grep "MA:Z:*" | sed "s/MA:Z:/UB:Z:/" ) | \\\\ +samtools view -Sh "\\${extra_params[@]}" -@"\\$par_threads" - > "\\$par_output" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_bd_to_10x_molecular_barcode_tags", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow.config b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow.config new file mode 100644 index 00000000..b05d177a --- /dev/null +++ b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_bd_to_10x_molecular_barcode_tags' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert the molecular barcode sequence SAM tag from BD format (MA) to 10X format (UB).\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_schema.json b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_schema.json new file mode 100644 index 00000000..8ae48b97 --- /dev/null +++ b/target/nextflow/convert/from_bd_to_10x_molecular_barcode_tags/nextflow_schema.json @@ -0,0 +1,60 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_bd_to_10x_molecular_barcode_tags", + "description": "Convert the molecular barcode sequence SAM tag from BD format (MA) to 10X format (UB).\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input SAM or BAM file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.bam\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output alignment file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.sam\"`, direction: `output`, example: `\"output.sam\"`. ", + "default": "$id.$key.output.sam" + }, + "bam": { + "type": "boolean", + "description": "Output a BAM file.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "threads": { + "type": "integer", + "description": "Number of threads", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_bdrhap_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/from_bdrhap_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..33166b1b --- /dev/null +++ b/target/nextflow/convert/from_bdrhap_to_h5mu/.config.vsh.yaml @@ -0,0 +1,266 @@ +name: "from_bdrhap_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "A sample ID." + info: null + example: + - "my_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "The output h5mu of a BD Rhapsody workflow." + info: null + example: + - "sample.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "sample.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_bdrhap_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_bdrhap_to_h5mu" + executable: "target/nextflow/convert/from_bdrhap_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_bdrhap_to_h5mu/main.nf b/target/nextflow/convert/from_bdrhap_to_h5mu/main.nf new file mode 100644 index 00000000..01eb8144 --- /dev/null +++ b/target/nextflow/convert/from_bdrhap_to_h5mu/main.nf @@ -0,0 +1,3965 @@ +// from_bdrhap_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) +// * Robrecht Cannoodt (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_bdrhap_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "A sample ID.", + "example" : [ + "my_id" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "The output h5mu of a BD Rhapsody workflow.", + "example" : [ + "sample.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/bdrhap_5kjrt/processed/output_raw/sample.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_bdrhap_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_bdrhap_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'id': $( if [ ! -z ${VIASH_PAR_ID+x} ]; then echo "r'${VIASH_PAR_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print(">> Reading input file", flush=True) +mdata = mu.read_h5mu(par["input"]) + +# Check if modalities are present +modalities = list(mdata.mod.keys()) +assert len(modalities) > 0, "No modalities found in input data" + + +def process_modality_inline(adata, modality): + adata.obs["library_id"] = " & ".join(adata.uns["Pipeline_Inputs"]["Libraries"]) + adata.obs["cell_id"] = adata.obs.index + adata.obs["run_id"] = par["id"] + + adata.obs.rename( + columns={"Sample_Tag": "sample_tag", "Sample_Name": "sample_id"}, inplace=True + ) + + adata.var["gene_ids"] = adata.var.index + adata.var["gene_name"] = adata.var.index + + if modality == "rna": + adata.var["feature_type"] = "Gene Expression" + adata.var["reference_file"] = adata.uns["Pipeline_Inputs"]["Reference_Archive"] + + elif modality == "prot": + adata.var["feature_type"] = "Antibody Capture" + adata.var["reference_file"] = " & ".join( + adata.uns["Pipeline_Inputs"]["AbSeq_Reference"] + ) + + # TODO: add other modalities + + +for key, value in mdata.mod.items(): + print(">> Processing modality:", key, flush=True) + process_modality_inline(value, key) + +print(">> Writing output file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_bdrhap_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow.config b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow.config new file mode 100644 index 00000000..cfc5c948 --- /dev/null +++ b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_bdrhap_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n' + author = 'Dorien Roosen, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_labels.config b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_schema.json b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..0bd14a05 --- /dev/null +++ b/target/nextflow/convert/from_bdrhap_to_h5mu/nextflow_schema.json @@ -0,0 +1,83 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_bdrhap_to_h5mu", + "description": "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "A sample ID.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"my_id\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The output h5mu of a BD Rhapsody workflow.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"sample.h5mu\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..88344ddf --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/.config.vsh.yaml @@ -0,0 +1,284 @@ +name: "from_cellranger_multi_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input folder. Must contain the output from a cellranger multi run." + info: null + example: + - "input_dir_containing_modalities" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Locations for the output files. Must contain a wildcard (*) character,\n\ + which will be replaced with the sample name.\n" + info: null + example: + - "*.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sample_csv" + description: "CSV file describing the sample name per output file" + info: null + example: + - "samples.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_cellranger" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts the output from cellranger multi to a single .h5mu file.\n\ + By default, will map the following library type names to modality names:\n - Gene\ + \ Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n \ + \ - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing\ + \ Capture: hashing\n \nOther library types have their whitepace removed and dashes\ + \ replaced by\nunderscores to generate the modality name.\n\nCurrently does not\ + \ allow parsing the output from cell barcode demultiplexing.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "10x_5k_anticmv" +- type: "file" + path: "10x_5k_lung_crispr" +- type: "file" + path: "10x_5k_beam" +- type: "file" + path: "10x_5k_fixed" +- type: "file" + path: "10x_4plex_dtc" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scirpy~=0.12.0" + - "pandas~=2.2.3" + - "pytest" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_cellranger_multi_to_h5mu" + executable: "target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf b/target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf new file mode 100644 index 00000000..992642ef --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/main.nf @@ -0,0 +1,4378 @@ +// from_cellranger_multi_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_cellranger_multi_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input folder. Must contain the output from a cellranger multi run.", + "example" : [ + "input_dir_containing_modalities" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Locations for the output files. Must contain a wildcard (*) character,\nwhich will be replaced with the sample name.\n", + "example" : [ + "*.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_csv", + "description" : "CSV file describing the sample name per output file", + "example" : [ + "samples.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_metrics", + "description" : "Name of the .uns slot under which to QC metrics (if any).", + "default" : [ + "metrics_cellranger" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts the output from cellranger multi to a single .h5mu file.\nBy default, will map the following library type names to modality names:\n - Gene Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing Capture: hashing\n \nOther library types have their whitepace removed and dashes replaced by\nunderscores to generate the modality name.\n\nCurrently does not allow parsing the output from cell barcode demultiplexing.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_lung_crispr" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_beam" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_fixed" + }, + { + "type" : "file", + "path" : "/resources_test/10x_4plex_dtc" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "scirpy~=0.12.0", + "pandas~=2.2.3", + "pytest" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_cellranger_multi_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from pathlib import Path +import sys +import scanpy +import pandas as pd +import mudata +import numpy as np +from scirpy.io import read_10x_vdj +from collections import defaultdict +from functools import partial +import json +import csv +import tempfile + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sample_csv': $( if [ ! -z ${VIASH_PAR_SAMPLE_CSV+x} ]; then echo "r'${VIASH_PAR_SAMPLE_CSV//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_metrics': $( if [ ! -z ${VIASH_PAR_UNS_METRICS+x} ]; then echo "r'${VIASH_PAR_UNS_METRICS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +POSSIBLE_LIBRARY_TYPES = ( + "vdj_t", + "vdj_b", + "vdj_t_gd", + "count", + "antigen_analysis", + "multiplexing_analysis", +) + +FEATURE_TYPES_NAMES = { + "Gene Expression": "rna", + "Peaks": "atac", + "Antibody Capture": "prot", + "VDJ": "vdj", + "VDJ-T": "vdj_t", + "VDJ-B": "vdj_b", + "CRISPR Guide Capture": "gdo", + "Multiplexing Capture": "hto", + "Antigen Capture": "antigen", + "Custom": "custom", +} + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. \\`IntegerArray\\` and \\`BooleanArray\\` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def gather_input_data(dir: Path): + # / + # +-- multi + # | +-- count (raw output) + # | | +-- feature_reference.csv + # | | +-- raw_feature_bc_matrix.h5 + # | +-- vdj_t + # | | +-- all_contig_annotations.json + # | +-- vdj_b + # | | +-- all_contig_annotations.json + # | +-- vdj_t_gd + # | | +-- all_contig_annotations.json + # | +-- multiplexing_analysis + # | +-- cells_per_tag.json + # +-- per_sample_outs (filtered outputs) + # +-- example_1 + # +-- antigen_analysis + # | +-- per_barcode.csv + # | +-- antigen_specificity_scores.csv + # +-- count + # | +-- antibody_analysis + # | +-- crispr_analysis + # | +-- perturbation_efficiencies_by_feature.csv + # | +-- perturbation_efficiencies_by_target.csv + # +-- vdj_t (unused) + # +-- vdj_b (unused) + # +-- vdj_t_gd (unused) + # +-- metrics_summary.csv + + if not dir.is_dir(): + raise ValueError("Specified input is not a directory.") + folder_contents = list(dir.iterdir()) + config = dir / "config.csv" + if config not in folder_contents: + logger.warning( + "Config.csv not found in input directory, this folder might not be a valid cellranger multi output." + ) + + required_subfolders = [ + dir / subfolder_name for subfolder_name in ("multi", "per_sample_outs") + ] + found_input = {key_: {} for key_ in POSSIBLE_LIBRARY_TYPES} + for required_subfolder in required_subfolders: + if required_subfolder not in folder_contents: + raise ValueError( + f"Input folder must contain the subfolder {required_subfolder} please make " + "sure that the specified input folder is a valid cellranger multi output." + ) + + multi_dir = dir / "multi" + for library_type in multi_dir.iterdir(): + if not library_type.is_dir(): + logger.warning( + "%s is not a directory. Contents of the multi folder " + "must be directories to be recognized as valid input data", + library_type, + ) + continue + if library_type.name not in POSSIBLE_LIBRARY_TYPES: + raise ValueError( + f"Contents of the 'multi' folder must be found one of the following: {','.join(POSSIBLE_LIBRARY_TYPES)}." + ) + + found_input[library_type.name] = library_type + + per_sample_outs_dir = dir / "per_sample_outs" + samples_dirs = [ + samplepath + for samplepath in per_sample_outs_dir.iterdir() + if samplepath.is_dir() + ] + for samples_dir in samples_dirs: + for file_part in ( + "metrics_summary.csv", + "count/feature_reference.csv", + "count/crispr_analysis/perturbation_efficiencies_by_feature.csv", + "count/crispr_analysis/perturbation_efficiencies_by_target.csv", + "antigen_analysis", + ): + found_file = samples_dir / file_part + if found_file.exists(): + file_name = found_file.name.removesuffix(".csv") + found_input.setdefault(file_name, {})[samples_dir.name] = found_file + + return found_input + + +def proces_perturbation( + key_name: str, mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample_name, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample_name] + assert "gdo" in mudata_obj.mod + eff_df = pd.read_csv( + efficiency_file, + index_col="Perturbation", + sep=",", + decimal=".", + quotechar='"', + ) + mudata_obj.mod["gdo"].uns[key_name] = eff_df + return mudatas + + +def process_feature_reference( + mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): + for sample, mudata_obj in mudatas.items(): + efficiency_file = efficiency_files[sample] + df = pd.read_csv( + efficiency_file, index_col="id", sep=",", decimal=".", quotechar='"' + ) + assert "feature_type" in df.columns, ( + "Columns 'feature_type' should be present in features_reference file." + ) + feature_types = df["feature_type"] + missing_features = set(feature_types) - set(FEATURE_TYPES_NAMES) + if missing_features: + raise ValueError( + "Not all feature types present in the features_reference file are supported by this component.\\\\n" + f"Missing support for features: {','.join(missing_features)}." + ) + for feature_type in feature_types: + modality = FEATURE_TYPES_NAMES[feature_type] + subset_df = df.loc[df["feature_type"] == feature_type] + mudata_obj.mod[modality].uns["feature_reference"] = subset_df + return mudatas + + +def process_counts(counts_folder: Path, multiplexing_info, metrics_files): + counts_matrix_file = counts_folder / "raw_feature_bc_matrix.h5" + logger.info("Reading %s.", counts_matrix_file) + adata = scanpy.read_10x_h5(counts_matrix_file, gex_only=False) + + # set the gene ids as var_names + logger.info("Renaming var columns") + adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") + + # generate output + logger.info("Convert to mudata") + + def modality_name_factory(library_type): + return ("".join(library_type.replace("-", "_").split())).lower() + + feature_types = defaultdict(modality_name_factory, FEATURE_TYPES_NAMES) + mudata_all_samples = mudata.MuData(adata, feature_types_names=feature_types) + if multiplexing_info: + # Get the mapping between the barcode and the sample ID from one of the metrics files + metrics_file = pd.read_csv( + list(metrics_files.values())[0], decimal=".", quotechar='"', thousands="," + ) + sample_ids = metrics_file[metrics_file["Metric Name"] == "Sample ID"] + barcode_sample_mapping = ( + sample_ids.loc[:, ["Group Name", "Metric Value"]] + .set_index("Group Name") + .squeeze() + .to_dict() + ) + # When probe barcodes are combined with multiplexing barcodes; the probe barcode (BC) + # is paired with other sample with other sample information using the syntax "{BC}+{antibody}" + # (or even "{BC}+{antibody}+{crispr guide}"). The cells_per_tag + # file only encodes the BCs; so we need to strip the other information. + barcode_sample_mapping = { + key_.partition("+")[0]: value_ + for key_, value_ in barcode_sample_mapping.items() + } + return split_samples( + mudata_all_samples, multiplexing_info, barcode_sample_mapping + ) + return {"run": mudata_all_samples} + + +def split_samples(mudata_obj, multiplexing_analysis_folder, barcode_sample_mapping): + result = {} + cells_per_tag_file = multiplexing_analysis_folder / "cells_per_tag.json" + with cells_per_tag_file.open("r") as open_json: + sample_cell_mapping = json.load(open_json) + + for barcode, indices in sample_cell_mapping.items(): + if indices: + sample_mudata = mudata_obj[indices] + sample = barcode_sample_mapping[barcode] + result[sample] = sample_mudata.copy() + return result + + +def process_metrics_summary( + mudatas: dict[str, mudata.MuData], metrics_files: dict[str, Path] +): + def read_percentage(val): + try: + if str(val).endswith("%"): + return float(val.strip("%")) / 100 + else: + return val + except (AttributeError, ValueError): + return val + + for sample, mudata_obj in mudatas.items(): + metrics_file = metrics_files[sample] + metrics_summary = pd.read_csv( + metrics_file, decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) + + mudata_obj.uns[par["uns_metrics"]] = metrics_summary + for colname, coldata in metrics_summary.items(): + try: + new_column = coldata.astype(str, copy=True).astype( + {colname: "category"} + ) + metrics_summary[colname] = new_column + except (ValueError, TypeError): + logger.warning(f"Could not store column {colname} from metrics.") + pass + return mudatas + + +def process_antigen_analysis( + mudatas: dict[str, mudata.MuData], antigen_analysis_folder_paths: dict[str, Path] +): + for sample_id, mudata_obj in mudatas.items(): + antigen_analysis_folder_path = antigen_analysis_folder_paths[sample_id] + assert "antigen" in mudata_obj.mod + per_barcodes_file = antigen_analysis_folder_path / "per_barcode.csv" + assert per_barcodes_file.is_file(), ( + "Expected a per_barcode.csv file to be present." + ) + per_barcodes_df = pd.read_csv( + per_barcodes_file, index_col="barcode", sep=",", decimal=".", quotechar='"' + ) + is_gex_cell = per_barcodes_df["is_gex_cell"] + assert len(set(is_gex_cell.unique().tolist()) - set([False, True])) == 0, ( + "Expected 'is_gex_cell' column to be boolean. Please report this as a bug." + ) + barcodes_in_gex = per_barcodes_df[is_gex_cell] + # All of the barcodes listed in the per_barcode.csv with is_gex_cell set to 'True' + # must be in the 'rna' (an thus also 'antigen') modality + assert barcodes_in_gex.index.difference(mudata_obj["rna"].obs_names).empty + orig_obs_names = mudata_obj["antigen"].obs_names.copy() + mudata_obj["antigen"].obs = cast_to_writeable_dtype( + pd.concat( + [mudata_obj["antigen"].obs, barcodes_in_gex], + axis="columns", + join="outer", + verify_integrity=True, + sort=False, + ) + ) + assert orig_obs_names.equals(mudata_obj["antigen"].obs_names) + del orig_obs_names + + # The antigen_specificity_scores.csv file is only present when cellranger + # multi was run with a [antigen-specificity] section in config + specificity_file = ( + antigen_analysis_folder_path / "antigen_specificity_scores.csv" + ) + if specificity_file.is_file(): + antigen_scores_df = pd.read_csv( + specificity_file, + index_col=["barcode", "antigen"], + sep=",", + decimal=".", + quotechar='"', + ) + score = antigen_scores_df.unstack() + assert score.index.difference(mudata_obj["rna"].obs_names).empty + antigens = score.columns.unique(level="antigen") + for antigen in antigens: + score_antigen = score.loc[:, (slice(None), antigen)].droplevel( + "antigen", axis=1 + ) + score_antigen = score_antigen.reindex(mudata_obj["rna"].obs_names) + mudata_obj["antigen"].obsm[f"antigen_specificity_scores_{antigen}"] = ( + cast_to_writeable_dtype(score_antigen) + ) + return mudatas + + +def process_vdj(mudatas: dict[str, mudata.MuData], vdj_folder_path: str): + # https://scverse.org/scirpy/latest/generated/scirpy.io.read_10x_vdj.html#scirpy-io-read-10x-vdj + # According to docs, using the json is preferred as this file includes intron info. + all_config_json_file = vdj_folder_path / "all_contig_annotations.json" + vdj_type = vdj_folder_path.name + with all_config_json_file.open("r") as open_json: + json_obj = json.load(open_json) + for _, mudata_obj in mudatas.items(): + json_for_sample = [ + entry for entry in json_obj if entry["barcode"] in mudata_obj.obs_names + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as tfile: + json.dump(json_for_sample, tfile, indent=4) + tfile.flush() + vdj_anndata = read_10x_vdj(tfile.name) + mudata_obj.mod[vdj_type] = vdj_anndata + return mudatas + + +def get_modalities(input_data): + dispatcher = { + "multiplexing_analysis": split_samples, + "vdj_t": process_vdj, + "vdj_b": process_vdj, + "vdj_t_gd": process_vdj, + "metrics_summary": process_metrics_summary, + "feature_reference": process_feature_reference, + "perturbation_efficiencies_by_feature": partial( + proces_perturbation, "perturbation_efficiencies_by_feature" + ), + "perturbation_efficiencies_by_target": partial( + proces_perturbation, "perturbation_efficiencies_by_target" + ), + "antigen_analysis": process_antigen_analysis, + } + mudata_per_sample = process_counts( + input_data["count"], + input_data["multiplexing_analysis"], + input_data["metrics_summary"], + ) + for modality_name, modality_data_path in input_data.items(): + if ( + modality_name in ("count", "multiplexing_analysis") + or not modality_data_path + ): + continue + try: + parser_function = dispatcher[modality_name] + except KeyError as e: + raise ValueError( + "This component does not support the " + f"parsing of the '{modality_name}' yet." + ) from e + mudata_per_sample = parser_function(mudata_per_sample, modality_data_path) + return mudata_per_sample + + +def main(): + cellranger_multi_dir = Path(par["input"]) + # TODO: remove when issue https://github.com/viash-io/viash/issues/706 is resolved. + if isinstance(par["output"], (list, set, tuple)): + assert len(par["output"]) == 1, ( + "A single output file template should have been provided." + ) + par["output"] = par["output"][0] + assert par["output"].count("*") == 1, ( + f"Expected exactly one wildcard character (*) in output " + f"files template ({par['output']}). Found {par['output'].count('*')}" + ) + input_data = gather_input_data(cellranger_multi_dir) + result = get_modalities(input_data) + output_files = { + par["output"].replace("*", sample_name) for sample_name in result.keys() + } + assert len(output_files) == len(result.keys()), ( + "Replacing the wildcard in the output files " + "template did not produce unique file paths." + ) + logger.info( + "Writing output for samples: '%s' to '%s'", + "".join(result.keys()), + par["output"], + ) + with Path(par["sample_csv"]).open("w", newline="") as open_csv: + csvwriter = csv.DictWriter(open_csv, fieldnames=["sample_name", "file"]) + csvwriter.writeheader() + for sample_name, mudata_obj in result.items(): + output_file = Path(par["output"].replace("*", sample_name)) + mudata_obj.write_h5mu(output_file, compression=par["output_compression"]) + csvwriter.writerow({"sample_name": sample_name, "file": output_file.name}) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_cellranger_multi_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow.config b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow.config new file mode 100644 index 00000000..a1735478 --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_cellranger_multi_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts the output from cellranger multi to a single .h5mu file.\nBy default, will map the following library type names to modality names:\n - Gene Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing Capture: hashing\n \nOther library types have their whitepace removed and dashes replaced by\nunderscores to generate the modality name.\n\nCurrently does not allow parsing the output from cell barcode demultiplexing.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_schema.json b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..dcdb4216 --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/nextflow_schema.json @@ -0,0 +1,74 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_cellranger_multi_to_h5mu", + "description": "Converts the output from cellranger multi to a single .h5mu file.\nBy default, will map the following library type names to modality names:\n - Gene Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing Capture: hashing\n \nOther library types have their whitepace removed and dashes replaced by\nunderscores to generate the modality name.\n\nCurrently does not allow parsing the output from cell barcode demultiplexing.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input folder", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input_dir_containing_modalities\"`. " + }, + "output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Locations for the output files", + "help_text": "Type: `file`, multiple: `True`, default: `\"$id.$key.output_*.h5mu\"`, direction: `output`, example: `[\"*.h5mu\"]`. ", + "default": "$id.$key.output_*.h5mu" + }, + "sample_csv": { + "type": "string", + "format": "path", + "description": "CSV file describing the sample name per output file", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.sample_csv.csv\"`, direction: `output`, example: `\"samples.csv\"`. ", + "default": "$id.$key.sample_csv.csv" + }, + "uns_metrics": { + "type": "string", + "description": "Name of the .uns slot under which to QC metrics (if any).", + "help_text": "Type: `string`, multiple: `False`, default: `\"metrics_cellranger\"`. ", + "default": "metrics_cellranger" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_cellranger_multi_to_h5mu/setup_logger.py b/target/nextflow/convert/from_cellranger_multi_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_cellranger_multi_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/from_h5ad_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..72dd772f --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/.config.vsh.yaml @@ -0,0 +1,261 @@ +name: "from_h5ad_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad files" + info: null + default: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of names to use for the modalities. Will be used as the keys\ + \ in the .mod attribute in the output MuData object\nThe number of items provided\ + \ for this argument equal the number of input files (--input) and their order\ + \ should match.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output MuData file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a single layer h5ad file into a single MuData object\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5ad_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5ad_to_h5mu" + executable: "target/nextflow/convert/from_h5ad_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/main.nf b/target/nextflow/convert/from_h5ad_to_h5mu/main.nf new file mode 100644 index 00000000..12815112 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/main.nf @@ -0,0 +1,3939 @@ +// from_h5ad_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5ad_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5ad files", + "default" : [ + "input.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "List of names to use for the modalities. Will be used as the keys in the .mod attribute in the output MuData object\nThe number of items provided for this argument equal the number of input files (--input) and their order should match.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output MuData file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts a single layer h5ad file into a single MuData object\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5ad_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5ad_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import anndata +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +assert len(par["input"]) == len(par["modality"]), ( + "Number of input files should be the same length as the number of modalities" +) + +logger.info("Reading input files") +data = { + key: anndata.read_h5ad(path) for key, path in zip(par["modality"], par["input"]) +} + +logger.info("Converting to mudata") +mudata = mu.MuData(data) + +try: + mudata.var_names_make_unique() +except (TypeError, ValueError): + pass + +logger.info("Writing to %s.", par["output"]) +mudata.write_h5mu(par["output"], compression=par["output_compression"]) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5ad_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/nextflow.config b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow.config new file mode 100644 index 00000000..0fd67a4b --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5ad_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts a single layer h5ad file into a single MuData object\n' + author = 'Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_labels.config b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_schema.json b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..552ad116 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/nextflow_schema.json @@ -0,0 +1,75 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5ad_to_h5mu", + "description": "Converts a single layer h5ad file into a single MuData object\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Input h5ad files", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"input.h5ad\"]`, direction: `input`. ", + "default": [ + "input.h5ad" + ] + }, + "modality": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of names to use for the modalities", + "help_text": "Type: `string`, multiple: `True`, default: `[\"rna\"]`. ", + "default": [ + "rna" + ] + }, + "output": { + "type": "string", + "format": "path", + "description": "Output MuData file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_h5ad_to_h5mu/setup_logger.py b/target/nextflow/convert/from_h5ad_to_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_h5ad_to_seurat/.config.vsh.yaml b/target/nextflow/convert/from_h5ad_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..21daa7db --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_seurat/.config.vsh.yaml @@ -0,0 +1,242 @@ +name: "from_h5ad_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad file" + info: null + example: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--assay" + description: "Name of the assay to be created." + info: null + default: + - "RNA" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5ad file into a Seurat file.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + interactive: false + - type: "r" + cran: + - "hdf5r" + - "Seurat" + - "SeuratObject" + github: + - "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5ad_to_seurat/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5ad_to_seurat" + executable: "target/nextflow/convert/from_h5ad_to_seurat/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5ad_to_seurat/main.nf b/target/nextflow/convert/from_h5ad_to_seurat/main.nf new file mode 100644 index 00000000..6ce04975 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_seurat/main.nf @@ -0,0 +1,3906 @@ +// from_h5ad_to_seurat main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5ad_to_seurat", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5ad file", + "example" : [ + "input.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--assay", + "description" : "Name of the assay to be created.", + "default" : [ + "RNA" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output Seurat file", + "example" : [ + "output.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts an h5ad file into a Seurat file.\n", + "test_resources" : [ + { + "type" : "r_script", + "path" : "test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:24.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "libgeos-dev" + ], + "interactive" : false + }, + { + "type" : "r", + "cran" : [ + "hdf5r", + "Seurat", + "SeuratObject" + ], + "github" : [ + "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ], + "test_setup" : [ + { + "type" : "r", + "cran" : [ + "testthat" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5ad_to_seurat/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5ad_to_seurat", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(anndataR) + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "assay" = $( if [ ! -z ${VIASH_PAR_ASSAY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ASSAY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + + +seurat_obj <- read_h5ad( + par\\$input, + mode = "r", + as = "Seurat", + assay_name = par\\$assay +) + +saveRDS(seurat_obj, file = par\\$output) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5ad_to_seurat", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5ad_to_seurat/nextflow.config b/target/nextflow/convert/from_h5ad_to_seurat/nextflow.config new file mode 100644 index 00000000..6d634a06 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_seurat/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5ad_to_seurat' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts an h5ad file into a Seurat file.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5ad_to_seurat/nextflow_labels.config b/target/nextflow/convert/from_h5ad_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5ad_to_seurat/nextflow_schema.json b/target/nextflow/convert/from_h5ad_to_seurat/nextflow_schema.json new file mode 100644 index 00000000..26ce9a73 --- /dev/null +++ b/target/nextflow/convert/from_h5ad_to_seurat/nextflow_schema.json @@ -0,0 +1,55 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5ad_to_seurat", + "description": "Converts an h5ad file into a Seurat file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5ad file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5ad\"`. " + }, + "assay": { + "type": "string", + "description": "Name of the assay to be created.", + "help_text": "Type: `string`, multiple: `False`, default: `\"RNA\"`. ", + "default": "RNA" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output Seurat file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.rds\"`, direction: `output`, example: `\"output.rds\"`. ", + "default": "$id.$key.output.rds" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..cbd5951c --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/.config.vsh.yaml @@ -0,0 +1,251 @@ +name: "from_h5mu_or_h5ad_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5ad or h5mu file" + info: null + example: + - "input.h5ad" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Modality to be converted if the input file is an h5mu file." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--assay" + description: "Name of the assay to be created." + info: null + default: + - "RNA" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5ad file or a single modality of an h5mu file into a Seurat\ + \ file.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + - "hdf5-tools" + interactive: false + - type: "r" + cran: + - "anndata" + - "hdf5r" + - "Seurat" + - "SeuratObject" + github: + - "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5mu_or_h5ad_to_seurat" + executable: "target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/main.nf b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/main.nf new file mode 100644 index 00000000..1ea4920d --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/main.nf @@ -0,0 +1,3976 @@ +// from_h5mu_or_h5ad_to_seurat main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5mu_or_h5ad_to_seurat", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5ad or h5mu file", + "example" : [ + "input.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Modality to be converted if the input file is an h5mu file.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--assay", + "description" : "Name of the assay to be created.", + "default" : [ + "RNA" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output Seurat file", + "example" : [ + "output.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts an h5ad file or a single modality of an h5mu file into a Seurat file.\n", + "test_resources" : [ + { + "type" : "r_script", + "path" : "test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "libgeos-dev", + "hdf5-tools" + ], + "interactive" : false + }, + { + "type" : "r", + "cran" : [ + "anndata", + "hdf5r", + "Seurat", + "SeuratObject" + ], + "github" : [ + "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ], + "test_setup" : [ + { + "type" : "r", + "cran" : [ + "testthat" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(anndataR) +library(hdf5r) + + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "assay" = $( if [ ! -z ${VIASH_PAR_ASSAY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ASSAY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + +is_mudata_file <- function(file_path) { + con <- file(file_path, "rb") + tryCatch({ + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + header <- readBin(con, "raw", n = 6) + identical(header, charToRaw("MuData")) + }, finally = { + close(con) + }) +} + +h5mu_to_h5ad <- function(h5mu_path, modality_name) { + tmp_path <- tempfile(fileext = ".h5ad") + mod_location <- paste("mod", modality_name, sep = "/") + h5src <- hdf5r::H5File\\$new(h5mu_path, "r") + h5dest <- hdf5r::H5File\\$new(tmp_path, "w") + # Copy over the child objects and the child attributes from root + # Root cannot be copied directly because it always exists and + # copying does not allow overwriting. + children <- hdf5r::list.objects(h5src, + path = mod_location, + full.names = FALSE, recursive = FALSE + ) + for (child in children) { + h5dest\\$obj_copy_from( + h5src, paste(mod_location, child, sep = "/"), + paste0("/", child) + ) + } + # Also copy the root attributes + root_attrs <- hdf5r::h5attr_names(x = h5src) + for (attr in root_attrs) { + h5a <- h5src\\$attr_open(attr_name = attr) + robj <- h5a\\$read() + h5dest\\$create_attr_by_name( + attr_name = attr, + obj_name = ".", + robj = robj, + space = h5a\\$get_space(), + dtype = h5a\\$get_type() + ) + } + h5src\\$close() + h5dest\\$close() + + tmp_path +} + +# If the input is a MuData file, use a single modality as input instead +if (is_mudata_file(par\\$input)) { + if (is.null(par\\$modality) || par\\$modality == "") { + stop("'modality' argument must be set if the input is a MuData file.") + } + h5ad_path <- h5mu_to_h5ad(par\\$input, par\\$modality) +} else { + h5ad_path <- par\\$input +} + +seurat_obj <- read_h5ad( + h5ad_path, + mode = "r", + as = "Seurat", + assay_name = par\\$assay +) + +saveRDS(seurat_obj, file = par\\$output) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5mu_or_h5ad_to_seurat", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow.config b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow.config new file mode 100644 index 00000000..18aa4813 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5mu_or_h5ad_to_seurat' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts an h5ad file or a single modality of an h5mu file into a Seurat file.\n' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_schema.json b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_schema.json new file mode 100644 index 00000000..8427accf --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_seurat/nextflow_schema.json @@ -0,0 +1,61 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5mu_or_h5ad_to_seurat", + "description": "Converts an h5ad file or a single modality of an h5mu file into a Seurat file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5ad or h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5ad\"`. " + }, + "modality": { + "type": "string", + "description": "Modality to be converted if the input file is an h5mu file.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "assay": { + "type": "string", + "description": "Name of the assay to be created.", + "help_text": "Type: `string`, multiple: `False`, default: `\"RNA\"`. ", + "default": "RNA" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output Seurat file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.rds\"`, direction: `output`, example: `\"output.rds\"`. ", + "default": "$id.$key.output.rds" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml new file mode 100644 index 00000000..41631f80 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "from_h5mu_or_h5ad_to_tiledb" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Input AnnData or MuData file. When an AnnData file is provided,\ + \ it is automatically assumed to \ncontain transcriptome counts.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA modality" + arguments: + - type: "string" + name: "--rna_modality" + description: "The name used for the RNA modality. Used when input file is a MuData\ + \ object.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_raw_layer_input" + description: "Location of the layer containing the raw transcriptome counts. Layers\ + \ are looked for in .layers,\nexcept when using the value 'X'; in which case\ + \ .X is used.\n" + info: null + example: + - "X" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_normalized_layer_input" + description: "Location of the layer containing the normalized counts. Layers are\ + \ looked for in .layers,\nexcept when using the value 'X'; in which case .X\ + \ is used.\n" + info: null + example: + - "log_normalized" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_gene_names_input" + description: "Column in .var that provides the gene names. If not specified, the\ + \ index from the input is used.\n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Protein modality" + arguments: + - type: "string" + name: "--prot_modality" + description: "The name used for the protein modality. Used when input file is\ + \ a MuData object.\nWhen not specified, the protein modality will not be processed.\n" + info: null + example: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_raw_layer_input" + description: "Location of the layer containing the raw protein counts. Layers\ + \ are looked for in .layers,\nexcept when using the value 'X'; in which case\ + \ .X is used.\n" + info: null + example: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_normalized_layer_input" + description: "Location of the layer containing the normalized counts. Layers are\ + \ looked for in .layers,\nexcept when using the value 'X'; in which case .X\ + \ is used.\n" + info: null + example: + - "clr" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output slots" + arguments: + - type: "string" + name: "--rna_modality_output" + description: "TileDB Measurement name where the RNA modality will be stored.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_modality_output" + description: "Name of the TileDB Measurement where the protein modality will be\ + \ stored.\n" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_index_name_output" + description: "Name of the index that is used to describe the cells (observations).\n" + info: null + default: + - "cell_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_index_name_output" + description: "Output name of the index that is used to describe the genes.\n" + info: null + default: + - "rna_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_raw_layer_output" + description: "Output location for the raw transcriptomics counts.\n" + info: null + default: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_normalized_layer_output" + description: "Output location for the normalized RNA counts.\n" + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_var_gene_names_output" + description: "Name of the .var column that specifies the gene games.\n" + info: null + default: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_var_index_name_output" + description: "Output name of the index that is used to describe the proteins.\ + \ \n" + info: null + default: + - "prot_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_raw_layer_output" + description: "Output location for the raw protein counts.\n" + info: null + default: + - "X" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_normalized_layer_output" + description: "Output location for the normalized protein counts.\n" + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--tiledb_dir" + description: "Directory where the TileDB output will be written to.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert a MuData or AnnData object to tiledb. Currently, transcriptome\ + \ and protein modalities are supported.\n\nNOTE: The functionality provided by this\ + \ component is experimental and may be subject to change. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "tiledbsoma" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb" + executable: "target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/main.nf b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/main.nf new file mode 100644 index 00000000..ecdce058 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/main.nf @@ -0,0 +1,4791 @@ +// from_h5mu_or_h5ad_to_tiledb main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5mu_or_h5ad_to_tiledb", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input AnnData or MuData file. When an AnnData file is provided, it is automatically assumed to \ncontain transcriptome counts.\n", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "RNA modality", + "arguments" : [ + { + "type" : "string", + "name" : "--rna_modality", + "description" : "The name used for the RNA modality. Used when input file is a MuData object.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_raw_layer_input", + "description" : "Location of the layer containing the raw transcriptome counts. Layers are looked for in .layers,\nexcept when using the value 'X'; in which case .X is used.\n", + "example" : [ + "X" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_normalized_layer_input", + "description" : "Location of the layer containing the normalized counts. Layers are looked for in .layers,\nexcept when using the value 'X'; in which case .X is used.\n", + "example" : [ + "log_normalized" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_var_gene_names_input", + "description" : "Column in .var that provides the gene names. If not specified, the index from the input is used.\n", + "example" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Protein modality", + "arguments" : [ + { + "type" : "string", + "name" : "--prot_modality", + "description" : "The name used for the protein modality. Used when input file is a MuData object.\nWhen not specified, the protein modality will not be processed.\n", + "example" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_raw_layer_input", + "description" : "Location of the layer containing the raw protein counts. Layers are looked for in .layers,\nexcept when using the value 'X'; in which case .X is used.\n", + "example" : [ + "X" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_normalized_layer_input", + "description" : "Location of the layer containing the normalized counts. Layers are looked for in .layers,\nexcept when using the value 'X'; in which case .X is used.\n", + "example" : [ + "clr" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output slots", + "arguments" : [ + { + "type" : "string", + "name" : "--rna_modality_output", + "description" : "TileDB Measurement name where the RNA modality will be stored.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_modality_output", + "description" : "Name of the TileDB Measurement where the protein modality will be stored.\n", + "default" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_index_name_output", + "description" : "Name of the index that is used to describe the cells (observations).\n", + "default" : [ + "cell_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_var_index_name_output", + "description" : "Output name of the index that is used to describe the genes.\n", + "default" : [ + "rna_index" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_raw_layer_output", + "description" : "Output location for the raw transcriptomics counts.\n", + "default" : [ + "X" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_normalized_layer_output", + "description" : "Output location for the normalized RNA counts.\n", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_var_gene_names_output", + "description" : "Name of the .var column that specifies the gene games.\n", + "default" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_var_index_name_output", + "description" : "Output name of the index that is used to describe the proteins. \n", + "default" : [ + "prot_index" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_raw_layer_output", + "description" : "Output location for the raw protein counts.\n", + "default" : [ + "X" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_normalized_layer_output", + "description" : "Output location for the normalized protein counts.\n", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--tiledb_dir", + "description" : "Directory where the TileDB output will be written to.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported.\n\nNOTE: The functionality provided by this component is experimental and may be subject to change. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "tiledbsoma" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import anndata +import numpy as np +import tiledbsoma + +# This import seems redundant, but it is required! +import tiledbsoma.io +import pandas as pd +from functools import singledispatch +from collections.abc import Mapping +import h5py +from shutil import copy2, rmtree +from collections.abc import Callable +from functools import partial +from pathlib import Path +from tempfile import NamedTemporaryFile +import pyarrow as pa +from collections import defaultdict +import json + +anndata.settings.allow_write_nullable_strings = True + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_modality': $( if [ ! -z ${VIASH_PAR_RNA_MODALITY+x} ]; then echo "r'${VIASH_PAR_RNA_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_raw_layer_input': $( if [ ! -z ${VIASH_PAR_RNA_RAW_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_RAW_LAYER_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_normalized_layer_input': $( if [ ! -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_NORMALIZED_LAYER_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_var_gene_names_input': $( if [ ! -z ${VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_GENE_NAMES_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_modality': $( if [ ! -z ${VIASH_PAR_PROT_MODALITY+x} ]; then echo "r'${VIASH_PAR_PROT_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_raw_layer_input': $( if [ ! -z ${VIASH_PAR_PROT_RAW_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_PROT_RAW_LAYER_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_normalized_layer_input': $( if [ ! -z ${VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT+x} ]; then echo "r'${VIASH_PAR_PROT_NORMALIZED_LAYER_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_modality_output': $( if [ ! -z ${VIASH_PAR_RNA_MODALITY_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_MODALITY_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_modality_output': $( if [ ! -z ${VIASH_PAR_PROT_MODALITY_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_MODALITY_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_index_name_output': $( if [ ! -z ${VIASH_PAR_OBS_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBS_INDEX_NAME_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_var_index_name_output': $( if [ ! -z ${VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_INDEX_NAME_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_raw_layer_output': $( if [ ! -z ${VIASH_PAR_RNA_RAW_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_RAW_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_normalized_layer_output': $( if [ ! -z ${VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_NORMALIZED_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rna_var_gene_names_output': $( if [ ! -z ${VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT+x} ]; then echo "r'${VIASH_PAR_RNA_VAR_GENE_NAMES_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_var_index_name_output': $( if [ ! -z ${VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_VAR_INDEX_NAME_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_raw_layer_output': $( if [ ! -z ${VIASH_PAR_PROT_RAW_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_RAW_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'prot_normalized_layer_output': $( if [ ! -z ${VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_PROT_NORMALIZED_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'tiledb_dir': $( if [ ! -z ${VIASH_PAR_TILEDB_DIR+x} ]; then echo "r'${VIASH_PAR_TILEDB_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +LAYER_ARGUMENTS = ( + "rna_raw_layer_input", + "rna_raw_layer_output", + "rna_normalized_layer_input", + "rna_normalized_layer_output", + "prot_raw_layer_input", + "prot_raw_layer_output", + "prot_normalized_layer_input", + "prot_normalized_layer_output", +) + + +multidim_column_indices = {} + + +def _convert_to_array(mod_name, _, group): + """ + tiledb SOMA does not support pandas dataframes in varm and obsm + Convert those to structured arrays and store column index as metadata + """ + entries = group.keys() + for entry in entries: + encoding = group[entry].attrs["encoding-type"] + if encoding in ("array", "csr_matrix", "csc_matrix"): + return + if encoding in { + "dataframe", + "nullable-integer", + "nullable-boolean", + "nullable-string-array", + }: + df = anndata.io.read_elem(group[entry]) + data = df.to_numpy() + anndata.io.write_elem(group, entry, data) + index_to_write = df.columns + # No need to store write the index to tileDB in the case its just the integer index + if not isinstance(index_to_write, pd.RangeIndex): + multidim_column_indices.setdefault(mod_name, {}).setdefault( + group.name.strip("/"), {} + ).update({entry: index_to_write.to_list()}) + return + logger.info( + "Deleting %s because it is of encoding %s, which is not supported.", + f"{group.name}/{entry}", + encoding, + ) + del group[entry] + + +def _read_obs(anndata_file): + with h5py.File(anndata_file, "r") as open_rna: + return _to_pandas_nullable(anndata.io.read_elem(open_rna["obs"])) + + +def _write_obs(anndata_file, obs_df): + with h5py.File(anndata_file, "r+") as open_h5: + anndata.io.write_elem(open_h5, "obs", obs_df) + + +def _log_arguments(function_obj, arg_dict): + """ + Format a dictionairy of arguments into a string that is put into the script logs. + """ + args_str = [f"\\\\t{param}: {param_val}\\\\n" for param, param_val in arg_dict.items()] + logger.info( + "Calling %s with arguments:\\\\n%s", + function_obj.__name__, + "".join(args_str).rstrip(), + ) + + +def _to_pandas_nullable(table_or_df: pa.Table | pd.DataFrame) -> pd.DataFrame: + """ + Convert pyarrow Table or pandas dataframe to pandas dataframe that uses nullable data types. + + This function takes the following into account: + 1. AnnData supports writing BooleanDtype, IntegerDtype and StringDtype (which uses pd.NA) + 2. AnnData does *not* support FloatingDtype + 3. AnnData does support numpy-stype floating and integer dtypes (which use np.nan). + 4. TileDB SOMA's \\`update_obs\\` does not allow casting floats to integers (potential information loss). + 5. When pandas's \\`convert_dtypes\\` convert_integer argument is set to True, + floats will be converted to integer if they can be 'faithfully' casted. + + Giving the previous constraints, it is *not* possible to only convert integers using \\`convert_types\\` because + it might cause some float columns to be converted to integers (which is incompatible with tiledbsoma). + Therefore, this function will leave integers and floats as-is. + + Based on the conversion provided by this function, a suitable na representation (pd.NA or np.NA) for a column can + be chosen using \\`pd.api.types.is_extension_array_dtype(column_dtype)\\` + """ + try: + df = table_or_df.to_pandas( + types_mapper=defaultdict(None, {pa.large_string(): pd.StringDtype()}).get + ) + except AttributeError: + df = table_or_df.convert_dtypes( + infer_objects=False, + convert_string=True, + convert_integer=False, + convert_boolean=False, + convert_floating=False, + ) + return df.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_integer=False, + convert_boolean=True, + convert_floating=False, + ) + + +def _get_temp_h5ad(): + return Path(NamedTemporaryFile(suffix=".h5ad", dir=meta["temp_dir"]).name) + + +def _h5mu_to_h5ad(h5mu_path, modality_name): + """ + Create an anndata file from a modality in a mudata file by + copying over the relevant h5 objects. + """ + result = _get_temp_h5ad() + with h5py.File(h5mu_path, "r") as open_anndata: + with h5py.File(result, "w") as file_to_write: + for h5_key in open_anndata["mod"][modality_name].keys(): + open_anndata.copy( + source=f"/mod/{modality_name}/{h5_key}", dest=file_to_write + ) + return result + + +def _handle_layer_location(layer_name): + """ + Handle the special case where 'X' is specified as layer + this layer is not present in \\`/layers/X\\` but in \\`/X\\`. + """ + prefix = "layers/" if layer_name != "X" else "" + return f"{prefix}{layer_name}" + + +def _check_input_args(par, input, is_mudata): + """ + Check the input arguments: + * Arguments for protein modalities are not allowed when the input is not a MuData file. + * Arguments for input layers may not point to the same layer + * Output locations for layers may not be the same + * Input modalities must not be the same + * Input modalities must exist + """ + if not is_mudata: + for par_name in ( + "prot_modality", + "prot_raw_layer_input", + "prot_normalized_layer_input", + ): + if par[par_name]: + raise ValueError( + f"'{par_name}' can not be used when the input is not a MuData file." + ) + + if par["prot_modality"]: + for prot_arg in ("prot_raw_layer_input", "prot_raw_layer_output"): + if not par[prot_arg]: + ValueError( + f"When providing 'prot_modality', '{prot_arg}' must also be set." + ) + + NOT_SAME_VALUE = ( + ("rna_raw_layer_input", "rna_normalized_layer_input"), + ("prot_raw_layer_input", "prot_normalized_layer_input"), + ("rna_raw_layer_output", "rna_normalized_layer_output"), + ("prot_raw_layer_output", "prot_normalized_layer_output"), + ("rna_modality", "prot_modality"), + ) + for layer1_arg, layer2_arg in NOT_SAME_VALUE: + arg1_val, arg2_val = par[layer1_arg], par[layer2_arg] + if (arg1_val == arg2_val) and arg1_val is not None: + raise ValueError( + f"The value for argument '{layer1_arg}' ({arg1_val}) must not be the same as for argument '{layer2_arg}' ({arg2_val})" + ) + + with h5py.File(input, "r") as open_h5: + if "mod" in open_h5: + available_modalities = tuple(open_h5["mod"].keys()) + for mod in (par["rna_modality"], par["prot_modality"]): + if mod is not None and f"/mod/{mod}" not in open_h5: + raise ValueError( + f"Modality '{mod}' was not found in object, available modalities: {available_modalities}" + ) + + +def _copy_layer(source: str, dest: str, input_h5ad: Path, mod_obj: h5py.Group): + """ + Copy a h5py object from one location (source) to a new location (dest). + The source location must be located in the input AnnData file, the destination + will be written to a h5py Group object. + """ + logger.info("Copying layer '%s' from '%s' to '%s'", source, input_h5ad, dest) + if source == f"{mod_obj.name}/{dest}": + logger.info("Layer is already at the correct location. Nothing to do.") + return + if not input_h5ad.is_file(): + raise FileNotFoundError(f"Could not link to {input_h5ad}: file does not exist.") + with h5py.File(input_h5ad) as open_input: + if source not in open_input: + raise ValueError(f"Layer {source} was not found in {input_h5ad}.") + if dest in mod_obj: + logger.info(f"{dest} is already present. Removing before creating link.") + del mod_obj[dest] + # ExternalLink could have been used instead of 'copy', but it does not seem to work with tileDB + with h5py.File(str(input_h5ad), "r") as open_input: + open_input.copy( + source=open_input[source], dest=mod_obj, name=dest, expand_external=True + ) + logger.info("Copying complete.") + + +def _copy_column(src, dest, _, df_h5_obj): + """ + Duplicate a column in a dataframe and give a new name. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Copying column '%s' to '%s' for '%s'.", src, dest, df_h5_obj.name) + columns = df_h5_obj.attrs["column-order"] + if src == dest: + logger.info("Source and destination for the column are the same, not copying.") + return None + if dest in columns: + raise ValueError(f"Column {dest} already exists in {df_h5_obj.name}.") + df_h5_obj.attrs["column-order"] = np.append(columns, dest) + if src is None: + # Use the index as source. + src = df_h5_obj.attrs["_index"] + # Both the index and columns are written as keys to df_h5_obj + # An index name may be allowed to also be column name _only_ when + # the column and the index have the same contents (this is an anndata requirement). + # Here, this is always the case. + if src in df_h5_obj.attrs["column-order"]: + return None + df_h5_obj.copy(src, dest, expand_refs=True, expand_soft=True) + + +def _set_index_name(name, _, df_h5_obj): + """ + Set the index for a dataframe to an existing column. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Setting index name of '%s' to '%s'.", df_h5_obj.name, name) + original_index_name = df_h5_obj.attrs["_index"] + # An index and a column may share a key in df_h5_obj + # In this case we must not remove the original key from the object + operator = ( + df_h5_obj.move + if original_index_name not in df_h5_obj.attrs["column-order"] + else df_h5_obj.copy + ) + operator(original_index_name, name) + df_h5_obj.attrs["_index"] = name + logger.info("Done setting index name") + + +def _delete_all_keys(_, group_obj, exception=None): + """ + Delete all keys from a h5py.Group object; which optional exceptions. + """ + if exception is None: + exception = () + logger.info( + "Deleting all keys from '%s'%s.", + group_obj.name, + f" except {', '.join(exception)}" if exception else "", + ) + keys_to_delete = set(group_obj.keys()) - set(exception) + if not keys_to_delete: + logger.info("No keys to delete.") + return + for key in keys_to_delete: + del group_obj[key] + logger.info("Done deleting keys %s.", ", ".join(keys_to_delete)) + + +def _set_index_to_string_dtype(_, df_obj): + """ + Change the index dataype for a dataframe to string when it is encoded as categorical. + The dataframe must be provided as a h5py Group object that has been encoded by AnnData. + """ + logger.info( + "Checking if index of object '%s' is encoded as a categorical.", df_obj.name + ) + index_col_name = df_obj.attrs["_index"] + index_col = df_obj[index_col_name] + logger.info("Column '%s' is being used as the index.", index_col_name) + index_col_dtype = index_col.attrs["encoding-type"] + if index_col_dtype == "categorical": + logger.info( + "Column '%s' is encoded as categorical, converting to string.", + index_col_name, + ) + is_ordered = bool(index_col.attrs.get("ordered", False)) + codes, categories = index_col["codes"], index_col["categories"] + column = pd.Categorical.from_codes(codes, categories, is_ordered) + column_str = column.astype(str).astype(object) + del df_obj[index_col_name] + df_obj.create_dataset(index_col_name, data=column_str) + index_col.attrs["encoding-type"] = "string-array" + logger.info("Conversion to string dtype complete.") + return + logger.info( + "Column '%s' not encoded as categorical. Leaving as is.", index_col_name + ) + + +def _get_rna_conversion_specification(par): + rna_spec = [ + partial(_copy_layer, par["rna_raw_layer_input"], par["rna_raw_layer_output"]), + partial( + _copy_layer, + par["rna_normalized_layer_input"], + par["rna_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "rna"), + "obsm": partial(_convert_to_array, "rna"), + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + "var": [ + partial( + _copy_column, + par["rna_var_gene_names_input"], + par["rna_var_gene_names_output"], + ), + partial(_set_index_name, par["rna_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "obs": partial(_set_index_name, par["obs_index_name_output"]), + "layers": partial( + _delete_all_keys, + exception=( + par["rna_raw_layer_output"].removeprefix("layers/"), + par["rna_normalized_layer_output"].removeprefix("layers/"), + ), + ), + }, + ] + return rna_spec + + +def _get_prot_conversion_specification(par): + prot_spec = [ + partial(_copy_layer, par["prot_raw_layer_input"], par["prot_raw_layer_output"]), + partial( + _copy_layer, + par["prot_normalized_layer_input"], + par["prot_normalized_layer_output"], + ), + { + "varm": partial(_convert_to_array, "prot"), + "obsm": partial(_convert_to_array, "prot"), + "var": [ + partial(_set_index_name, par["prot_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + # "varm": _delete_all_keys, + "layers": partial( + _delete_all_keys, + exception=( + par["prot_raw_layer_output"].removeprefix("layers/"), + par["prot_normalized_layer_output"].removeprefix("layers/"), + ), + ), + "obs": partial(_set_index_name, par["obs_index_name_output"]), + }, + ] + + return prot_spec + + +def _copy_anndata_and_convert_inplace(anndata_path, conversion_specification): + converted_anndata = _get_temp_h5ad() + copy2(anndata_path, converted_anndata) + with h5py.File(converted_anndata, "r+") as open_prot_h5: + apply_conversion(conversion_specification, anndata_path, open_prot_h5) + return converted_anndata + + +@singledispatch +def apply_conversion(conversion_specification, input_h5ad, open_h5): + raise NotImplementedError( + f"Error: function 'apply_conversion' is not implemented for type '{type(conversion_specification)}'" + ) + + +@apply_conversion.register +def _(conversion_specification: list, input_h5ad, open_h5): + for item in conversion_specification: + apply_conversion(item, input_h5ad, open_h5) + + +@apply_conversion.register +def _(conversion_specifications: Mapping, input_h5ad, open_h5): + for h5_key, conversion_spec in conversion_specifications.items(): + if hasattr(open_h5, h5_key): + h5_obj = getattr(open_h5, h5_key) + else: + h5_obj = open_h5[h5_key] + apply_conversion(conversion_spec, input_h5ad, h5_obj) + + +@apply_conversion.register +def _(conversion_specifications: Callable, input_h5ad, open_h5): + conversion_specifications(input_h5ad, open_h5) + + +def _write_varm_obsm_indices(output_dir, mod_name, multidim_column_indices): + # TileDB SOMA does not support column indices for .varm and obsm matrices + # As a workaround, we will store these in the metadata slot + for multidim_key in ("varm", "obsm"): + collection_name = f"{str(output_dir)}/ms/{mod_name}/{multidim_key}" + if tiledbsoma.Collection.exists(collection_name): + with tiledbsoma.Collection.open( + collection_name, "w" + ) as multidim_collection: + for item in multidim_collection: + index_to_write = ( + multidim_column_indices.get(mod_name, {}) + .get(multidim_key, {}) + .get(item, None) + ) + if index_to_write: + multidim_collection[item].metadata["column_index"] = json.dumps( + index_to_write + ) + + +def _remove_directory(dir_path): + """ + Check if a directory exists and remove it does. + """ + if dir_path.exists(): + try: + # Delete the directory and all its contents + for item in dir_path.iterdir(): + if item.is_file() or item.is_symlink(): + item.unlink() # Removes files or symbolic links + elif item.is_dir(): + rmtree(item) # Removes directories recursively + logger.info("Directory '%s' has been deleted successfully.", dir_path) + except FileNotFoundError: + logger.info("Directory '%s' not found.", dir_path) + except PermissionError as e: + raise ValueError( + f"Permission denied: Unable to delete '{dir_path}'." + ) from e + + +def _align_obs_columns(experiment_df, anndata_df): + def _create_na_columns(df_to_write, columns, dtype_df): + """ + Create new columns in a dataframe with NA's as content while + making sure that the datatype of the newly created columns match + with those from another dataframe. + """ + for col in columns: + dtype = dtype_df[col].dtype + na = pd.NA if pd.api.types.is_extension_array_dtype(dtype) else np.nan + df_to_write.loc[:, col] = pd.Series( + na, index=df_to_write.index, dtype=dtype + ) + return df_to_write + + logger.info("Making sure obs columns are aligned.") + contents_columns = set(experiment_df.columns) + logger.info("Already ingested .obs columns: %s", ", ".join(contents_columns)) + logger.info("Retreiving obs columns to be added.") + + obs_to_add_columns = set(anndata_df.columns) + + logger.info("Checking dtype of common columns") + common_columns = obs_to_add_columns.intersection(contents_columns) + common_dtypes = {} + for column_name in common_columns: + # Combine the two columns, but only keep the calculated dtype. + # We'll update the data types in the two original frames and let tiledb do the joining. + new_series = experiment_df[column_name].combine_first(anndata_df[column_name]) + result_dtype = new_series.dtype + # Nullable floats are not supported by anndata... + if pd.api.types.is_float_dtype( + result_dtype + ) and pd.api.types.is_extension_array_dtype(result_dtype): + result_dtype = result_dtype.numpy_dtype + common_dtypes[column_name] = result_dtype + + logger.info( + "Updating the dtypes to comply with the following schema: %s", common_dtypes + ) + experiment_df, anndata_df = ( + experiment_df.astype(common_dtypes), + anndata_df.astype(common_dtypes), + ) + + logger.info("Will add the following columns: %s", ", ".join(obs_to_add_columns)) + missing_columns_in_old = obs_to_add_columns - contents_columns + logger.info( + "Adding the following columns to the schema: %s.", + ", ".join(missing_columns_in_old), + ) + experiment_df = _create_na_columns( + experiment_df, missing_columns_in_old, anndata_df + ) + + logger.info("Adjusting .obs succeeded!") + missing_columns_in_new = ( + contents_columns + - obs_to_add_columns + - {"soma_joinid", par["obs_index_name_output"]} + ) + logger.info( + "Adding the following columns to modality to be ingested: %s", + ", ".join(missing_columns_in_new), + ) + anndata_df = _create_na_columns(anndata_df, missing_columns_in_new, experiment_df) + return experiment_df, anndata_df + + +def main(par): + logger.info(f"Component {meta['name']} started.") + par["input"], par["tiledb_dir"] = Path(par["input"]), Path(par["tiledb_dir"]) + + # Handle case where input is a mudat file + with open(par["input"], "rb") as open_input_file: + # It is possible to create a MuData compatible h5 file without using + # MuData, and those could not have "MuData" as the first bytes. + # But that is really an edge case and this check should hold. + is_mudata = open_input_file.read(6) == b"MuData" + + _check_input_args(par, par["input"], is_mudata) + + # Reference layers other than "X" refer as "layer/" + for arg_name in LAYER_ARGUMENTS: + par[arg_name] = _handle_layer_location(par[arg_name]) + + rna_input = par["input"] + # Create a temporary file that holds the converted RNA h5ad + if is_mudata: + if not par["rna_modality"]: + raise ValueError( + "'rna_modality' argument must be set if the input is a MuData file." + ) + # If the input is a MuData file, take the RNA modality and use that as input instead + rna_input = _h5mu_to_h5ad(par["input"], par["rna_modality"]) + + # Put a copy of the RNA input in the output file and convert it in-place + rna_converted = _copy_anndata_and_convert_inplace( + rna_input, _get_rna_conversion_specification(par) + ) + + if par["prot_modality"]: + prot_input = _h5mu_to_h5ad(par["input"], par["prot_modality"]) + prot_converted = _copy_anndata_and_convert_inplace( + prot_input, _get_prot_conversion_specification(par) + ) + + logger.info( + "Making sure that obs columns are shared between the two modalities." + ) + contents, obs_to_add = _read_obs(rna_converted), _read_obs(prot_converted) + contents, obs_to_add = _align_obs_columns(contents, obs_to_add) + _write_obs(prot_converted, obs_to_add) + _write_obs(rna_converted, contents) + + tiledbsoma.logging.info() + _remove_directory(par["tiledb_dir"]) + + logger.info("Ingesting RNA modality") + rna_from_h5ad_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": rna_converted, + "measurement_name": par["rna_modality_output"], + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, rna_from_h5ad_args) + func_to_call(**rna_from_h5ad_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "rna", multidim_column_indices) + + logger.info("Done ingesting RNA modality") + + if par["prot_modality"]: + logger.info("Ingesting protein modality") + + logger.info("Initalizing prot modality schema.") + # Note the 'schema_only' + prot_from_h5ad_schema_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "uns_keys": [], + "ingest_mode": "schema_only", + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_from_h5ad_schema_args) + func_to_call(**prot_from_h5ad_schema_args) + + logger.info("Registering prot modality anndata") + # Register the second anndata object in the protein measurement + register_prot_args = { + "experiment_uri": str(par["tiledb_dir"]), + "h5ad_file_names": [prot_converted], + "measurement_name": par["prot_modality_output"], + "obs_field_name": par["obs_index_name_output"], + "var_field_name": par["prot_var_index_name_output"], + "append_obsm_varm": True, + } + func_to_call = tiledbsoma.io.register_h5ads + _log_arguments(func_to_call, register_prot_args) + rd = func_to_call(**register_prot_args) + + rd.prepare_experiment(str(par["tiledb_dir"])) + + # Ingest the second anndata object into the protein measurement + prot_write_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "registration_mapping": rd, + "ingest_mode": "write", + "uns_keys": [], + "X_layer_name": "raw", + } + func_to_call = tiledbsoma.io.from_h5ad + _log_arguments(func_to_call, prot_write_args) + func_to_call(**prot_write_args) + + _write_varm_obsm_indices(par["tiledb_dir"], "prot", multidim_column_indices) + + logger.info("Finished!") + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5mu_or_h5ad_to_tiledb", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow.config b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow.config new file mode 100644 index 00000000..f1ac87e6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5mu_or_h5ad_to_tiledb' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported.\n\nNOTE: The functionality provided by this component is experimental and may be subject to change. \n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_schema.json b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_schema.json new file mode 100644 index 00000000..8b11e9e7 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/nextflow_schema.json @@ -0,0 +1,185 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5mu_or_h5ad_to_tiledb", + "description": "Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported.\n\nNOTE: The functionality provided by this component is experimental and may be subject to change. \n", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input AnnData or MuData file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + } + } + }, + "rna modality": { + "title": "RNA modality", + "type": "object", + "description": "No description", + "properties": { + "rna_modality": { + "type": "string", + "description": "The name used for the RNA modality", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "rna_raw_layer_input": { + "type": "string", + "description": "Location of the layer containing the raw transcriptome counts", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"X\"`. " + }, + "rna_normalized_layer_input": { + "type": "string", + "description": "Location of the layer containing the normalized counts", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"log_normalized\"`. " + }, + "rna_var_gene_names_input": { + "type": "string", + "description": "Column in .var that provides the gene names", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_symbol\"`. " + } + } + }, + "protein modality": { + "title": "Protein modality", + "type": "object", + "description": "No description", + "properties": { + "prot_modality": { + "type": "string", + "description": "The name used for the protein modality", + "help_text": "Type: `string`, multiple: `False`, example: `\"prot\"`. " + }, + "prot_raw_layer_input": { + "type": "string", + "description": "Location of the layer containing the raw protein counts", + "help_text": "Type: `string`, multiple: `False`, example: `\"X\"`. " + }, + "prot_normalized_layer_input": { + "type": "string", + "description": "Location of the layer containing the normalized counts", + "help_text": "Type: `string`, multiple: `False`, example: `\"clr\"`. " + } + } + }, + "output slots": { + "title": "Output slots", + "type": "object", + "description": "No description", + "properties": { + "rna_modality_output": { + "type": "string", + "description": "TileDB Measurement name where the RNA modality will be stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "prot_modality_output": { + "type": "string", + "description": "Name of the TileDB Measurement where the protein modality will be stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot\"`. ", + "default": "prot" + }, + "obs_index_name_output": { + "type": "string", + "description": "Name of the index that is used to describe the cells (observations).\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"cell_id\"`. ", + "default": "cell_id" + }, + "rna_var_index_name_output": { + "type": "string", + "description": "Output name of the index that is used to describe the genes.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna_index\"`. ", + "default": "rna_index" + }, + "rna_raw_layer_output": { + "type": "string", + "description": "Output location for the raw transcriptomics counts.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"X\"`. ", + "default": "X" + }, + "rna_normalized_layer_output": { + "type": "string", + "description": "Output location for the normalized RNA counts.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + }, + "rna_var_gene_names_output": { + "type": "string", + "description": "Name of the .var column that specifies the gene games.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene_symbol\"`. ", + "default": "gene_symbol" + }, + "prot_var_index_name_output": { + "type": "string", + "description": "Output name of the index that is used to describe the proteins", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot_index\"`. ", + "default": "prot_index" + }, + "prot_raw_layer_output": { + "type": "string", + "description": "Output location for the raw protein counts.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"X\"`. ", + "default": "X" + }, + "prot_normalized_layer_output": { + "type": "string", + "description": "Output location for the normalized protein counts.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "tiledb_dir": { + "type": "string", + "format": "path", + "description": "Directory where the TileDB output will be written to.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.tiledb_dir\"`, direction: `output`. ", + "default": "$id.$key.tiledb_dir" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/rna modality" + }, + { + "$ref": "#/$defs/protein modality" + }, + { + "$ref": "#/$defs/output slots" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_or_h5ad_to_tiledb/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/.config.vsh.yaml b/target/nextflow/convert/from_h5mu_to_h5ad/.config.vsh.yaml new file mode 100644 index 00000000..67b4bd7a --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/.config.vsh.yaml @@ -0,0 +1,262 @@ +name: "from_h5mu_to_h5ad" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input MuData file" + info: null + default: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output AnnData file." + info: null + default: + - "output.h5ad" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts a h5mu file into a h5ad file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_to_h5ad/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5mu_to_h5ad" + executable: "target/nextflow/convert/from_h5mu_to_h5ad/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/main.nf b/target/nextflow/convert/from_h5mu_to_h5ad/main.nf new file mode 100644 index 00000000..7680029b --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/main.nf @@ -0,0 +1,3932 @@ +// from_h5mu_to_h5ad main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5mu_to_h5ad", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input MuData file", + "default" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output AnnData file.", + "default" : [ + "output.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts a h5mu file into a h5ad file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5mu_to_h5ad/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5mu_to_h5ad", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# TODO: Merge modalities into one layer + +logger.info("Reading input h5mu file %s, modality %s", par["input"], par["modality"]) +adat = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Writing to %s.", par["output"]) +adat.write_h5ad(par["output"], compression=par["output_compression"]) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5mu_to_h5ad", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/nextflow.config b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow.config new file mode 100644 index 00000000..f9004b62 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5mu_to_h5ad' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts a h5mu file into a h5ad file.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_labels.config b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_schema.json b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_schema.json new file mode 100644 index 00000000..73bc1d1d --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/nextflow_schema.json @@ -0,0 +1,65 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5mu_to_h5ad", + "description": "Converts a h5mu file into a h5ad file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input MuData file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"input.h5mu\"`, direction: `input`. ", + "default": "input.h5mu" + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output AnnData file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"output.h5ad\"`, direction: `output`. ", + "default": "output.h5ad" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/from_h5mu_to_h5ad/setup_logger.py b/target/nextflow/convert/from_h5mu_to_h5ad/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_h5ad/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/convert/from_h5mu_to_seurat/.config.vsh.yaml b/target/nextflow/convert/from_h5mu_to_seurat/.config.vsh.yaml new file mode 100644 index 00000000..f1a952b5 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_seurat/.config.vsh.yaml @@ -0,0 +1,237 @@ +name: "from_h5mu_to_seurat" +namespace: "convert" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output Seurat file" + info: null + example: + - "output.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only\ + \ the intersection of cells is currently loaded into the Seurat object due to the\ + \ object structure limitation.\n - Multimodal embeddings (global .obsm slot) are\ + \ loaded with the assay.used field set to the default assay.\n - Embeddings names\ + \ are changed in order to comply with R & Seurat requirements and conventions.\n\ + \ - Feature names with underscores ('_') are automatically replaced with dashes\ + \ ('-')\n - Seurat does not support global variables metadata /var.\n" +test_resources: +- type: "r_script" + path: "run_test.R" + is_executable: true +- type: "file" + path: "10x_5k_anticmv" +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "libgeos-dev" + interactive: false + - type: "r" + cran: + - "anndata" + - "hdf5r" + - "testthat" + - "SeuratObject" + - "Seurat" + bioc_force_install: false + warnings_as_errors: true + - type: "r" + github: + - "pmbio/MuDataSeurat@empty-tables-and-nullable" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/convert/from_h5mu_to_seurat/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/from_h5mu_to_seurat" + executable: "target/nextflow/convert/from_h5mu_to_seurat/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/from_h5mu_to_seurat/main.nf b/target/nextflow/convert/from_h5mu_to_seurat/main.nf new file mode 100644 index 00000000..f7262e66 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_seurat/main.nf @@ -0,0 +1,3949 @@ +// from_h5mu_to_seurat main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "from_h5mu_to_seurat", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output Seurat file", + "example" : [ + "output.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only the intersection of cells is currently loaded into the Seurat object due to the object structure limitation.\n - Multimodal embeddings (global .obsm slot) are loaded with the assay.used field set to the default assay.\n - Embeddings names are changed in order to comply with R & Seurat requirements and conventions.\n - Feature names with underscores ('_') are automatically replaced with dashes ('-')\n - Seurat does not support global variables metadata /var.\n", + "test_resources" : [ + { + "type" : "r_script", + "path" : "run_test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv/" + } + ], + "status" : "deprecated", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:24.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "libgeos-dev" + ], + "interactive" : false + }, + { + "type" : "r", + "cran" : [ + "anndata", + "hdf5r", + "testthat", + "SeuratObject", + "Seurat" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + }, + { + "type" : "r", + "github" : [ + "pmbio/MuDataSeurat@empty-tables-and-nullable" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/convert/from_h5mu_to_seurat/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/from_h5mu_to_seurat", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(MuDataSeurat) +library(hdf5r) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + + +temp_h5mu <- tempfile(fileext = ".h5mu") +file.copy(par\\$input, temp_h5mu) + +delete_modality <- function(open_h5, modality_path) { + open_h5\\$link_delete(modality_path) + mod_name <- sub("/mod/", "", modality_path) + if ("mod-order" %in% names(hdf5r::h5attributes(open_h5[["mod"]]))) { + current_attributes <- hdf5r::h5attributes(open_h5[["mod"]])\\$\\`mod-order\\` + current_attributes <- current_attributes[current_attributes != mod_name] + hdf5r::h5attr(open_h5[["mod"]], "mod-order") <- current_attributes + } + for (obj_prefix in c("obsm/", "varm/", "varmap/", "obsmap/")) { + obj_path <- paste0(obj_prefix, mod_name) + if (hdf5r::existsGroup(open_h5, obj_path)) { + open_h5\\$link_delete(obj_path) + } + } +} + +open_file <- H5File\\$new(temp_h5mu, mode = "r+") +modalities <- list.groups(open_file[["mod"]], + full.names = TRUE, recursive = FALSE +) +to_delete <- c() + +determine_matrix_dims <- function(dataset, indexpointers, indices, rowwise) { + if ("shape" %in% hdf5r::h5attr_names(dataset)) { + x_dims <- hdf5r::h5attr(dataset, "shape") + } else { + x_dims <- c(length(indexpointers) - 1, max(indices) + 1) + if (rowwise) { + x_dims <- rev(x_dims) + } + } + x_dims +} +for (modality_path in modalities) { + dataset <- open_file[[modality_path]][["X"]] + dataset_names <- names(dataset) + if ( + "data" %in% dataset_names && + "indices" %in% dataset_names && + "indptr" %in% dataset_names + ) { + indexpointers <- dataset[["indptr"]]\\$read() + indices <- dataset[["indices"]]\\$read() + rowwise <- FALSE + if ("encoding-type" %in% h5attr_names(dataset)) { + rowwise <- h5attr(dataset, "encoding-type") == "csr_matrix" + } + x_dims <- determine_matrix_dims(dataset, indexpointers, indices, rowwise) + if (x_dims[2] < 1) { + delete_modality(open_file, modality_path) + } + } else if (dataset\\$dims[1] < 1) { + delete_modality(open_file, modality_path) + } +} + +open_file\\$close_all() + +cat("Reading input file\\\\n") +obj <- ReadH5MU(temp_h5mu) + +cat("Writing output file\\\\n") +saveRDS(obj, file = par\\$output, compress = TRUE) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/from_h5mu_to_seurat", + "tag" : "main" + }, + "label" : [ + "lowmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/from_h5mu_to_seurat/nextflow.config b/target/nextflow/convert/from_h5mu_to_seurat/nextflow.config new file mode 100644 index 00000000..9d5fb9b4 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_seurat/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/from_h5mu_to_seurat' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only the intersection of cells is currently loaded into the Seurat object due to the object structure limitation.\n - Multimodal embeddings (global .obsm slot) are loaded with the assay.used field set to the default assay.\n - Embeddings names are changed in order to comply with R & Seurat requirements and conventions.\n - Feature names with underscores (\'_\') are automatically replaced with dashes (\'-\')\n - Seurat does not support global variables metadata /var.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/from_h5mu_to_seurat/nextflow_labels.config b/target/nextflow/convert/from_h5mu_to_seurat/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_seurat/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/from_h5mu_to_seurat/nextflow_schema.json b/target/nextflow/convert/from_h5mu_to_seurat/nextflow_schema.json new file mode 100644 index 00000000..1baeffe0 --- /dev/null +++ b/target/nextflow/convert/from_h5mu_to_seurat/nextflow_schema.json @@ -0,0 +1,49 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "from_h5mu_to_seurat", + "description": "Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only the intersection of cells is currently loaded into the Seurat object due to the object structure limitation.\n - Multimodal embeddings (global .obsm slot) are loaded with the assay.used field set to the default assay.\n - Embeddings names are changed in order to comply with R & Seurat requirements and conventions.\n - Feature names with underscores ('_') are automatically replaced with dashes ('-')\n - Seurat does not support global variables metadata /var.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output Seurat file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.rds\"`, direction: `output`, example: `\"output.rds\"`. ", + "default": "$id.$key.output.rds" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/convert/velocyto_to_h5mu/.config.vsh.yaml b/target/nextflow/convert/velocyto_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..fd564c98 --- /dev/null +++ b/target/nextflow/convert/velocyto_to_h5mu/.config.vsh.yaml @@ -0,0 +1,322 @@ +name: "velocyto_to_h5mu" +namespace: "convert" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input_loom" + description: "Path to the input loom file." + info: null + example: + - "input.loom" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_h5mu" + description: "If a MuData file is provided," + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The name of the modality to operate on." + info: null + default: + - "rna_velocity" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Path to the output MuData file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_spliced" + description: "Output layer for the spliced reads." + info: null + default: + - "velo_spliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_unspliced" + description: "Output layer for the unspliced reads." + info: null + default: + - "velo_unspliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_ambiguous" + description: "Output layer for the ambiguous reads." + info: null + default: + - "velo_ambiguous" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file\ + \ is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "loompy" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/velocyto_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/convert/velocyto_to_h5mu" + executable: "target/nextflow/convert/velocyto_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/convert/velocyto_to_h5mu/main.nf b/target/nextflow/convert/velocyto_to_h5mu/main.nf new file mode 100644 index 00000000..85439478 --- /dev/null +++ b/target/nextflow/convert/velocyto_to_h5mu/main.nf @@ -0,0 +1,4039 @@ +// velocyto_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer, author) +// * Robrecht Cannoodt (author) +// * Angela Oliveira Pisco (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "velocyto_to_h5mu", + "namespace" : "convert", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input_loom", + "description" : "Path to the input loom file.", + "example" : [ + "input.loom" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_h5mu", + "description" : "If a MuData file is provided,", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "The name of the modality to operate on.", + "default" : [ + "rna_velocity" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Path to the output MuData file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_spliced", + "description" : "Output layer for the spliced reads.", + "default" : [ + "velo_spliced" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_unspliced", + "description" : "Output layer for the unspliced reads.", + "default" : [ + "velo_unspliced" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_ambiguous", + "description" : "Output layer for the ambiguous reads.", + "default" : [ + "velo_ambiguous" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "loompy" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/velocity/velocyto_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/convert/velocyto_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import anndata as ad +import mudata as mu +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.string_ = np.bytes_ +numpy_module.unicode_ = np.str_ +sys.modules["numpy"] = numpy_module + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_loom': $( if [ ! -z ${VIASH_PAR_INPUT_LOOM+x} ]; then echo "r'${VIASH_PAR_INPUT_LOOM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_h5mu': $( if [ ! -z ${VIASH_PAR_INPUT_H5MU+x} ]; then echo "r'${VIASH_PAR_INPUT_H5MU//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_spliced': $( if [ ! -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_SPLICED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_unspliced': $( if [ ! -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_UNSPLICED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_ambiguous': $( if [ ! -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then echo "r'${VIASH_PAR_LAYER_AMBIGUOUS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +print("Parameters:", par, flush=True) + +print("Reading AnnData from loom", flush=True) +adata_in = ad.read_loom(par["input_loom"]) +adata_in.var_names = adata_in.var["Accession"] + +print("Creating clean AnnData", flush=True) +adata = ad.AnnData( + X=adata_in.X, + obs=adata_in.obs[[]], + var=adata_in.var[[]], + layers={ + par["layer_spliced"]: adata_in.layers["spliced"], + par["layer_unspliced"]: adata_in.layers["unspliced"], + par["layer_ambiguous"]: adata_in.layers["ambiguous"], + }, +) + +if par["input_h5mu"]: + print("Received input h5mu to read", flush=True) + mdata = mu.read_h5mu(par["input_h5mu"]) + + print(f"Storing AnnData in modality {par['modality']}", flush=True) + mdata.mod[par["modality"]] = adata +else: + print("Creating h5mu from scratch", flush=True) + mdata = mu.MuData({par["modality"]: adata}) + +print("Resulting mudata:", mdata, flush=True) + +print("Writing h5mu to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/convert/velocyto_to_h5mu", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/convert/velocyto_to_h5mu/nextflow.config b/target/nextflow/convert/velocyto_to_h5mu/nextflow.config new file mode 100644 index 00000000..32f07d5c --- /dev/null +++ b/target/nextflow/convert/velocyto_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'convert/velocyto_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n' + author = 'Dries Schaumont, Robrecht Cannoodt, Angela Oliveira Pisco' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/convert/velocyto_to_h5mu/nextflow_labels.config b/target/nextflow/convert/velocyto_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/convert/velocyto_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/convert/velocyto_to_h5mu/nextflow_schema.json b/target/nextflow/convert/velocyto_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..97c2b160 --- /dev/null +++ b/target/nextflow/convert/velocyto_to_h5mu/nextflow_schema.json @@ -0,0 +1,98 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "velocyto_to_h5mu", + "description": "Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input_loom": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input loom file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.loom\"`. " + }, + "input_h5mu": { + "type": "string", + "format": "path", + "description": "If a MuData file is provided,", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "The name of the modality to operate on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna_velocity\"`. ", + "default": "rna_velocity" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Path to the output MuData file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "layer_spliced": { + "type": "string", + "description": "Output layer for the spliced reads.", + "help_text": "Type: `string`, multiple: `False`, default: `\"velo_spliced\"`. ", + "default": "velo_spliced" + }, + "layer_unspliced": { + "type": "string", + "description": "Output layer for the unspliced reads.", + "help_text": "Type: `string`, multiple: `False`, default: `\"velo_unspliced\"`. ", + "default": "velo_unspliced" + }, + "layer_ambiguous": { + "type": "string", + "description": "Output layer for the ambiguous reads.", + "help_text": "Type: `string`, multiple: `False`, default: `\"velo_ambiguous\"`. ", + "default": "velo_ambiguous" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/correction/cellbender_remove_background/.config.vsh.yaml b/target/nextflow/correction/cellbender_remove_background/.config.vsh.yaml new file mode 100644 index 00000000..42753912 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/.config.vsh.yaml @@ -0,0 +1,660 @@ +name: "cellbender_remove_background" +namespace: "correction" +version: "main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered:\ + \ it should include empty droplets." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Full count matrix as an h5mu file, with background RNA removed.\ + \ This file contains all the original droplet barcodes." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_output" + description: "Output layer" + info: null + default: + - "cellbender_corrected" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_background_fraction" + info: null + default: + - "cellbender_background_fraction" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_probability" + info: null + default: + - "cellbender_cell_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_size" + info: null + default: + - "cellbender_cell_size" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_droplet_efficiency" + description: "Name of the column in the .obs dataframe to store the droplet efficiencies\ + \ in.\n" + info: null + default: + - "cellbender_droplet_efficiency" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_scale" + info: null + default: + - "cellbender_latent_scale" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_ambient_expression" + info: null + default: + - "cellbender_ambient_expression" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_expression_encoding" + info: null + default: + - "cellbender_gene_expression_encoding" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean" + name: "--expected_cells_from_qc" + description: "Will use the Cell Ranger QC to determine the estimated number of\ + \ cells" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--expected_cells" + description: "Number of cells expected in the dataset (a rough estimate within\ + \ a factor of 2 is sufficient)." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--total_droplets_included" + description: "The number of droplets from the rank-ordered UMI plot\nthat will\ + \ have their cell probabilities inferred as an\noutput. Include the droplets\ + \ which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should\ + \ be\n'surely empty' droplets.\n" + info: null + example: + - 25000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cell_umi_prior" + description: "Ignore CellBender's heuristic prior estimation, and use this prior\ + \ for UMI counts in cells." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_empty_umi_prior" + description: "Ignore CellBender's heuristic prior estimation, and use this prior\ + \ for UMI counts in empty droplets." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model" + description: "Which model is being used for count data.\n\n* 'naive' subtracts\ + \ the estimated ambient profile.\n* 'simple' does not model either ambient RNA\ + \ or random barcode swapping (for debugging purposes -- not recommended).\n\ + * 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping'\ + \ assumes background RNA comes from random barcode swapping (via PCR chimeras).\n\ + * 'full' uses a combined ambient and swapping model.\n" + info: null + default: + - "full" + required: false + choices: + - "naive" + - "simple" + - "ambient" + - "swapping" + - "full" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of epochs to train." + info: null + default: + - 150 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--low_count_threshold" + description: "Droplets with UMI counts below this number are completely \nexcluded\ + \ from the analysis. This can help identify the correct \nprior for empty droplet\ + \ counts in the rare case where empty \ncounts are extremely high (over 200).\n" + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_dim" + description: "Dimension of latent variable z.\n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_layers" + description: "Dimension of hidden layers in the encoder for z.\n" + info: null + default: + - 512 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--training_fraction" + description: "Training detail: the fraction of the data used for training.\nThe\ + \ rest is never seen by the inference algorithm. Speeds up learning.\n" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--empty_drop_training_fraction" + description: "Training detail: the fraction of the training data each epoch that\ + \ \nis drawn (randomly sampled) from surely empty droplets.\n" + info: null + default: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ignore_features" + description: "Integer indices of features to ignore entirely. In the output\n\ + count matrix, the counts for these features will be unchanged.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--fpr" + description: "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\n\ + of samples which will be jointly analyzed for differential expression.\nA false\ + \ positive is a true signal count that is erroneously removed.\nMore background\ + \ removal is accompanied by more signal removal at\nhigh values of FPR. You\ + \ can specify multiple values, which will\ncreate multiple output files.\n" + info: null + default: + - 0.01 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--exclude_feature_types" + description: "Feature types to ignore during the analysis. These features will\n\ + be left unchanged in the output file.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--projected_ambient_count_threshold" + description: "Controls how many features are included in the analysis, which\n\ + can lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD\ + \ counts total in all cells\n(summed), then that gene is excluded, and it will\ + \ be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD\ + \ = 0 will include all features\nwhich have even a single count in any empty\ + \ droplet.\n" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate" + description: "Training detail: lower learning rate for inference.\nA OneCycle\ + \ learning rate schedule is used, where the\nupper learning rate is ten times\ + \ this value. (For this\nvalue, probably do not exceed 1e-3).\n" + info: null + default: + - 1.0E-4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--final_elbo_fail_fraction" + description: "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO\ + \ - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically\ + \ re-run if --num-training-tries > 1.\nBy default, will not fail training based\ + \ on final_training_ELBO.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--epoch_elbo_fail_fraction" + description: "Training is considered to have failed if \n(previous_epoch_test_ELBO\ + \ - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)\ + \ > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries\ + \ > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_training_tries" + description: "Number of times to attempt to train the model. At each subsequent\ + \ attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate_retry_mult" + description: "Learning rate is multiplied by this amount each time a new training\n\ + attempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION\ + \ or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n" + info: null + default: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--posterior_batch_size" + description: "Training detail: size of batches when creating the posterior.\n\ + Reduce this to avoid running out of GPU memory creating the posterior\n(will\ + \ be slower).\n" + info: null + default: + - 128 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--posterior_regulation" + description: "Posterior regularization method. (For experts: not required for\ + \ normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n\ + * PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n\ + * PRmu_gene is approximate mean-targeting per gene.\n" + info: null + required: false + choices: + - "PRq" + - "PRmu" + - "PRmu_gene" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "Tunable parameter alpha for the PRq posterior regularization method\n\ + (not normally used: see documentation).\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--q" + description: "Tunable parameter q for the CDF threshold estimation method (not\n\ + normally used: see documentation).\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--estimator" + description: "Output denoised count estimation method. (For experts: not required\n\ + for normal usage, see documentation).\n" + info: null + default: + - "mckp" + required: false + choices: + - "map" + - "mean" + - "cdf" + - "sample" + - "mckp" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--estimator_multiple_cpu" + description: "Including the flag --estimator-multiple-cpu will use more than one\n\ + CPU to compute the MCKP output count estimator in parallel (does nothing\nfor\ + \ other estimators).\n" + info: null + direction: "input" + - type: "boolean" + name: "--constant_learning_rate" + description: "Including the flag --constant-learning-rate will use the ClippedAdam\n\ + optimizer instead of the OneCycleLR learning rate schedule, which is\nthe default.\ + \ Learning is faster with the OneCycleLR schedule.\nHowever, training can easily\ + \ be continued from a checkpoint for more\nepochs than the initial command specified\ + \ when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule\ + \ with 150 epochs\nspecified, it is not possible to pick up from that final\ + \ checkpoint\nand continue training until 250 epochs.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--debug" + description: "Including the flag --debug will log extra messages useful for debugging.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--cuda" + description: "Including the flag --cuda will run the inference on a\nGPU.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Eliminating technical artifacts from high-throughput single-cell RNA\ + \ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\ + \ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\ + \ moment, only the count matrices produced by the CellRanger count pipeline is supported.\ + \ Support for additional tools and protocols \nwill be added in the future. A quick\ + \ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential\ + \ libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates\ + \ curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev\ + \ liblzma-dev mecab-ipadic-utf8 git \\\n&& curl https://pyenv.run | bash \\\n\ + && pyenv update \\\n&& pyenv install $PYTHON_VERSION \\\n&& pyenv global $PYTHON_VERSION\ + \ \\\n&& apt-get clean\n" + env: + - "PYENV_ROOT=\"/root/.pyenv\"" + - "PATH=\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\"" + - "PYTHON_VERSION=3.7.16" + - type: "python" + user: false + packages: + - "lxml~=4.8.0" + - "mudata~=0.2.1" + - "cellbender~=0.3.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/correction/cellbender_remove_background/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/correction/cellbender_remove_background" + executable: "target/nextflow/correction/cellbender_remove_background/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/correction/cellbender_remove_background/main.nf b/target/nextflow/correction/cellbender_remove_background/main.nf new file mode 100644 index 00000000..b625ba1f --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/main.nf @@ -0,0 +1,4532 @@ +// cellbender_remove_background main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellbender_remove_background", + "namespace" : "correction", + "version" : "main", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file. Data file on which to run tool. Data must be un-filtered: it should include empty droplets.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "List of modalities to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_output", + "description" : "Output layer", + "default" : [ + "cellbender_corrected" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_background_fraction", + "default" : [ + "cellbender_background_fraction" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_cell_probability", + "default" : [ + "cellbender_cell_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_cell_size", + "default" : [ + "cellbender_cell_size" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_droplet_efficiency", + "description" : "Name of the column in the .obs dataframe to store the droplet efficiencies in.\n", + "default" : [ + "cellbender_droplet_efficiency" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_latent_scale", + "default" : [ + "cellbender_latent_scale" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_ambient_expression", + "default" : [ + "cellbender_ambient_expression" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_gene_expression_encoding", + "default" : [ + "cellbender_gene_expression_encoding" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--expected_cells_from_qc", + "description" : "Will use the Cell Ranger QC to determine the estimated number of cells", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--expected_cells", + "description" : "Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient).", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--total_droplets_included", + "description" : "The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput. Include the droplets which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should be\n'surely empty' droplets.\n", + "example" : [ + 25000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--force_cell_umi_prior", + "description" : "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in cells.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--force_empty_umi_prior", + "description" : "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in empty droplets.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--model", + "description" : "Which model is being used for count data.\n\n* 'naive' subtracts the estimated ambient profile.\n* 'simple' does not model either ambient RNA or random barcode swapping (for debugging purposes -- not recommended).\n* 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping' assumes background RNA comes from random barcode swapping (via PCR chimeras).\n* 'full' uses a combined ambient and swapping model.\n", + "default" : [ + "full" + ], + "required" : false, + "choices" : [ + "naive", + "simple", + "ambient", + "swapping", + "full" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--epochs", + "description" : "Number of epochs to train.", + "default" : [ + 150 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--low_count_threshold", + "description" : "Droplets with UMI counts below this number are completely \nexcluded from the analysis. This can help identify the correct \nprior for empty droplet counts in the rare case where empty \ncounts are extremely high (over 200).\n", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--z_dim", + "description" : "Dimension of latent variable z.\n", + "default" : [ + 64 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--z_layers", + "description" : "Dimension of hidden layers in the encoder for z.\n", + "default" : [ + 512 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--training_fraction", + "description" : "Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm. Speeds up learning.\n", + "default" : [ + 0.9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--empty_drop_training_fraction", + "description" : "Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n", + "default" : [ + 0.2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--ignore_features", + "description" : "Integer indices of features to ignore entirely. In the output\ncount matrix, the counts for these features will be unchanged.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--fpr", + "description" : "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\nof samples which will be jointly analyzed for differential expression.\nA false positive is a true signal count that is erroneously removed.\nMore background removal is accompanied by more signal removal at\nhigh values of FPR. You can specify multiple values, which will\ncreate multiple output files.\n", + "default" : [ + 0.01 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--exclude_feature_types", + "description" : "Feature types to ignore during the analysis. These features will\nbe left unchanged in the output file.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--projected_ambient_count_threshold", + "description" : "Controls how many features are included in the analysis, which\ncan lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD counts total in all cells\n(summed), then that gene is excluded, and it will be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD = 0 will include all features\nwhich have even a single count in any empty droplet.\n", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--learning_rate", + "description" : "Training detail: lower learning rate for inference.\nA OneCycle learning rate schedule is used, where the\nupper learning rate is ten times this value. (For this\nvalue, probably do not exceed 1e-3).\n", + "default" : [ + 1.0E-4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--final_elbo_fail_fraction", + "description" : "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries > 1.\nBy default, will not fail training based on final_training_ELBO.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--epoch_elbo_fail_fraction", + "description" : "Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_training_tries", + "description" : "Number of times to attempt to train the model. At each subsequent attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--learning_rate_retry_mult", + "description" : "Learning rate is multiplied by this amount each time a new training\nattempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n", + "default" : [ + 0.2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--posterior_batch_size", + "description" : "Training detail: size of batches when creating the posterior.\nReduce this to avoid running out of GPU memory creating the posterior\n(will be slower).\n", + "default" : [ + 128 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--posterior_regulation", + "description" : "Posterior regularization method. (For experts: not required for normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n* PRmu_gene is approximate mean-targeting per gene.\n", + "required" : false, + "choices" : [ + "PRq", + "PRmu", + "PRmu_gene" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alpha", + "description" : "Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation).\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--q", + "description" : "Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation).\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--estimator", + "description" : "Output denoised count estimation method. (For experts: not required\nfor normal usage, see documentation).\n", + "default" : [ + "mckp" + ], + "required" : false, + "choices" : [ + "map", + "mean", + "cdf", + "sample", + "mckp" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--estimator_multiple_cpu", + "description" : "Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators).\n", + "direction" : "input" + }, + { + "type" : "boolean", + "name" : "--constant_learning_rate", + "description" : "Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default. Learning is faster with the OneCycleLR schedule.\nHowever, training can easily be continued from a checkpoint for more\nepochs than the initial command specified when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule with 150 epochs\nspecified, it is not possible to pick up from that final checkpoint\nand continue training until 250 epochs.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--debug", + "description" : "Including the flag --debug will log extra messages useful for debugging.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--cuda", + "description" : "Including the flag --cuda will run the inference on a\nGPU.\n", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "midmem", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 git \\\\\n&& curl https://pyenv.run | bash \\\\\n&& pyenv update \\\\\n&& pyenv install $PYTHON_VERSION \\\\\n&& pyenv global $PYTHON_VERSION \\\\\n&& apt-get clean\n" + ], + "env" : [ + "PYENV_ROOT=\\"/root/.pyenv\\"", + "PATH=\\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\\"", + "PYTHON_VERSION=3.7.16" + ] + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "lxml~=4.8.0", + "mudata~=0.2.1", + "cellbender~=0.3.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/correction/cellbender_remove_background/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/correction/cellbender_remove_background", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix +from cellbender.remove_background.downstream import anndata_from_h5 + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_output': $( if [ ! -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_background_fraction': $( if [ ! -z ${VIASH_PAR_OBS_BACKGROUND_FRACTION+x} ]; then echo "r'${VIASH_PAR_OBS_BACKGROUND_FRACTION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_cell_probability': $( if [ ! -z ${VIASH_PAR_OBS_CELL_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_cell_size': $( if [ ! -z ${VIASH_PAR_OBS_CELL_SIZE+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_SIZE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_droplet_efficiency': $( if [ ! -z ${VIASH_PAR_OBS_DROPLET_EFFICIENCY+x} ]; then echo "r'${VIASH_PAR_OBS_DROPLET_EFFICIENCY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_latent_scale': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_SCALE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_ambient_expression': $( if [ ! -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then echo "r'${VIASH_PAR_VAR_AMBIENT_EXPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_gene_expression_encoding': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_EXPRESSION_ENCODING//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'expected_cells_from_qc': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then echo "r'${VIASH_PAR_EXPECTED_CELLS_FROM_QC//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'expected_cells': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'total_droplets_included': $( if [ ! -z ${VIASH_PAR_TOTAL_DROPLETS_INCLUDED+x} ]; then echo "int(r'${VIASH_PAR_TOTAL_DROPLETS_INCLUDED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'force_cell_umi_prior': $( if [ ! -z ${VIASH_PAR_FORCE_CELL_UMI_PRIOR+x} ]; then echo "int(r'${VIASH_PAR_FORCE_CELL_UMI_PRIOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'force_empty_umi_prior': $( if [ ! -z ${VIASH_PAR_FORCE_EMPTY_UMI_PRIOR+x} ]; then echo "int(r'${VIASH_PAR_FORCE_EMPTY_UMI_PRIOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'low_count_threshold': $( if [ ! -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then echo "int(r'${VIASH_PAR_LOW_COUNT_THRESHOLD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'z_dim': $( if [ ! -z ${VIASH_PAR_Z_DIM+x} ]; then echo "int(r'${VIASH_PAR_Z_DIM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'z_layers': $( if [ ! -z ${VIASH_PAR_Z_LAYERS+x} ]; then echo "list(map(int, r'${VIASH_PAR_Z_LAYERS//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'training_fraction': $( if [ ! -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_TRAINING_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'empty_drop_training_fraction': $( if [ ! -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'ignore_features': $( if [ ! -z ${VIASH_PAR_IGNORE_FEATURES+x} ]; then echo "list(map(int, r'${VIASH_PAR_IGNORE_FEATURES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'fpr': $( if [ ! -z ${VIASH_PAR_FPR+x} ]; then echo "list(map(float, r'${VIASH_PAR_FPR//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'exclude_feature_types': $( if [ ! -z ${VIASH_PAR_EXCLUDE_FEATURE_TYPES+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_FEATURE_TYPES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'projected_ambient_count_threshold': $( if [ ! -z ${VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_PROJECTED_AMBIENT_COUNT_THRESHOLD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'final_elbo_fail_fraction': $( if [ ! -z ${VIASH_PAR_FINAL_ELBO_FAIL_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_FINAL_ELBO_FAIL_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'epoch_elbo_fail_fraction': $( if [ ! -z ${VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EPOCH_ELBO_FAIL_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'num_training_tries': $( if [ ! -z ${VIASH_PAR_NUM_TRAINING_TRIES+x} ]; then echo "int(r'${VIASH_PAR_NUM_TRAINING_TRIES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'learning_rate_retry_mult': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE_RETRY_MULT+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE_RETRY_MULT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'posterior_batch_size': $( if [ ! -z ${VIASH_PAR_POSTERIOR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_POSTERIOR_BATCH_SIZE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'posterior_regulation': $( if [ ! -z ${VIASH_PAR_POSTERIOR_REGULATION+x} ]; then echo "r'${VIASH_PAR_POSTERIOR_REGULATION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'q': $( if [ ! -z ${VIASH_PAR_Q+x} ]; then echo "float(r'${VIASH_PAR_Q//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'estimator': $( if [ ! -z ${VIASH_PAR_ESTIMATOR+x} ]; then echo "r'${VIASH_PAR_ESTIMATOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'estimator_multiple_cpu': $( if [ ! -z ${VIASH_PAR_ESTIMATOR_MULTIPLE_CPU+x} ]; then echo "r'${VIASH_PAR_ESTIMATOR_MULTIPLE_CPU//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'constant_learning_rate': $( if [ ! -z ${VIASH_PAR_CONSTANT_LEARNING_RATE+x} ]; then echo "r'${VIASH_PAR_CONSTANT_LEARNING_RATE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'debug': $( if [ ! -z ${VIASH_PAR_DEBUG+x} ]; then echo "r'${VIASH_PAR_DEBUG//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'cuda': $( if [ ! -z ${VIASH_PAR_CUDA+x} ]; then echo "r'${VIASH_PAR_CUDA//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# import pathlib +# with pathlib.Path(os.path.dirname(par["output"])) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + # don't create checkpoints because they're not used / returned anyways + "--checkpoint-mins", + "99999999", + ] + + if meta.get("cpus") is not None: + cmd_pars += ["--cpu-threads", str(meta["cpus"])] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--force-cell-umi-prior", "force_cell_umi_prior", True), + ("--force-empty-umi-prior", "force_empty_umi_prior", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ("--ignore-features", "ignore_features", True), + ("--fpr", "fpr", True), + ("--exclude-feature-types", "exclude_feature_types", True), + ( + "--projected-ambient-count-threshold", + "projected_ambient_count_threshold", + True, + ), + ("--learning-rate", "learning_rate", True), + ("--final-elbo-fail-fraction", "final_elbo_fail_fraction", True), + ("--epoch-elbo-fail-fraction", "epoch_elbo_fail_fraction", True), + ("--num-training-tries", "num_training_tries", True), + ("--learning-rate-retry-mult", "learning_rate_retry_mult", True), + ("--posterior-batch-size", "posterior_batch_size", True), + ("--posterior-regulation", "posterior_regulation", True), + ("--alpha", "alpha", True), + ("--q", "q", True), + ("--estimator", "estimator", True), + ("--estimator-multiple-cpu", "estimator_multiple_cpu", False), + ("--constant-learning-rate", "constant_learning_rate", False), + ("--debug", "debug", False), + ("--cuda", "cuda", False), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("CellBender output format:", adata_out) + + # AnnData object with n_obs x n_vars = 6794880 x 33538 + # obs: 'cellbender_analyzed' + # var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed' + # uns: 'background_fraction', 'barcode_indices_for_latents', 'cell_probability', 'cell_size', 'droplet_efficiency', 'gene_expression_encoding', + # 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', + # 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', + # 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', + # 'target_false_positive_rate' + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_background_fraction": "background_fraction", + "obs_cell_probability": "cell_probability", + "obs_cell_size": "cell_size", + "obs_droplet_efficiency": "droplet_efficiency", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_gene_expression_encoding output to MuData") + obsm_store = {"obsm_gene_expression_encoding": "gene_expression_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/correction/cellbender_remove_background", + "tag" : "main" + }, + "label" : [ + "midcpu", + "midmem", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/correction/cellbender_remove_background/nextflow.config b/target/nextflow/correction/cellbender_remove_background/nextflow.config new file mode 100644 index 00000000..512bb849 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'correction/cellbender_remove_background' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/correction/cellbender_remove_background/nextflow_labels.config b/target/nextflow/correction/cellbender_remove_background/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/correction/cellbender_remove_background/nextflow_schema.json b/target/nextflow/correction/cellbender_remove_background/nextflow_schema.json new file mode 100644 index 00000000..ad11d4aa --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/nextflow_schema.json @@ -0,0 +1,335 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellbender_remove_background", + "description": "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "List of modalities to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Full count matrix as an h5mu file, with background RNA removed", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "layer_output": { + "type": "string", + "description": "Output layer", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_corrected\"`. ", + "default": "cellbender_corrected" + }, + "obs_background_fraction": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_background_fraction\"`. ", + "default": "cellbender_background_fraction" + }, + "obs_cell_probability": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_cell_probability\"`. ", + "default": "cellbender_cell_probability" + }, + "obs_cell_size": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_cell_size\"`. ", + "default": "cellbender_cell_size" + }, + "obs_droplet_efficiency": { + "type": "string", + "description": "Name of the column in the .obs dataframe to store the droplet efficiencies in.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_droplet_efficiency\"`. ", + "default": "cellbender_droplet_efficiency" + }, + "obs_latent_scale": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_latent_scale\"`. ", + "default": "cellbender_latent_scale" + }, + "var_ambient_expression": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_ambient_expression\"`. ", + "default": "cellbender_ambient_expression" + }, + "obsm_gene_expression_encoding": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_gene_expression_encoding\"`. ", + "default": "cellbender_gene_expression_encoding" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "expected_cells_from_qc": { + "type": "boolean", + "description": "Will use the Cell Ranger QC to determine the estimated number of cells", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "expected_cells": { + "type": "integer", + "description": "Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient).", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "total_droplets_included": { + "type": "integer", + "description": "The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput", + "help_text": "Type: `integer`, multiple: `False`, example: `25000`. " + }, + "force_cell_umi_prior": { + "type": "integer", + "description": "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in cells.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "force_empty_umi_prior": { + "type": "integer", + "description": "Ignore CellBender's heuristic prior estimation, and use this prior for UMI counts in empty droplets.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "model": { + "type": "string", + "description": "Which model is being used for count data.\n\n* 'naive' subtracts the estimated ambient profile.\n* 'simple' does not model either ambient RNA or random barcode swapping (for debugging purposes -- not recommended).\n* 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping' assumes background RNA comes from random barcode swapping (via PCR chimeras).\n* 'full' uses a combined ambient and swapping model.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"full\"`, choices: ``naive`, `simple`, `ambient`, `swapping`, `full``. ", + "enum": [ + "naive", + "simple", + "ambient", + "swapping", + "full" + ], + "default": "full" + }, + "epochs": { + "type": "integer", + "description": "Number of epochs to train.", + "help_text": "Type: `integer`, multiple: `False`, default: `150`. ", + "default": 150 + }, + "low_count_threshold": { + "type": "integer", + "description": "Droplets with UMI counts below this number are completely \nexcluded from the analysis", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "z_dim": { + "type": "integer", + "description": "Dimension of latent variable z.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `64`. ", + "default": 64 + }, + "z_layers": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Dimension of hidden layers in the encoder for z.\n", + "help_text": "Type: `integer`, multiple: `True`, default: `[512]`. ", + "default": [ + 512 + ] + }, + "training_fraction": { + "type": "number", + "description": "Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm", + "help_text": "Type: `double`, multiple: `False`, default: `0.9`. ", + "default": 0.9 + }, + "empty_drop_training_fraction": { + "type": "number", + "description": "Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n", + "help_text": "Type: `double`, multiple: `False`, default: `0.2`. ", + "default": 0.2 + }, + "ignore_features": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Integer indices of features to ignore entirely", + "help_text": "Type: `integer`, multiple: `True`. " + }, + "fpr": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Target 'delta' false positive rate in [0, 1)", + "help_text": "Type: `double`, multiple: `True`, default: `[0.01]`. ", + "default": [ + 0.01 + ] + }, + "exclude_feature_types": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Feature types to ignore during the analysis", + "help_text": "Type: `string`, multiple: `True`. " + }, + "projected_ambient_count_threshold": { + "type": "number", + "description": "Controls how many features are included in the analysis, which\ncan lead to a large speedup", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "learning_rate": { + "type": "number", + "description": "Training detail: lower learning rate for inference.\nA OneCycle learning rate schedule is used, where the\nupper learning rate is ten times this value", + "help_text": "Type: `double`, multiple: `False`, default: `1.0E-4`. ", + "default": 0.00010 + }, + "final_elbo_fail_fraction": { + "type": "number", + "description": "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries > 1.\nBy default, will not fail training based on final_training_ELBO.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "epoch_elbo_fail_fraction": { + "type": "number", + "description": "Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "num_training_tries": { + "type": "integer", + "description": "Number of times to attempt to train the model", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "learning_rate_retry_mult": { + "type": "number", + "description": "Learning rate is multiplied by this amount each time a new training\nattempt is made", + "help_text": "Type: `double`, multiple: `False`, default: `0.2`. ", + "default": 0.2 + }, + "posterior_batch_size": { + "type": "integer", + "description": "Training detail: size of batches when creating the posterior.\nReduce this to avoid running out of GPU memory creating the posterior\n(will be slower).\n", + "help_text": "Type: `integer`, multiple: `False`, default: `128`. ", + "default": 128 + }, + "posterior_regulation": { + "type": "string", + "description": "Posterior regularization method", + "help_text": "Type: `string`, multiple: `False`, choices: ``PRq`, `PRmu`, `PRmu_gene``. ", + "enum": [ + "PRq", + "PRmu", + "PRmu_gene" + ] + }, + "alpha": { + "type": "number", + "description": "Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation).\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "q": { + "type": "number", + "description": "Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation).\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "estimator": { + "type": "string", + "description": "Output denoised count estimation method", + "help_text": "Type: `string`, multiple: `False`, default: `\"mckp\"`, choices: ``map`, `mean`, `cdf`, `sample`, `mckp``. ", + "enum": [ + "map", + "mean", + "cdf", + "sample", + "mckp" + ], + "default": "mckp" + }, + "estimator_multiple_cpu": { + "type": "boolean", + "description": "Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators).\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "constant_learning_rate": { + "type": "boolean", + "description": "Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "debug": { + "type": "boolean", + "description": "Including the flag --debug will log extra messages useful for debugging.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "cuda": { + "type": "boolean", + "description": "Including the flag --cuda will run the inference on a\nGPU.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/correction/cellbender_remove_background/setup_logger.py b/target/nextflow/correction/cellbender_remove_background/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/.config.vsh.yaml b/target/nextflow/correction/cellbender_remove_background_v0_2/.config.vsh.yaml new file mode 100644 index 00000000..c20f387e --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/.config.vsh.yaml @@ -0,0 +1,455 @@ +name: "cellbender_remove_background_v0_2" +namespace: "correction" +version: "main" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Full count matrix as an h5mu file, with background RNA removed.\ + \ This file contains all the original droplet barcodes." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_output" + description: "Output layer" + info: null + default: + - "corrected" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_rt_efficiency" + info: null + default: + - "latent_rt_efficiency" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_cell_probability" + info: null + default: + - "latent_cell_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_latent_scale" + info: null + default: + - "latent_scale" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_ambient_expression" + info: null + default: + - "ambient_expression" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_latent_gene_encoding" + info: null + default: + - "cellbender_latent_gene_encoding" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--expected_cells" + description: "Number of cells expected in the dataset (a rough estimate within\ + \ a factor of 2 is sufficient)." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--total_droplets_included" + description: "The number of droplets from the rank-ordered UMI plot\nthat will\ + \ be analyzed. The largest 'total_droplets'\ndroplets will have their cell probabilities\ + \ inferred\nas an output.\n" + info: null + example: + - 25000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--expected_cells_from_qc" + description: "Will use the Cell Ranger QC to determine the estimated number of\ + \ cells" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--model" + description: "Which model is being used for count data. 'simple'\ndoes not model\ + \ either ambient RNA or random barcode\nswapping (for debugging purposes --\ + \ not recommended).\n'ambient' assumes background RNA is incorporated into\n\ + droplets. 'swapping' assumes background RNA comes from\nrandom barcode swapping.\ + \ 'full' uses a combined\nambient and swapping model.\n" + info: null + default: + - "full" + required: false + choices: + - "simple" + - "ambient" + - "swapping" + - "full" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--epochs" + description: "Number of epochs to train." + info: null + default: + - 150 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--low_count_threshold" + description: "Droplets with UMI counts below this number are completely \nexcluded\ + \ from the analysis. This can help identify the correct \nprior for empty droplet\ + \ counts in the rare case where empty \ncounts are extremely high (over 200).\n" + info: null + default: + - 15 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_dim" + description: "Dimension of latent variable z.\n" + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--z_layers" + description: "Dimension of hidden layers in the encoder for z.\n" + info: null + default: + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--training_fraction" + description: "Training detail: the fraction of the data used for training.\nThe\ + \ rest is never seen by the inference algorithm. Speeds up learning.\n" + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--empty_drop_training_fraction" + description: "Training detail: the fraction of the training data each epoch that\ + \ \nis drawn (randomly sampled) from surely empty droplets.\n" + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fpr" + description: "Target false positive rate in (0, 1). A false positive\nis a true\ + \ signal count that is erroneously removed.\nMore background removal is accompanied\ + \ by more signal\nremoval at high values of FPR. You can specify\nmultiple values,\ + \ which will create multiple output\nfiles.\n" + info: null + default: + - 0.01 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_antibody_capture" + description: "Including the flag --exclude-antibody-capture will\ncause remove-background\ + \ to operate on gene counts\nonly, ignoring other features.\n" + info: null + direction: "input" + - type: "double" + name: "--learning_rate" + description: "Training detail: lower learning rate for inference. A\nOneCycle\ + \ learning rate schedule is used, where the\nupper learning rate is ten times\ + \ this value. (For this\nvalue, probably do not exceed 1e-3).\n" + info: null + example: + - 1.0E-4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--cuda" + description: "Including the flag --cuda will run the inference on a\nGPU.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Eliminating technical artifacts from high-throughput single-cell RNA\ + \ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\ + \ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\ + \ moment, only the count matrices produced by the CellRanger count pipeline is supported.\ + \ Support for additional tools and protocols \nwill be added in the future. A quick\ + \ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.12-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "muon==0.1.5" + - "cellbender==0.2.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "muon~=0.1.4" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/correction/cellbender_remove_background_v0_2/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/correction/cellbender_remove_background_v0_2" + executable: "target/nextflow/correction/cellbender_remove_background_v0_2/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/helper.py b/target/nextflow/correction/cellbender_remove_background_v0_2/helper.py new file mode 100644 index 00000000..55e0cd36 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/helper.py @@ -0,0 +1,166 @@ +# This file is copied from https://github.com/broadinstitute/CellBender/issues/128#issuecomment-1175336065 +# to solve an issue with scanpy not being able to read in the 10x h5 files produced by cellbender. +# +# Note: If something doesn't work in this helper function, it may be interesting to +# take a look at the comments by Dries: https://github.com/openpipelines-bio/openpipeline/pull/115 +# I'm not going to apply them for now -- if it ain't broke, don't fix it. +import tables +import numpy as np +import scipy.sparse as sp +import anndata +from typing import Dict + + +def anndata_from_h5( + file: str, analyzed_barcodes_only: bool = True +) -> "anndata.AnnData": + """Load an output h5 file into an AnnData object for downstream work. + + Args: + file: The h5 file + analyzed_barcodes_only: False to load all barcodes, so that the size of + the AnnData object will match the size of the input raw count matrix. + True to load a limited set of barcodes: only those analyzed by the + algorithm. This allows relevant latent variables to be loaded + properly into adata.obs and adata.obsm, rather than adata.uns. + + Returns: + adata: The anndata object, populated with inferred latent variables + and metadata. + + """ + + d = dict_from_h5(file) + X = ( + sp.csc_matrix( + (d.pop("data"), d.pop("indices"), d.pop("indptr")), shape=d.pop("shape") + ) + .transpose() + .tocsr() + ) + + # check and see if we have barcode index annotations, and if the file is filtered + barcode_key = [k for k in d.keys() if (("barcode" in k) and ("ind" in k))] + if len(barcode_key) > 0: + max_barcode_ind = d[barcode_key[0]].max() + filtered_file = max_barcode_ind >= X.shape[0] + else: + filtered_file = True + + if analyzed_barcodes_only: + if filtered_file: + # filtered file being read, so we don't need to subset + print('Assuming we are loading a "filtered" file that contains only cells.') + pass + elif "barcode_indices_for_latents" in d.keys(): + X = X[d["barcode_indices_for_latents"], :] + d["barcodes"] = d["barcodes"][d["barcode_indices_for_latents"]] + elif "barcodes_analyzed_inds" in d.keys(): + X = X[d["barcodes_analyzed_inds"], :] + d["barcodes"] = d["barcodes"][d["barcodes_analyzed_inds"]] + else: + print( + "Warning: analyzed_barcodes_only=True, but the key " + '"barcodes_analyzed_inds" or "barcode_indices_for_latents" ' + "is missing from the h5 file. " + "Will output all barcodes, and proceed as if " + "analyzed_barcodes_only=False" + ) + + # Construct the anndata object. + adata = anndata.AnnData( + X=X, + obs={"barcode": d.pop("barcodes").astype(str)}, + var={ + "gene_name": ( + d.pop("gene_names") if "gene_names" in d.keys() else d.pop("name") + ).astype(str) + }, + dtype=X.dtype, + ) + adata.obs.set_index("barcode", inplace=True) + adata.var.set_index("gene_name", inplace=True) + + # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this + if "genes" in d.keys(): + d["id"] = d.pop("genes") + + # For purely aesthetic purposes, rename "id" to "gene_id" + if "id" in d.keys(): + d["gene_id"] = d.pop("id") + + # If genomes are empty, try to guess them based on gene_id + if "genome" in d.keys(): + if np.array([s.decode() == "" for s in d["genome"]]).all(): + if "_" in d["gene_id"][0].decode(): + print( + "Genome field blank, so attempting to guess genomes based on gene_id prefixes" + ) + d["genome"] = np.array( + [s.decode().split("_")[0] for s in d["gene_id"]], dtype=str + ) + + # Add other information to the anndata object in the appropriate slot. + _fill_adata_slots_automatically(adata, d) + + # Add a special additional field to .var if it exists. + if "features_analyzed_inds" in adata.uns.keys(): + adata.var["cellbender_analyzed"] = [ + True if (i in adata.uns["features_analyzed_inds"]) else False + for i in range(adata.shape[1]) + ] + + if analyzed_barcodes_only: + for col in adata.obs.columns[ + adata.obs.columns.str.startswith("barcodes_analyzed") + | adata.obs.columns.str.startswith("barcode_indices") + ]: + try: + del adata.obs[col] + except Exception: + pass + else: + # Add a special additional field to .obs if all barcodes are included. + if "barcodes_analyzed_inds" in adata.uns.keys(): + adata.obs["cellbender_analyzed"] = [ + True if (i in adata.uns["barcodes_analyzed_inds"]) else False + for i in range(adata.shape[0]) + ] + + return adata + + +def dict_from_h5(file: str) -> Dict[str, np.ndarray]: + """Read in everything from an h5 file and put into a dictionary.""" + d = {} + with tables.open_file(file) as f: + # read in everything + for array in f.walk_nodes("/", "Array"): + d[array.name] = array.read() + return d + + +def _fill_adata_slots_automatically(adata, d): + """Add other information to the adata object in the appropriate slot.""" + + for key, value in d.items(): + try: + if value is None: + continue + value = np.asarray(value) + if len(value.shape) == 0: + adata.uns[key] = value + elif value.shape[0] == adata.shape[0]: + if (len(value.shape) < 2) or (value.shape[1] < 2): + adata.obs[key] = value + else: + adata.obsm[key] = value + elif value.shape[0] == adata.shape[1]: + if value.dtype.name.startswith("bytes"): + adata.var[key] = value.astype(str) + else: + adata.var[key] = value + else: + adata.uns[key] = value + except Exception: + print("Unable to load data into AnnData: ", key, value, type(value)) diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/main.nf b/target/nextflow/correction/cellbender_remove_background_v0_2/main.nf new file mode 100644 index 00000000..d16b4d42 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/main.nf @@ -0,0 +1,4287 @@ +// cellbender_remove_background_v0_2 main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellbender_remove_background_v0_2", + "namespace" : "correction", + "version" : "main", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "List of modalities to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_output", + "description" : "Output layer", + "default" : [ + "corrected" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_latent_rt_efficiency", + "default" : [ + "latent_rt_efficiency" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_latent_cell_probability", + "default" : [ + "latent_cell_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_latent_scale", + "default" : [ + "latent_scale" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_ambient_expression", + "default" : [ + "ambient_expression" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_latent_gene_encoding", + "default" : [ + "cellbender_latent_gene_encoding" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--expected_cells", + "description" : "Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient).", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--total_droplets_included", + "description" : "The number of droplets from the rank-ordered UMI plot\nthat will be analyzed. The largest 'total_droplets'\ndroplets will have their cell probabilities inferred\nas an output.\n", + "example" : [ + 25000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--expected_cells_from_qc", + "description" : "Will use the Cell Ranger QC to determine the estimated number of cells", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--model", + "description" : "Which model is being used for count data. 'simple'\ndoes not model either ambient RNA or random barcode\nswapping (for debugging purposes -- not recommended).\n'ambient' assumes background RNA is incorporated into\ndroplets. 'swapping' assumes background RNA comes from\nrandom barcode swapping. 'full' uses a combined\nambient and swapping model.\n", + "default" : [ + "full" + ], + "required" : false, + "choices" : [ + "simple", + "ambient", + "swapping", + "full" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--epochs", + "description" : "Number of epochs to train.", + "default" : [ + 150 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--low_count_threshold", + "description" : "Droplets with UMI counts below this number are completely \nexcluded from the analysis. This can help identify the correct \nprior for empty droplet counts in the rare case where empty \ncounts are extremely high (over 200).\n", + "default" : [ + 15 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--z_dim", + "description" : "Dimension of latent variable z.\n", + "default" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--z_layers", + "description" : "Dimension of hidden layers in the encoder for z.\n", + "default" : [ + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--training_fraction", + "description" : "Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm. Speeds up learning.\n", + "default" : [ + 0.9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--empty_drop_training_fraction", + "description" : "Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--fpr", + "description" : "Target false positive rate in (0, 1). A false positive\nis a true signal count that is erroneously removed.\nMore background removal is accompanied by more signal\nremoval at high values of FPR. You can specify\nmultiple values, which will create multiple output\nfiles.\n", + "default" : [ + 0.01 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--exclude_antibody_capture", + "description" : "Including the flag --exclude-antibody-capture will\ncause remove-background to operate on gene counts\nonly, ignoring other features.\n", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--learning_rate", + "description" : "Training detail: lower learning rate for inference. A\nOneCycle learning rate schedule is used, where the\nupper learning rate is ten times this value. (For this\nvalue, probably do not exceed 1e-3).\n", + "example" : [ + 1.0E-4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--cuda", + "description" : "Including the flag --cuda will run the inference on a\nGPU.\n", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "helper.py" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:23.12-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "muon==0.1.5", + "cellbender==0.2.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "muon~=0.1.4" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/correction/cellbender_remove_background_v0_2/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/correction/cellbender_remove_background_v0_2", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import tempfile +import subprocess +import os +import sys +import numpy as np +from scipy.sparse import csr_matrix + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_output': $( if [ ! -z ${VIASH_PAR_LAYER_OUTPUT+x} ]; then echo "r'${VIASH_PAR_LAYER_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_latent_rt_efficiency': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_RT_EFFICIENCY+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_RT_EFFICIENCY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_latent_cell_probability': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_CELL_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_CELL_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_latent_scale': $( if [ ! -z ${VIASH_PAR_OBS_LATENT_SCALE+x} ]; then echo "r'${VIASH_PAR_OBS_LATENT_SCALE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_ambient_expression': $( if [ ! -z ${VIASH_PAR_VAR_AMBIENT_EXPRESSION+x} ]; then echo "r'${VIASH_PAR_VAR_AMBIENT_EXPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_latent_gene_encoding': $( if [ ! -z ${VIASH_PAR_OBSM_LATENT_GENE_ENCODING+x} ]; then echo "r'${VIASH_PAR_OBSM_LATENT_GENE_ENCODING//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'expected_cells': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'total_droplets_included': $( if [ ! -z ${VIASH_PAR_TOTAL_DROPLETS_INCLUDED+x} ]; then echo "int(r'${VIASH_PAR_TOTAL_DROPLETS_INCLUDED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'expected_cells_from_qc': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELLS_FROM_QC+x} ]; then echo "r'${VIASH_PAR_EXPECTED_CELLS_FROM_QC//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'epochs': $( if [ ! -z ${VIASH_PAR_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'low_count_threshold': $( if [ ! -z ${VIASH_PAR_LOW_COUNT_THRESHOLD+x} ]; then echo "int(r'${VIASH_PAR_LOW_COUNT_THRESHOLD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'z_dim': $( if [ ! -z ${VIASH_PAR_Z_DIM+x} ]; then echo "int(r'${VIASH_PAR_Z_DIM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'z_layers': $( if [ ! -z ${VIASH_PAR_Z_LAYERS+x} ]; then echo "list(map(int, r'${VIASH_PAR_Z_LAYERS//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'training_fraction': $( if [ ! -z ${VIASH_PAR_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_TRAINING_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'empty_drop_training_fraction': $( if [ ! -z ${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_EMPTY_DROP_TRAINING_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'fpr': $( if [ ! -z ${VIASH_PAR_FPR+x} ]; then echo "list(map(float, r'${VIASH_PAR_FPR//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'exclude_antibody_capture': $( if [ ! -z ${VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_ANTIBODY_CAPTURE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cuda': $( if [ ! -z ${VIASH_PAR_CUDA+x} ]; then echo "r'${VIASH_PAR_CUDA//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +from helper import anndata_from_h5 + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) + +mod = par["modality"] +logger.info("Performing log transformation on modality %s", mod) +data = mdata.mod[mod] + +# with pathlib.Path(meta["temp_dir"]) / "cellbender" as temp_dir: +# os.mkdir(temp_dir) +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: + # construct paths within tempdir + input_file = os.path.join(temp_dir, "input.h5ad") + output_file = os.path.join(temp_dir, "output.h5") + + logger.info("Creating AnnData input file for CellBender: '%s'", input_file) + data.write_h5ad(input_file) + + logger.info("Constructing CellBender command") + cmd_pars = [ + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, + ] + + extra_args = [ + ("--expected-cells", "expected_cells", True), + ("--total-droplets-included", "total_droplets_included", True), + ("--model", "model", True), + ("--epochs", "epochs", True), + ("--cuda", "cuda", False), + ("--low-count-threshold", "low_count_threshold", True), + ("--z-dim", "z_dim", True), + ("--z-layers", "z_layers", True), + ("--training-fraction", "training_fraction", True), + ("--exclude-antibody-capture", "exclude_antibody_capture", False), + ("--learning-rate", "learning_rate", True), + ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), + ] + for flag, name, is_kwarg in extra_args: + if par[name]: + values = par[name] if isinstance(par[name], list) else [par[name]] + cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] + + if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: + assert par["expected_cells"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + assert par["total_droplets_included"] is None, ( + "If min_counts is defined, expected_cells should be undefined" + ) + met = data.uns["metrics_cellranger"] + col_name = "Estimated Number of Cells" + assert col_name in met.columns, ( + "%s should be a column in .obs[metrics_cellranger]" + ) + est_cells = met[col_name].values[0] + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) + out = subprocess.check_output(cmd_pars).decode("utf-8") + + logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) + # have to use custom read_10x_h5 function for now + # will be fixed when https://github.com/scverse/scanpy/pull/2344 is merged + # adata_out = sc.read_10x_h5(output_file, gex_only=False) + adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=False) + + logger.info("Copying X output to MuData") + data.layers[par["layer_output"]] = adata_out.X + + logger.info("Copying .obs output to MuData") + obs_store = { + "obs_latent_rt_efficiency": "latent_RT_efficiency", + "obs_latent_cell_probability": "latent_cell_probability", + "obs_latent_scale": "latent_scale", + } + for to_name, from_name in obs_store.items(): + if par[to_name]: + if from_name in adata_out.obs: + data.obs[par[to_name]] = adata_out.obs[from_name] + # when using unfiltered data, the values will be in uns instead of obs + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + vec = np.zeros(data.n_obs) + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] + data.obs[par[to_name]] = vec + + logger.info("Copying .var output to MuData") + var_store = {"var_ambient_expression": "ambient_expression"} + for to_name, from_name in var_store.items(): + if par[to_name]: + data.var[par[to_name]] = adata_out.var[from_name] + + logger.info("Copying obsm_latent_gene_encoding output to MuData") + obsm_store = {"obsm_latent_gene_encoding": "latent_gene_encoding"} + for to_name, from_name in obsm_store.items(): + if par[to_name]: + if from_name in adata_out.obsm: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): + matrix_to_store = adata_out.uns[from_name] + number_of_obs = data.X.shape[0] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] + data.obsm[par[to_name]] = latent_space_sparse + else: + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." + ) + + +logger.info("Writing to file %s", par["output"]) +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/correction/cellbender_remove_background_v0_2", + "tag" : "main" + }, + "label" : [ + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow.config b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow.config new file mode 100644 index 00000000..e50d4a72 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'correction/cellbender_remove_background_v0_2' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_labels.config b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_schema.json b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_schema.json new file mode 100644 index 00000000..8d5fd745 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/nextflow_schema.json @@ -0,0 +1,217 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellbender_remove_background_v0_2", + "description": "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "List of modalities to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Full count matrix as an h5mu file, with background RNA removed", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "layer_output": { + "type": "string", + "description": "Output layer", + "help_text": "Type: `string`, multiple: `False`, default: `\"corrected\"`. ", + "default": "corrected" + }, + "obs_latent_rt_efficiency": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"latent_rt_efficiency\"`. ", + "default": "latent_rt_efficiency" + }, + "obs_latent_cell_probability": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"latent_cell_probability\"`. ", + "default": "latent_cell_probability" + }, + "obs_latent_scale": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"latent_scale\"`. ", + "default": "latent_scale" + }, + "var_ambient_expression": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"ambient_expression\"`. ", + "default": "ambient_expression" + }, + "obsm_latent_gene_encoding": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"cellbender_latent_gene_encoding\"`. ", + "default": "cellbender_latent_gene_encoding" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "expected_cells": { + "type": "integer", + "description": "Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient).", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "total_droplets_included": { + "type": "integer", + "description": "The number of droplets from the rank-ordered UMI plot\nthat will be analyzed", + "help_text": "Type: `integer`, multiple: `False`, example: `25000`. " + }, + "expected_cells_from_qc": { + "type": "boolean", + "description": "Will use the Cell Ranger QC to determine the estimated number of cells", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "model": { + "type": "string", + "description": "Which model is being used for count data", + "help_text": "Type: `string`, multiple: `False`, default: `\"full\"`, choices: ``simple`, `ambient`, `swapping`, `full``. ", + "enum": [ + "simple", + "ambient", + "swapping", + "full" + ], + "default": "full" + }, + "epochs": { + "type": "integer", + "description": "Number of epochs to train.", + "help_text": "Type: `integer`, multiple: `False`, default: `150`. ", + "default": 150 + }, + "low_count_threshold": { + "type": "integer", + "description": "Droplets with UMI counts below this number are completely \nexcluded from the analysis", + "help_text": "Type: `integer`, multiple: `False`, default: `15`. ", + "default": 15 + }, + "z_dim": { + "type": "integer", + "description": "Dimension of latent variable z.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "z_layers": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Dimension of hidden layers in the encoder for z.\n", + "help_text": "Type: `integer`, multiple: `True`, default: `[500]`. ", + "default": [ + 500 + ] + }, + "training_fraction": { + "type": "number", + "description": "Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm", + "help_text": "Type: `double`, multiple: `False`, default: `0.9`. ", + "default": 0.9 + }, + "empty_drop_training_fraction": { + "type": "number", + "description": "Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "fpr": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Target false positive rate in (0, 1)", + "help_text": "Type: `double`, multiple: `True`, default: `[0.01]`. ", + "default": [ + 0.01 + ] + }, + "exclude_antibody_capture": { + "type": "boolean", + "description": "Including the flag --exclude-antibody-capture will\ncause remove-background to operate on gene counts\nonly, ignoring other features.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "learning_rate": { + "type": "number", + "description": "Training detail: lower learning rate for inference", + "help_text": "Type: `double`, multiple: `False`, example: `1.0E-4`. " + }, + "cuda": { + "type": "boolean", + "description": "Including the flag --cuda will run the inference on a\nGPU.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/correction/cellbender_remove_background_v0_2/setup_logger.py b/target/nextflow/correction/cellbender_remove_background_v0_2/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/correction/cellbender_remove_background_v0_2/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/create_pseudobulk/differential_expression/.config.vsh.yaml b/target/nextflow/create_pseudobulk/differential_expression/.config.vsh.yaml new file mode 100644 index 00000000..04652109 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/.config.vsh.yaml @@ -0,0 +1,365 @@ +name: "differential_expression" +namespace: "create_pseudobulk" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used. This layer must contain\ + \ raw counts." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_label" + description: ".obs field containing the variable to group on. Typically this field\ + \ contains cell groups such as annotated cell types or clusters." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_groups" + description: ".obs fields containing the experimental condition(s) to create pseudobulk\ + \ samples for." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--aggregation_method" + description: "Method to aggregate the raw counts for pseudoreplicates. Either\ + \ sum or mean." + info: null + default: + - "sum" + required: false + choices: + - "sum" + - "mean" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_obs_per_sample" + description: "Minimum number of cells per pseudobulk sample." + info: null + default: + - 30 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed for sampling.\n" + info: null + default: + - 0 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_cell_count" + description: "The .obs field to store the number of observations per pseudobulk\ + \ sample." + info: null + default: + - "n_cells" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file containing the aggregated pseudobulk samples." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generation of pseudobulk samples from single-cell transcriptomics data,\n\ + by aggregating raw gene expression counts from individual cells to create \nbulk-like\ + \ expression profiles suitable for differential expression analysis\nwith methods\ + \ designed for bulk differential expression analysis.\nNote that this componentonly\ + \ considers factors as explanatory variables, \nand excludes covariates from the\ + \ analysis.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/differential_expression/create_pseudobulk/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/create_pseudobulk/differential_expression" + executable: "target/nextflow/create_pseudobulk/differential_expression/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/create_pseudobulk/differential_expression/main.nf b/target/nextflow/create_pseudobulk/differential_expression/main.nf new file mode 100644 index 00000000..853e54e2 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/main.nf @@ -0,0 +1,4143 @@ +// differential_expression main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author) +// * Jakub Majercik (author) +// * Dries De Maeyer (contributor) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "differential_expression", + "namespace" : "create_pseudobulk", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + }, + { + "name" : "Dries De Maeyer", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. If None, X is used. This layer must contain raw counts.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_label", + "description" : ".obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_groups", + "description" : ".obs fields containing the experimental condition(s) to create pseudobulk samples for.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--aggregation_method", + "description" : "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean.", + "default" : [ + "sum" + ], + "required" : false, + "choices" : [ + "sum", + "mean" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_obs_per_sample", + "description" : "Minimum number of cells per pseudobulk sample.", + "default" : [ + 30 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--random_state", + "description" : "The random seed for sampling.\n", + "default" : [ + 0 + ], + "required" : false, + "min" : 0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_cell_count", + "description" : "The .obs field to store the number of observations per pseudobulk sample.", + "default" : [ + "n_cells" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file containing the aggregated pseudobulk samples.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Generation of pseudobulk samples from single-cell transcriptomics data,\nby aggregating raw gene expression counts from individual cells to create \nbulk-like expression profiles suitable for differential expression analysis\nwith methods designed for bulk differential expression analysis.\nNote that this componentonly considers factors as explanatory variables, \nand excludes covariates from the analysis.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/differential_expression/create_pseudobulk/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/create_pseudobulk/differential_expression", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import numpy as np +import mudata as mu +import pandas as pd +import sys +import scanpy as sc +import scipy.sparse as sp + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_label': $( if [ ! -z ${VIASH_PAR_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_groups': $( if [ ! -z ${VIASH_PAR_OBS_GROUPS+x} ]; then echo "r'${VIASH_PAR_OBS_GROUPS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'aggregation_method': $( if [ ! -z ${VIASH_PAR_AGGREGATION_METHOD+x} ]; then echo "r'${VIASH_PAR_AGGREGATION_METHOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_obs_per_sample': $( if [ ! -z ${VIASH_PAR_MIN_OBS_PER_SAMPLE+x} ]; then echo "int(r'${VIASH_PAR_MIN_OBS_PER_SAMPLE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'obs_cell_count': $( if [ ! -z ${VIASH_PAR_OBS_CELL_COUNT+x} ]; then echo "r'${VIASH_PAR_OBS_CELL_COUNT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def is_normalized(layer): + if sp.issparse(layer): + row_sums = np.array(layer.sum(axis=1)).flatten() + else: + row_sums = layer.sum(axis=1) + + return np.allclose(row_sums, 1) + + +def count_obs(adata, pb_adata, obs_cols): + counts = [] + for i in range(pb_adata.n_obs): + values = pb_adata.obs.iloc[i][obs_cols] + + mask = pd.Series([True] * adata.n_obs) + for col in obs_cols: + mask &= (adata.obs[col] == values[col]).values + + count = mask.sum() + counts.append(count) + + return counts + + +def main(): + # Read in data + logger.info(f"Reading input data {par['input']}...") + adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy() + + # Make sure .X contains raw counts + if par["input_layer"]: + adata.X = adata.layers[par["input_layer"]] + if is_normalized(adata.X): + raise ValueError("Input layer must contain raw counts.") + + # Sanitize pseudobulk aggregation fields + pseudobulk_cols = [par["obs_label"]] + if par["obs_groups"]: + pseudobulk_cols += par["obs_groups"] + + for col in pseudobulk_cols: + if col not in adata.obs.columns: + raise ValueError(f"Required column '{col}' not found in .obs.") + adata.obs[col] = ( + adata.obs[col] + .astype(str) + .str.replace(" ", "_") + .str.replace("+", "") + .astype("category") + ) + + # Aggregate pseudobulk data per cell group + logger.info("Creating pseudobulk samples...") + adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0) + adata_pb.X = adata_pb.layers["sum"] + del adata_pb.layers["sum"] + + adata_pb.obs_names = [ + "_".join(values) + for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols]) + ] + + # Filter pseudobulk samples based on minimum observation count + logger.info("Filtering pseudobulk samples based on minimum observation count...") + adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) + adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] + + logger.info( + f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" + ) + + logger.info("Writing output data...") + mdata_pb = mu.MuData({"rna": adata_pb}) + mdata_pb.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/create_pseudobulk/differential_expression", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/create_pseudobulk/differential_expression/nextflow.config b/target/nextflow/create_pseudobulk/differential_expression/nextflow.config new file mode 100644 index 00000000..573130b0 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'create_pseudobulk/differential_expression' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Generation of pseudobulk samples from single-cell transcriptomics data,\nby aggregating raw gene expression counts from individual cells to create \nbulk-like expression profiles suitable for differential expression analysis\nwith methods designed for bulk differential expression analysis.\nNote that this componentonly considers factors as explanatory variables, \nand excludes covariates from the analysis.\n' + author = 'Dorien Roosen, Jakub Majercik, Dries De Maeyer, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/create_pseudobulk/differential_expression/nextflow_labels.config b/target/nextflow/create_pseudobulk/differential_expression/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/create_pseudobulk/differential_expression/nextflow_schema.json b/target/nextflow/create_pseudobulk/differential_expression/nextflow_schema.json new file mode 100644 index 00000000..ed66ec70 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/nextflow_schema.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "differential_expression", + "description": "Generation of pseudobulk samples from single-cell transcriptomics data,\nby aggregating raw gene expression counts from individual cells to create \nbulk-like expression profiles suitable for differential expression analysis\nwith methods designed for bulk differential expression analysis.\nNote that this componentonly considers factors as explanatory variables, \nand excludes covariates from the analysis.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_label": { + "type": "string", + "description": ".obs field containing the variable to group on", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "obs_groups": { + "type": "array", + "items": { + "type": "string" + }, + "description": ".obs fields containing the experimental condition(s) to create pseudobulk samples for.", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file containing the aggregated pseudobulk samples.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "aggregation_method": { + "type": "string", + "description": "Method to aggregate the raw counts for pseudoreplicates", + "help_text": "Type: `string`, multiple: `False`, default: `\"sum\"`, choices: ``sum`, `mean``. ", + "enum": [ + "sum", + "mean" + ], + "default": "sum" + }, + "min_obs_per_sample": { + "type": "integer", + "description": "Minimum number of cells per pseudobulk sample.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + }, + "random_state": { + "type": "integer", + "description": "The random seed for sampling.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "obs_cell_count": { + "type": "string", + "description": "The .obs field to store the number of observations per pseudobulk sample.", + "help_text": "Type: `string`, multiple: `False`, default: `\"n_cells\"`. ", + "default": "n_cells" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/create_pseudobulk/differential_expression/setup_logger.py b/target/nextflow/create_pseudobulk/differential_expression/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/create_pseudobulk/differential_expression/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dataflow/concatenate_h5mu/.config.vsh.yaml b/target/nextflow/dataflow/concatenate_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..28f9e13f --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/.config.vsh.yaml @@ -0,0 +1,337 @@ +name: "concatenate_h5mu" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Paths to the different samples to be concatenated." + info: null + example: + - "sample_paths" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Only output concatenated objects for the provided modalities. Outputs\ + \ all modalities by default." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "Names of the different samples that have to be concatenated. Must\ + \ be specified when using '--mode move'.\nIn this case, the ids will be used\ + \ for the columns names of the dataframes registring the conflicts.\nIf specified,\ + \ must be of same length as `--input`.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output location for the concatenated MuData object file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_sample_name" + description: "Name of the .obs key under which to add the sample names." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--other_axis_mode" + description: "How to handle the merging of other axis (var, obs, ...).\n\n -\ + \ None: keep no data\n - same: only keep elements of the matrices which are\ + \ the same in each of the samples\n - unique: only keep elements for which\ + \ there is only 1 possible value (1 value that can occur in multiple samples)\n\ + \ - first: keep the annotation from the first sample\n - only: keep elements\ + \ that show up in only one of the objects (1 unique element in only 1 sample)\n\ + \ - move: identical to 'same', but moving the conflicting values to .varm or\ + \ .obsm\n" + info: null + default: + - "move" + required: false + choices: + - "same" + - "unique" + - "first" + - "only" + - "concat" + - "move" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_merge_mode" + description: "How to handle the merging of .uns across modalities\n - None: keep\ + \ no data\n - same: only keep elements of the matrices which are the same in\ + \ each of the samples\n - unique: only keep elements for which there is only\ + \ 1 possible value (1 value that can occur in multiple samples)\n - first:\ + \ keep the annotation from the first sample\n - only: keep elements that show\ + \ up in only one of the objects (1 unique element in only 1 sample)\n - make_unique:\ + \ identical to 'unique', but keys which are not unique are made unique by prefixing\ + \ them with the sample id.\n" + info: null + default: + - "make_unique" + required: false + choices: + - "same" + - "unique" + - "first" + - "only" + - "make_unique" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Concatenate observations from samples in several (uni- and/or multi-modal)\ + \ MuData files into a single file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +- type: "file" + path: "human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "highmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "pandas~=2.1.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/concatenate_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dataflow/concatenate_h5mu" + executable: "target/nextflow/dataflow/concatenate_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dataflow/concatenate_h5mu/compress_h5mu.py b/target/nextflow/dataflow/concatenate_h5mu/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dataflow/concatenate_h5mu/main.nf b/target/nextflow/dataflow/concatenate_h5mu/main.nf new file mode 100644 index 00000000..0e44e514 --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/main.nf @@ -0,0 +1,4370 @@ +// concatenate_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "concatenate_h5mu", + "namespace" : "dataflow", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Paths to the different samples to be concatenated.", + "example" : [ + "sample_paths" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Only output concatenated objects for the provided modalities. Outputs all modalities by default.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_id", + "description" : "Names of the different samples that have to be concatenated. Must be specified when using '--mode move'.\nIn this case, the ids will be used for the columns names of the dataframes registring the conflicts.\nIf specified, must be of same length as `--input`.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output location for the concatenated MuData object file.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_sample_name", + "description" : "Name of the .obs key under which to add the sample names.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--other_axis_mode", + "description" : "How to handle the merging of other axis (var, obs, ...).\n\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - move: identical to 'same', but moving the conflicting values to .varm or .obsm\n", + "default" : [ + "move" + ], + "required" : false, + "choices" : [ + "same", + "unique", + "first", + "only", + "concat", + "move" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_merge_mode", + "description" : "How to handle the merging of .uns across modalities\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - make_unique: identical to 'unique', but keys which are not unique are made unique by prefixing them with the sample id.\n", + "default" : [ + "make_unique" + ], + "required" : false, + "choices" : [ + "same", + "unique", + "first", + "only", + "make_unique" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "highmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "pandas~=2.1.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dataflow/concatenate_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dataflow/concatenate_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from __future__ import annotations +import sys +import anndata +import mudata as mu +import pandas as pd +import numpy as np +from collections.abc import Iterable +from multiprocessing import Pool +from pathlib import Path +from h5py import File as H5File +from typing import Literal +import shutil + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_sample_name': $( if [ ! -z ${VIASH_PAR_OBS_SAMPLE_NAME+x} ]; then echo "r'${VIASH_PAR_OBS_SAMPLE_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'other_axis_mode': $( if [ ! -z ${VIASH_PAR_OTHER_AXIS_MODE+x} ]; then echo "r'${VIASH_PAR_OTHER_AXIS_MODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_merge_mode': $( if [ ! -z ${VIASH_PAR_UNS_MERGE_MODE+x} ]; then echo "r'${VIASH_PAR_UNS_MERGE_MODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import compress_h5mu +from setup_logger import setup_logger + +logger = setup_logger() + + +def nunique(row): + unique = pd.unique(row) + unique_without_na = pd.core.dtypes.missing.remove_na_arraylike(unique) + return len(unique_without_na) > 1 + + +def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) -> bool: + """ + Check if any row contains duplicate values, that are not NA. + """ + numpy_array = frame.to_numpy() + with Pool(n_processes) as pool: + is_duplicated = pool.map(nunique, iter(numpy_array)) + return any(is_duplicated) + + +def concatenate_matrices( + n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index +) -> tuple[ + dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype] +]: + """ + Merge matrices by combining columns that have the same name. + Columns that contain conflicting values (e.i. the columns have different values), + are not merged, but instead moved to a new dataframe. + """ + column_names = set(column_name for var in matrices.values() for column_name in var) + logger.debug("Trying to concatenate columns: %s.", ",".join(column_names)) + if not column_names: + return {}, pd.DataFrame(index=align_to) + conflicts, concatenated_matrix = split_conflicts_and_concatenated_columns( + n_processes, matrices, column_names, align_to + ) + concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix) + conflicts = { + conflict_name: cast_to_writeable_dtype(conflict_df) + for conflict_name, conflict_df in conflicts.items() + } + return conflicts, concatenated_matrix + + +def get_first_non_na_value_vector(df): + numpy_arr = df.to_numpy() + n_rows, n_cols = numpy_arr.shape + col_index = pd.isna(numpy_arr).argmin(axis=1) + flat_index = n_cols * np.arange(n_rows) + col_index + return pd.Series(numpy_arr.ravel()[flat_index], index=df.index, name=df.columns[0]) + + +def make_uns_keys_unique(mod_data, concatenated_data): + """ + Check if the uns keys across samples are unique before adding them + to the final concatenated object. If a conflict occurs between the samples, + add the sample ID to make the key unique again. + """ + all_uns_keys = {} + for sample_id, mod in mod_data.items(): + for uns_key, _ in mod.uns.items(): + all_uns_keys.setdefault(uns_key, []).append(sample_id) + for uns_key, samples_ids in all_uns_keys.items(): + assert samples_ids + if len(samples_ids) == 1: + sample_id = samples_ids[0] + concatenated_data.uns[uns_key] = mod_data[sample_id].uns[uns_key] + else: + for sample_id in samples_ids: + concatenated_data.uns[f"{sample_id}_{uns_key}"] = mod_data[ + sample_id + ].uns[uns_key] + return concatenated_data + + +def split_conflicts_and_concatenated_columns( + n_processes: int, + matrices: dict[str, pd.DataFrame], + column_names: Iterable[str], + align_to: pd.Index, +) -> tuple[dict[str, pd.DataFrame], pd.DataFrame]: + """ + Retrieve columns with the same name from a list of dataframes which are + identical across all the frames (ignoring NA values). + Columns which are not the same are regarded as 'conflicts', + which are stored in seperate dataframes, one per columns + with the same name that store conflicting values. + """ + conflicts = {} + concatenated_matrix = [] + for column_name in column_names: + columns = { + input_id: var[column_name] + for input_id, var in matrices.items() + if column_name in var + } + assert columns, "Some columns should have been found." + concatenated_columns = pd.concat( + columns.values(), axis=1, join="outer", sort=False + ) + if any_row_contains_duplicate_values(n_processes, concatenated_columns): + concatenated_columns.columns = ( + columns.keys() + ) # Use the sample id as column name + concatenated_columns = concatenated_columns.reindex(align_to, copy=False) + conflicts[f"conflict_{column_name}"] = concatenated_columns + else: + unique_values = get_first_non_na_value_vector(concatenated_columns) + concatenated_matrix.append(unique_values) + if not concatenated_matrix: + return conflicts, pd.DataFrame(index=align_to) + concatenated_matrix = pd.concat( + concatenated_matrix, join="outer", axis=1, sort=False + ) + concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False) + return conflicts, concatenated_matrix + + +def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame: + """ + Cast the dataframe to dtypes that can be written by mudata. + """ + # dtype inferral workfs better with np.nan + result = result.replace({pd.NA: np.nan}) + + # MuData supports nullable booleans and ints + # ie. \\`IntegerArray\\` and \\`BooleanArray\\` + result = result.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # Convert leftover 'object' columns to string + # However, na values are supported, so convert all values except NA's to string + object_cols = result.select_dtypes(include="object").columns.values + for obj_col in object_cols: + result[obj_col] = ( + result[obj_col] + .where(result[obj_col].isna(), result[obj_col].astype(str)) + .astype("category") + ) + return result + + +def split_conflicts_modalities( + n_processes: int, samples: dict[str, anndata.AnnData], output: anndata.AnnData +) -> anndata.AnnData: + """ + Merge .var and .obs matrices of the anndata objects. Columns are merged + when the values (excl NA) are the same in each of the matrices. + Conflicting columns are moved to a separate dataframe (one dataframe for each column, + containing all the corresponding column from each sample). + """ + matrices_to_parse = ("var", "obs") + for matrix_name in matrices_to_parse: + matrices = { + sample_id: getattr(sample, matrix_name) + for sample_id, sample in samples.items() + } + output_index = getattr(output, matrix_name).index + conflicts, concatenated_matrix = concatenate_matrices( + n_processes, matrices, output_index + ) + if concatenated_matrix.empty: + concatenated_matrix.index = output_index + + # Even though we did not touch the varm and obsm matrices that were already present, + # the joining of observations might have caused a dtype change in these matrices as well + # so these also need to be casted to a writable dtype... + for multidim_name, multidim_data in getattr(output, f"{matrix_name}m").items(): + new_data = ( + cast_to_writeable_dtype(multidim_data) + if isinstance(multidim_data, pd.DataFrame) + else multidim_data + ) + getattr(output, f"{matrix_name}m")[multidim_name] = new_data + + # Write the conflicts to the output + for conflict_name, conflict_data in conflicts.items(): + getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data + + # Set other annotation matrices in the output + setattr(output, matrix_name, concatenated_matrix) + + return output + + +def concatenate_modality( + n_processes: int, + mod: str | None, + input_files: Iterable[str | Path], + other_axis_mode: str, + uns_merge_mode: str, + input_ids: tuple[str], +) -> anndata.AnnData: + concat_modes = { + "move": "unique", + } + other_axis_mode_to_apply = concat_modes.get(other_axis_mode, other_axis_mode) + + uns_merge_modes = {"make_unique": None} + uns_merge_mode_to_apply = uns_merge_modes.get(uns_merge_mode, uns_merge_mode) + + mod_data = {} + mod_indices_combined = pd.Index([]) + for input_id, input_file in zip(input_ids, input_files): + if mod is not None: + try: + data = mu.read_h5ad(input_file, mod=mod) + mod_data[input_id] = data + mod_indices_combined = mod_indices_combined.append(data.obs.index) + except KeyError as e: # Modality does not exist for this sample, skip it + if ( + f"Unable to synchronously open object (object '{mod}' doesn't exist)" + not in str(e) + ): + raise e + pass + else: # When mod=None, process the 'global' h5mu state + with H5File(input_file, "r") as input_h5: + if "uns" in input_h5.keys(): + uns_data = anndata.experimental.read_elem(input_h5["uns"]) + if uns_data: + mod_data[input_id] = anndata.AnnData(uns=uns_data) + + if not mod_indices_combined.is_unique: + raise ValueError("Observations are not unique across samples.") + + if not mod_data: + return anndata.AnnData() + + concatenated_data = anndata.concat( + mod_data.values(), + join="outer", + merge=other_axis_mode_to_apply, + uns_merge=uns_merge_mode_to_apply, + ) + + if other_axis_mode == "move": + concatenated_data = split_conflicts_modalities( + n_processes, mod_data, concatenated_data + ) + + if uns_merge_mode == "make_unique": + concatenated_data = make_uns_keys_unique(mod_data, concatenated_data) + + return concatenated_data + + +def concatenate_modalities( + n_processes: int, + modalities: list[str], + input_files: Path | str, + other_axis_mode: str, + uns_merge_mode: str, + output_file: Path | str, + compression: Literal["gzip"] | Literal["lzf"], + input_ids: tuple[str] | None = None, +) -> None: + """ + Join the modalities together into a single multimodal sample. + """ + logger.info("Concatenating samples.") + output_file, input_files = ( + Path(output_file), + [Path(input_file) for input_file in input_files], + ) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + # Create empty mudata file + mdata = mu.MuData({modality: anndata.AnnData() for modality in modalities}) + mdata.write(output_file_uncompressed, compression=compression) + + # Use "None" for the global slots (not assigned to any modality) + for mod_name in modalities + [ + None, + ]: + new_mod = concatenate_modality( + n_processes, + mod_name, + input_files, + other_axis_mode, + uns_merge_mode, + input_ids, + ) + if mod_name is None: + if new_mod.uns: + with H5File(output_file_uncompressed, "r+") as open_h5mu_file: + anndata.experimental.write_elem( + open_h5mu_file, "uns", dict(new_mod.uns) + ) + continue + logger.info( + "Writing out modality '%s' to '%s' with compression '%s'.", + mod_name, + output_file_uncompressed, + compression, + ) + mu.write_h5ad(output_file_uncompressed, data=new_mod, mod=mod_name) + + if compression: + compress_h5mu(output_file_uncompressed, output_file, compression=compression) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + logger.info("Concatenation successful.") + + +def main() -> None: + # Get a list of all possible modalities + mods = set() + for path in par["input"]: + try: + with H5File(path, "r") as f_root: + mods = mods | set(f_root["mod"].keys()) + except OSError: + raise OSError(f"Failed to load {path}. Is it a valid h5 file?") + + input_ids = None + if par["input_id"]: + input_ids: tuple[str] = tuple(i.strip() for i in par["input_id"]) + if len(input_ids) != len(par["input"]): + raise ValueError( + "The number of sample names must match the number of sample files." + ) + + if len(set(input_ids)) != len(input_ids): + raise ValueError("The sample names should be unique.") + + logger.info("\\\\nConcatenating data from paths:\\\\n\\\\t%s", "\\\\n\\\\t".join(par["input"])) + + if par["other_axis_mode"] == "move" and not input_ids: + raise ValueError("--mode 'move' requires --input_ids.") + + n_processes = meta["cpus"] if meta["cpus"] else 1 + + if par["modality"]: + par["modality"] = set(par["modality"]) + if not par["modality"].issubset(mods): + mods_joined, input_mods_joined = ", ".join(mods), ", ".join(par["modality"]) + raise ValueError( + f"One of the modalities provided ({input_mods_joined}) is not available in the input data {mods_joined}" + ) + mods = par["modality"] + + concatenate_modalities( + n_processes, + list(mods), + par["input"], + par["other_axis_mode"], + par["uns_merge_mode"], + par["output"], + par["output_compression"], + input_ids=input_ids, + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dataflow/concatenate_h5mu", + "tag" : "main" + }, + "label" : [ + "midcpu", + "highmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dataflow/concatenate_h5mu/nextflow.config b/target/nextflow/dataflow/concatenate_h5mu/nextflow.config new file mode 100644 index 00000000..3102220b --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dataflow/concatenate_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dataflow/concatenate_h5mu/nextflow_labels.config b/target/nextflow/dataflow/concatenate_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dataflow/concatenate_h5mu/nextflow_schema.json b/target/nextflow/dataflow/concatenate_h5mu/nextflow_schema.json new file mode 100644 index 00000000..d511fad1 --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/nextflow_schema.json @@ -0,0 +1,110 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "concatenate_h5mu", + "description": "Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the different samples to be concatenated.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_paths\"]`. " + }, + "modality": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only output concatenated objects for the provided modalities", + "help_text": "Type: `string`, multiple: `True`. " + }, + "input_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Names of the different samples that have to be concatenated", + "help_text": "Type: `string`, multiple: `True`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output location for the concatenated MuData object file.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obs_sample_name": { + "type": "string", + "description": "Name of the .obs key under which to add the sample names.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "other_axis_mode": { + "type": "string", + "description": "How to handle the merging of other axis (var, obs, ...).\n\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - move: identical to 'same', but moving the conflicting values to .varm or .obsm\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"move\"`, choices: ``same`, `unique`, `first`, `only`, `concat`, `move``. ", + "enum": [ + "same", + "unique", + "first", + "only", + "concat", + "move" + ], + "default": "move" + }, + "uns_merge_mode": { + "type": "string", + "description": "How to handle the merging of .uns across modalities\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - make_unique: identical to 'unique', but keys which are not unique are made unique by prefixing them with the sample id.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"make_unique\"`, choices: ``same`, `unique`, `first`, `only`, `make_unique``. ", + "enum": [ + "same", + "unique", + "first", + "only", + "make_unique" + ], + "default": "make_unique" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dataflow/concatenate_h5mu/setup_logger.py b/target/nextflow/dataflow/concatenate_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dataflow/concatenate_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dataflow/merge/.config.vsh.yaml b/target/nextflow/dataflow/merge/.config.vsh.yaml new file mode 100644 index 00000000..933dd308 --- /dev/null +++ b/target/nextflow/dataflow/merge/.config.vsh.yaml @@ -0,0 +1,251 @@ +name: "merge" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Paths to the single-modality .h5mu files that need to be combined" + info: null + default: + - "sample_paths" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Path to the output file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Combine one or more single-modality .h5mu files together into one .h5mu\ + \ file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "highmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/merge/config.vsh.yml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dataflow/merge" + executable: "target/nextflow/dataflow/merge/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dataflow/merge/main.nf b/target/nextflow/dataflow/merge/main.nf new file mode 100644 index 00000000..e3c1405e --- /dev/null +++ b/target/nextflow/dataflow/merge/main.nf @@ -0,0 +1,3973 @@ +// merge main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "merge", + "namespace" : "dataflow", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Paths to the single-modality .h5mu files that need to be combined", + "default" : [ + "sample_paths" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Path to the output file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Combine one or more single-modality .h5mu files together into one .h5mu file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/merge_test_data/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu" + }, + { + "type" : "file", + "path" : "../../../resources_test/merge_test_data/pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "highmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dataflow/merge/config.vsh.yml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dataflow/merge", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from __future__ import annotations +import sys +import mudata as md +import pandas as pd +import numpy as np + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info("Reading input files %s", ",".join(par["input"])) + input_samples = [md.read_h5mu(path) for path in par["input"]] + + logger.info("Merging into single object.") + sample_modalities = {} + for input_sample in input_samples: + for mod_name, mod_data in input_sample.mod.items(): + if mod_name in sample_modalities: + raise ValueError( + f"Modality '{mod_name}' was found in more than 1 sample." + ) + sample_modalities[mod_name] = mod_data + + merged = md.MuData(sample_modalities) + merged.update() + for df_attr in ("var", "obs"): + df = getattr(merged, df_attr) + df = df.replace({pd.NA: np.nan}, inplace=False) + + # MuData supports nullable booleans and ints + # ie. \\`IntegerArray\\` and \\`BooleanArray\\` + df = df.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ) + + # pd.convert_dtypes does not convert already existing nullable dtypes + # to their native numpy dtypes. + # At this point, integer and boolean columns use a nullable dtype; and string columns + # might use 'object' or pd.StringDtype. This is OK. + # However, for example floating number columns might still use a nullable dtype + # from before the call to convert_dtypes. So we need to explicitly cast. + col_dtypes = df.select_dtypes( + exclude=["integer", "string", "object", "boolean"] + ).dtypes.to_dict() + numpy_dtypes = { + col_name: np.dtype(col_dtype.kind) + for col_name, col_dtype in col_dtypes.items() + if pd.api.types.is_extension_array_dtype(col_dtype) + } + df = df.astype(numpy_dtypes) + + # Convert leftover 'object' columns to string + object_cols = df.select_dtypes(include="object").columns.values + for obj_col in object_cols: + df[obj_col] = df[obj_col].astype(str).astype("category") + + setattr(merged, df_attr, df) + + merged.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dataflow/merge", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "highmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dataflow/merge/nextflow.config b/target/nextflow/dataflow/merge/nextflow.config new file mode 100644 index 00000000..48c02fdb --- /dev/null +++ b/target/nextflow/dataflow/merge/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dataflow/merge' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Combine one or more single-modality .h5mu files together into one .h5mu file.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dataflow/merge/nextflow_labels.config b/target/nextflow/dataflow/merge/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dataflow/merge/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dataflow/merge/nextflow_schema.json b/target/nextflow/dataflow/merge/nextflow_schema.json new file mode 100644 index 00000000..2f20aa05 --- /dev/null +++ b/target/nextflow/dataflow/merge/nextflow_schema.json @@ -0,0 +1,64 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "merge", + "description": "Combine one or more single-modality .h5mu files together into one .h5mu file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the single-modality .h5mu files that need to be combined", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"sample_paths\"]`, direction: `input`. ", + "default": [ + "sample_paths" + ] + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to the output file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dataflow/merge/setup_logger.py b/target/nextflow/dataflow/merge/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dataflow/merge/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dataflow/split_h5mu/.config.vsh.yaml b/target/nextflow/dataflow/split_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..7c135594 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/.config.vsh.yaml @@ -0,0 +1,291 @@ +name: "split_h5mu" +namespace: "dataflow" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input & specifications" + arguments: + - type: "file" + name: "--input" + description: "Path to a single .h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_feature" + description: "The .obs column to split the mudata on." + info: null + example: + - "celltype" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--drop_obs_nan" + description: "Whether to drop all .obs columns that contain only nan values after\ + \ splitting." + info: null + direction: "input" + - type: "boolean_true" + name: "--ensure_unique_filenames" + description: "Append number suffixes to ensure unique filenames after sanitizing\ + \ obs feature values." + info: null + direction: "input" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_files" + description: "A csv containing the base filename and obs feature by which it was\ + \ split." + info: null + example: + - "sample_files.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split the samples of a single modality from a .h5mu (multimodal) sample\ + \ into seperate .h5mu files based on the values of an .obs column of this modality.\ + \ \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/split_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dataflow/split_h5mu" + executable: "target/nextflow/dataflow/split_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dataflow/split_h5mu/main.nf b/target/nextflow/dataflow/split_h5mu/main.nf new file mode 100644 index 00000000..d023fc13 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/main.nf @@ -0,0 +1,4035 @@ +// split_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_h5mu", + "namespace" : "dataflow", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input & specifications", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to a single .h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_feature", + "description" : "The .obs column to split the mudata on.", + "example" : [ + "celltype" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--drop_obs_nan", + "description" : "Whether to drop all .obs columns that contain only nan values after splitting.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--ensure_unique_filenames", + "description" : "Append number suffixes to ensure unique filenames after sanitizing obs feature values.", + "direction" : "input" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output directory containing multiple h5mu files.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_files", + "description" : "A csv containing the base filename and obs feature by which it was split.", + "example" : [ + "sample_files.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dataflow/split_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dataflow/split_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import pandas as pd +import re +import gc +from pathlib import Path +from collections import defaultdict + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_feature': $( if [ ! -z ${VIASH_PAR_OBS_FEATURE+x} ]; then echo "r'${VIASH_PAR_OBS_FEATURE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'drop_obs_nan': $( if [ ! -z ${VIASH_PAR_DROP_OBS_NAN+x} ]; then echo "r'${VIASH_PAR_DROP_OBS_NAN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'ensure_unique_filenames': $( if [ ! -z ${VIASH_PAR_ENSURE_UNIQUE_FILENAMES+x} ]; then echo "r'${VIASH_PAR_ENSURE_UNIQUE_FILENAMES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_files': $( if [ ! -z ${VIASH_PAR_OUTPUT_FILES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FILES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main(): + logger.info(f"Reading {par['input']}") + input_file = Path(par["input"].strip()) + + mdata = mu.read_h5mu(input_file) + adata = mdata.mod[par["modality"]] + + logger.info(f"Reading unique features from {par['obs_feature']}") + obs_features = adata.obs[par["obs_feature"]].unique().tolist() + + # sanitize --obs_feature values + obs_features_s = [re.sub(r"[-\\\\s]", "_", str(s).strip()) for s in obs_features] + obs_features_s = [re.sub(r"[^A-Za-z0-9_]", "", s) for s in obs_features_s] + + # ensure that names are unique, if not raise or append number as suffix + if not len(obs_features_s) == len(set(obs_features_s)): + if not par["ensure_unique_filenames"]: + raise ValueError( + f"File names are not unique after sanitizing the --obs_feature {par['obs_feature']} values" + ) + + logger.info("Ensuring unique names for par['obs_feature']") + counts = defaultdict(lambda: -1) + for i, feature in enumerate(obs_features_s): + counts[feature] += 1 + if (curr_counts := counts[feature]) > 0: + obs_features_s[i] += f"_{curr_counts}" + + # generate output dir + output_dir = Path(par["output"]) + if not output_dir.is_dir(): + output_dir.mkdir(parents=True) + + # split modality of mdata file base on obs_feature + logger.info(f"Splitting file based on {par['obs_feature']} values {obs_features}") + obs_files = [] + + for obs_name, file_name in zip(obs_features, obs_features_s): + logger.info( + f"Filtering modality '{par['modality']}' observations by .obs['{par['obs_feature']}'] == {obs_name}" + ) + mdata_obs = mdata.copy() + adata_obs = mdata_obs.mod[par["modality"]] + + # split the samples + adata_obs = adata_obs[adata_obs.obs[par["obs_feature"]] == obs_name] + mdata_obs_name = f"{input_file.stem}_{file_name}.h5mu" + obs_files.append(mdata_obs_name) + + # Dropping columns that only have nan values after splitting + if par["drop_obs_nan"]: + logger.info("Dropping all .obs columns with NaN values") + adata_obs.obs.dropna(axis=1, how="all", inplace=True) + + # replace mdata file with modality adata contianing split samples + logger.info( + f"Writing h5mu filtered for {par['obs_feature']} {obs_name} to file {output_dir / mdata_obs_name}" + ) + mdata_obs.mod[par["modality"]] = adata_obs + mdata_obs.write_h5mu( + output_dir / mdata_obs_name, compression=par["output_compression"] + ) + + # avoid keeping files in memory + del mdata_obs + del adata_obs + gc.collect() + + logger.info(f"Writing output_files CSV file to {par['output_files']}") + df = pd.DataFrame({"name": obs_features_s, "filename": obs_files}) + df.to_csv(par["output_files"], index=False) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dataflow/split_h5mu", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dataflow/split_h5mu/nextflow.config b/target/nextflow/dataflow/split_h5mu/nextflow.config new file mode 100644 index 00000000..eaf43298 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dataflow/split_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. \n' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dataflow/split_h5mu/nextflow_labels.config b/target/nextflow/dataflow/split_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dataflow/split_h5mu/nextflow_schema.json b/target/nextflow/dataflow/split_h5mu/nextflow_schema.json new file mode 100644 index 00000000..e69056e6 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/nextflow_schema.json @@ -0,0 +1,98 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "split_h5mu", + "description": "Split the samples of a single modality from a .h5mu (multimodal) sample into seperate .h5mu files based on the values of an .obs column of this modality. \n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory containing multiple h5mu files.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "$id.$key.output" + }, + "output_files": { + "type": "string", + "format": "path", + "description": "A csv containing the base filename and obs feature by which it was split.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_files.csv\"`, direction: `output`, example: `\"sample_files.csv\"`. ", + "default": "$id.$key.output_files.csv" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "input & specifications": { + "title": "Input & specifications", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to a single .h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_feature": { + "type": "string", + "description": "The .obs column to split the mudata on.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"celltype\"`. " + }, + "drop_obs_nan": { + "type": "boolean", + "description": "Whether to drop all .obs columns that contain only nan values after splitting.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "ensure_unique_filenames": { + "type": "boolean", + "description": "Append number suffixes to ensure unique filenames after sanitizing obs feature values.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/input & specifications" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dataflow/split_h5mu/setup_logger.py b/target/nextflow/dataflow/split_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dataflow/split_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dataflow/split_modalities/.config.vsh.yaml b/target/nextflow/dataflow/split_modalities/.config.vsh.yaml new file mode 100644 index 00000000..1748166d --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/.config.vsh.yaml @@ -0,0 +1,278 @@ +name: "split_modalities" +namespace: "dataflow" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to a single .h5mu file." + info: null + default: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containing multiple h5mu files." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_types" + description: "A csv containing the base filename and modality type per output\ + \ file." + info: null + example: + - "types.csv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split the modalities from a single .h5mu multimodal sample into seperate\ + \ .h5mu files. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dataflow/split_modalities/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dataflow/split_modalities" + executable: "target/nextflow/dataflow/split_modalities/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dataflow/split_modalities/main.nf b/target/nextflow/dataflow/split_modalities/main.nf new file mode 100644 index 00000000..91e7a1a1 --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/main.nf @@ -0,0 +1,3991 @@ +// split_modalities main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Robrecht Cannoodt (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "split_modalities", + "namespace" : "dataflow", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to a single .h5mu file.", + "default" : [ + "sample_path" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory containing multiple h5mu files.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_types", + "description" : "A csv containing the base filename and modality type per output file.", + "example" : [ + "types.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dataflow/split_modalities/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dataflow/split_modalities", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from __future__ import annotations +import sys +import mudata as md +from pathlib import Path +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_types': $( if [ ! -z ${VIASH_PAR_OUTPUT_TYPES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_TYPES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def main() -> None: + output_dir = Path(par["output"]) + logger.info("Creating output directory '%s' if it does not exist", output_dir) + if not output_dir.is_dir(): + logger.info("Creating %s", output_dir) + output_dir.mkdir(parents=True) + + logger.info("Reading input file '%s'", par["input"]) + input_file = Path(par["input"].strip()) + sample = md.read_h5mu(input_file) + + logger.info("Creating output types CSV.") + modalities = list(sample.mod.keys()) + + logger.info("Found the following modalities:\\\\n%s", "\\\\n".join(modalities)) + names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu" for mod_name in modalities} + output_files = list(names.values()) + logger.info( + "Will be creating the following output .h5mu files:\\\\n%s", + "\\\\n".join(output_files), + ) + df = pd.DataFrame({"name": modalities, "filename": output_files}) + logger.info("Writing output_types CSV file to '%s'.", par["output_types"]) + df.to_csv(par["output_types"], index=False) + + logger.info("Splitting input file into unimodal output files.") + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + new_sample = md.MuData({mod_name: mod}) + logger.info( + "Writing to '%s', with compression '%s'", + names[mod_name], + par["output_compression"], + ) + new_sample.write_h5mu( + output_dir / names[mod_name], compression=par["output_compression"] + ) + logger.info("Done writing output file.") + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dataflow/split_modalities", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dataflow/split_modalities/nextflow.config b/target/nextflow/dataflow/split_modalities/nextflow.config new file mode 100644 index 00000000..297fd179 --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dataflow/split_modalities' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. \n' + author = 'Dries Schaumont, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dataflow/split_modalities/nextflow_labels.config b/target/nextflow/dataflow/split_modalities/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dataflow/split_modalities/nextflow_schema.json b/target/nextflow/dataflow/split_modalities/nextflow_schema.json new file mode 100644 index 00000000..7e4a6d1f --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/nextflow_schema.json @@ -0,0 +1,66 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "split_modalities", + "description": "Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to a single .h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"sample_path\"`, direction: `input`. ", + "default": "sample_path" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output directory containing multiple h5mu files.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "$id.$key.output" + }, + "output_types": { + "type": "string", + "format": "path", + "description": "A csv containing the base filename and modality type per output file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_types.csv\"`, direction: `output`, example: `\"types.csv\"`. ", + "default": "$id.$key.output_types.csv" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dataflow/split_modalities/setup_logger.py b/target/nextflow/dataflow/split_modalities/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dataflow/split_modalities/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/demux/bcl2fastq/.config.vsh.yaml b/target/nextflow/demux/bcl2fastq/.config.vsh.yaml new file mode 100644 index 00000000..680eac18 --- /dev/null +++ b/target/nextflow/demux/bcl2fastq/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "bcl2fastq" +namespace: "demux" +version: "main" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + - "--runfolder_dir" + description: "Input run directory" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + alternatives: + - "-s" + description: "Pointer to the sample sheet" + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containig fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--ignore_missing" + description: "Interpret missing *.bcl files as no call (N), interpret missing\ + \ control files as not-set\ncontrol bits and fill in with zeros when *.stats\ + \ files are missing.\n" + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert bcl files to fastq files using bcl2fastq.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/bcl2fastq:2.20" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/bcl2fastq/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/demux/bcl2fastq" + executable: "target/nextflow/demux/bcl2fastq/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/demux/bcl2fastq/main.nf b/target/nextflow/demux/bcl2fastq/main.nf new file mode 100644 index 00000000..07a4e385 --- /dev/null +++ b/target/nextflow/demux/bcl2fastq/main.nf @@ -0,0 +1,3898 @@ +// bcl2fastq main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Toni Verbeiren (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bcl2fastq", + "namespace" : "demux", + "version" : "main", + "authors" : [ + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i", + "--runfolder_dir" + ], + "description" : "Input run directory", + "example" : [ + "bcl_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_sheet", + "alternatives" : [ + "-s" + ], + "description" : "Pointer to the sample sheet", + "example" : [ + "SampleSheet.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory containig fastq files", + "example" : [ + "fastq_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reports", + "description" : "Reports directory", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--ignore_missing", + "description" : "Interpret missing *.bcl files as no call (N), interpret missing control files as not-set\ncontrol bits and fill in with zeros when *.stats files are missing.\n", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert bcl files to fastq files using bcl2fastq.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_bcl/bcl" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/bcl2fastq:2.20", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/demux/bcl2fastq/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/demux/bcl2fastq", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_PAR_IGNORE_MISSING+x} ]; then echo "${VIASH_PAR_IGNORE_MISSING}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ignore_missing='&'#" ; else echo "# par_ignore_missing="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -exo pipefail + +extra_params=() + +# Handle reports stored separate +if [ ! -z "\\$par_reports" ]; then + extra_params+=("--reports-dir" "\\$par_reports") +fi + +# Handle the boolean flag +if [ "\\$par_ignore_missing" == "true" ]; then + extra_params+=("--ignore-missing-control" "--ignore-missing-bcl" "--ignore-missing-filter") +fi + +# Run the actual command +bcl2fastq \\\\ + --runfolder-dir "\\$par_input" \\\\ + --sample-sheet "\\$par_sample_sheet" \\\\ + --output-dir "\\$par_output" \\\\ + "\\${extra_params[@]}" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/demux/bcl2fastq", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/demux/bcl2fastq/nextflow.config b/target/nextflow/demux/bcl2fastq/nextflow.config new file mode 100644 index 00000000..b94695e5 --- /dev/null +++ b/target/nextflow/demux/bcl2fastq/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'demux/bcl2fastq' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert bcl files to fastq files using bcl2fastq.\n' + author = 'Toni Verbeiren' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/demux/bcl2fastq/nextflow_labels.config b/target/nextflow/demux/bcl2fastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/demux/bcl2fastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/demux/bcl2fastq/nextflow_schema.json b/target/nextflow/demux/bcl2fastq/nextflow_schema.json new file mode 100644 index 00000000..187efad7 --- /dev/null +++ b/target/nextflow/demux/bcl2fastq/nextflow_schema.json @@ -0,0 +1,69 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bcl2fastq", + "description": "Convert bcl files to fastq files using bcl2fastq.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input run directory", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"bcl_dir\"`. " + }, + "sample_sheet": { + "type": "string", + "format": "path", + "exists": true, + "description": "Pointer to the sample sheet", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"SampleSheet.csv\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output directory containig fastq files", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"fastq_dir\"`. ", + "default": "$id.$key.output" + }, + "reports": { + "type": "string", + "format": "path", + "description": "Reports directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.reports\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.reports" + }, + "ignore_missing": { + "type": "boolean", + "description": "Interpret missing *.bcl files as no call (N), interpret missing control files as not-set\ncontrol bits and fill in with zeros when *.stats files are missing.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/demux/bcl_convert/.config.vsh.yaml b/target/nextflow/demux/bcl_convert/.config.vsh.yaml new file mode 100644 index 00000000..547a6e16 --- /dev/null +++ b/target/nextflow/demux/bcl_convert/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "bcl_convert" +namespace: "demux" +version: "main" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +- name: "Marijke Van Moerbeke" + roles: + - "author" + info: + role: "Contributor" + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input run directory" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + alternatives: + - "-s" + description: "Pointer to the sample sheet" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory containig fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--test_mode" + description: "Should bcl-convert be run in test mode (using --first-tile-only)?" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--strict_mode" + description: "Abort if any files are missing." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tiles" + description: "Process only a subset of tiles by a regular expression." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--exclude_tiles" + description: "Exclude set of tiles by a regular expression" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_lane_splitting" + description: "Wheter to avoid splitting FASTQ file by lane." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert bcl files to fastq files using bcl-convert.\nInformation about\ + \ upgrading from bcl2fastq via\nhttps://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html\n\ + and https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "bcl2" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/bclconvert:4.2" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/bcl_convert/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/demux/bcl_convert" + executable: "target/nextflow/demux/bcl_convert/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/demux/bcl_convert/main.nf b/target/nextflow/demux/bcl_convert/main.nf new file mode 100644 index 00000000..3b2c49db --- /dev/null +++ b/target/nextflow/demux/bcl_convert/main.nf @@ -0,0 +1,4009 @@ +// bcl_convert main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Toni Verbeiren (author, maintainer) +// * Marijke Van Moerbeke (author) +// * Weiwei Schultz (contributor) +// * Dorien Roosen (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bcl_convert", + "namespace" : "demux", + "version" : "main", + "authors" : [ + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input run directory", + "example" : [ + "bcl_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_sheet", + "alternatives" : [ + "-s" + ], + "description" : "Pointer to the sample sheet", + "example" : [ + "bcl_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory containig fastq files", + "example" : [ + "fastq_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reports", + "description" : "Reports directory", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--test_mode", + "description" : "Should bcl-convert be run in test mode (using --first-tile-only)?", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--strict_mode", + "description" : "Abort if any files are missing.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tiles", + "description" : "Process only a subset of tiles by a regular expression.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--exclude_tiles", + "description" : "Exclude set of tiles by a regular expression", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--no_lane_splitting", + "description" : "Wheter to avoid splitting FASTQ file by lane.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert bcl files to fastq files using bcl-convert.\nInformation about upgrading from bcl2fastq via\nhttps://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html\nand https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_bcl/bcl2" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/bclconvert:4.2", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/demux/bcl_convert/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/demux/bcl_convert", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_PAR_TEST_MODE+x} ]; then echo "${VIASH_PAR_TEST_MODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_test_mode='&'#" ; else echo "# par_test_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_STRICT_MODE+x} ]; then echo "${VIASH_PAR_STRICT_MODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_strict_mode='&'#" ; else echo "# par_strict_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_TILES+x} ]; then echo "${VIASH_PAR_TILES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_tiles='&'#" ; else echo "# par_tiles="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_TILES+x} ]; then echo "${VIASH_PAR_EXCLUDE_TILES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_exclude_tiles='&'#" ; else echo "# par_exclude_tiles="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_LANE_SPLITTING+x} ]; then echo "${VIASH_PAR_NO_LANE_SPLITTING}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_lane_splitting='&'#" ; else echo "# par_no_lane_splitting="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +[ -d "\\$par_output" ] || mkdir -p "\\$par_output" + +bcl-convert \\\\ + --force \\\\ + --bcl-input-directory "\\$par_input" \\\\ + --output-directory "\\$par_output" \\\\ + --sample-sheet "\\$par_sample_sheet" \\\\ + --first-tile-only "\\$par_test_mode" \\\\ + --strict-mode "\\$par_strict_mode" \\\\ + \\${par_no_lane_splitting:+--no-lane-splitting "\\$par_no_lane_splitting"} \\\\ + \\${par_tiles:+--tiles \\$par_tiles} \\\\ + \\${par_exclude_tiles:+--exclude-tiles \\$par_exclude_tiles} + + +if [ ! -z "\\$par_reports" ]; then + echo "Moving reports to its own location" + mv "\\$par_output"/Reports "\\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/demux/bcl_convert", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/demux/bcl_convert/nextflow.config b/target/nextflow/demux/bcl_convert/nextflow.config new file mode 100644 index 00000000..22329585 --- /dev/null +++ b/target/nextflow/demux/bcl_convert/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'demux/bcl_convert' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert bcl files to fastq files using bcl-convert.\nInformation about upgrading from bcl2fastq via\nhttps://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html\nand https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html\n' + author = 'Toni Verbeiren, Marijke Van Moerbeke, Weiwei Schultz, Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/demux/bcl_convert/nextflow_labels.config b/target/nextflow/demux/bcl_convert/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/demux/bcl_convert/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/demux/bcl_convert/nextflow_schema.json b/target/nextflow/demux/bcl_convert/nextflow_schema.json new file mode 100644 index 00000000..7ced9962 --- /dev/null +++ b/target/nextflow/demux/bcl_convert/nextflow_schema.json @@ -0,0 +1,90 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bcl_convert", + "description": "Convert bcl files to fastq files using bcl-convert.\nInformation about upgrading from bcl2fastq via\nhttps://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html\nand https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input run directory", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"bcl_dir\"`. " + }, + "sample_sheet": { + "type": "string", + "format": "path", + "exists": true, + "description": "Pointer to the sample sheet", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"bcl_dir\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output directory containig fastq files", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"fastq_dir\"`. ", + "default": "$id.$key.output" + }, + "reports": { + "type": "string", + "format": "path", + "description": "Reports directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.reports\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.reports" + }, + "test_mode": { + "type": "boolean", + "description": "Should bcl-convert be run in test mode (using --first-tile-only)?", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "strict_mode": { + "type": "boolean", + "description": "Abort if any files are missing.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "tiles": { + "type": "string", + "description": "Process only a subset of tiles by a regular expression.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "exclude_tiles": { + "type": "string", + "description": "Exclude set of tiles by a regular expression", + "help_text": "Type: `string`, multiple: `False`. " + }, + "no_lane_splitting": { + "type": "boolean", + "description": "Wheter to avoid splitting FASTQ file by lane.", + "help_text": "Type: `boolean`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/.config.vsh.yaml b/target/nextflow/demux/cellranger_atac_mkfastq/.config.vsh.yaml new file mode 100644 index 00000000..60081379 --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/.config.vsh.yaml @@ -0,0 +1,286 @@ +name: "cellranger_atac_mkfastq" +namespace: "demux" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Path of Illumina BCL run folder." + info: null + example: + - "/path/to/bcl" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--csv" + description: "The path to the simple layout sample sheet." + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--lanes" + description: "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex.\ + \ Use this if you have a sample sheet for an entire flow cell but only want\ + \ to generate a few lanes for further 10x Genomics analysis." + info: null + example: + - "1,3" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "string" + name: "--use_bases_mask" + description: "bcl2fastq option. Use to clip extra bases off a read if you ran\ + \ extra cycles for QC." + info: null + example: + - "y50n,I6n,Y50n" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "boolean_true" + name: "--delete_undetermined" + description: "bcl2fastq option. Delete the Undetermined FASTQs generated by bcl2fastq.\ + \ Useful if you are demultiplexing a small number of samples from a large flow\ + \ cell." + info: null + direction: "input" + - type: "integer" + name: "--barcode_mismatches" + description: "bcl2fastq option. Use this option to change the number of allowed\ + \ mismatches per index adapter (0, 1, 2)." + info: null + default: + - 1 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "The folder to store the demux results" + info: null + example: + - "/path/to/output" + default: + - "fastqs" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demultiplex raw sequencing data for ATAC experiments" +usage: "cellranger_atac_mkfastq \\\n --input /path/to/bcl \\\n --csv simple_layout_sample_sheet.csv\ + \ \\\n --output /path/to/output\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_atac_tiny_bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_atac:2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update \\\n&& apt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/cellranger_atac_mkfastq/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/demux/cellranger_atac_mkfastq" + executable: "target/nextflow/demux/cellranger_atac_mkfastq/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/main.nf b/target/nextflow/demux/cellranger_atac_mkfastq/main.nf new file mode 100644 index 00000000..055349a5 --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/main.nf @@ -0,0 +1,3979 @@ +// cellranger_atac_mkfastq main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_atac_mkfastq", + "namespace" : "demux", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path of Illumina BCL run folder.", + "example" : [ + "/path/to/bcl" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--csv", + "description" : "The path to the simple layout sample sheet.", + "example" : [ + "SampleSheet.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--lanes", + "description" : "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex. Use this if you have a sample sheet for an entire flow cell but only want to generate a few lanes for further 10x Genomics analysis.", + "example" : [ + "1,3" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "string", + "name" : "--use_bases_mask", + "description" : "bcl2fastq option. Use to clip extra bases off a read if you ran extra cycles for QC.", + "example" : [ + "y50n,I6n,Y50n" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "boolean_true", + "name" : "--delete_undetermined", + "description" : "bcl2fastq option. Delete the Undetermined FASTQs generated by bcl2fastq. Useful if you are demultiplexing a small number of samples from a large flow cell.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--barcode_mismatches", + "description" : "bcl2fastq option. Use this option to change the number of allowed mismatches per index adapter (0, 1, 2).", + "default" : [ + 1 + ], + "required" : false, + "min" : 0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "The folder to store the demux results", + "example" : [ + "/path/to/output" + ], + "default" : [ + "fastqs" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reports", + "description" : "Reports directory", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Demultiplex raw sequencing data for ATAC experiments", + "usage" : "cellranger_atac_mkfastq \\\\\n --input /path/to/bcl \\\\\n --csv simple_layout_sample_sheet.csv \\\\\n --output /path/to/output\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_atac_tiny_bcl" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger_atac:2.1", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update \\\\\n&& apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/demux/cellranger_atac_mkfastq/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/demux/cellranger_atac_mkfastq", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_CSV+x} ]; then echo "${VIASH_PAR_CSV}" | sed "s#'#'\\"'\\"'#g;s#.*#par_csv='&'#" ; else echo "# par_csv="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_BASES_MASK+x} ]; then echo "${VIASH_PAR_USE_BASES_MASK}" | sed "s#'#'\\"'\\"'#g;s#.*#par_use_bases_mask='&'#" ; else echo "# par_use_bases_mask="; fi ) +$( if [ ! -z ${VIASH_PAR_DELETE_UNDETERMINED+x} ]; then echo "${VIASH_PAR_DELETE_UNDETERMINED}" | sed "s#'#'\\"'\\"'#g;s#.*#par_delete_undetermined='&'#" ; else echo "# par_delete_undetermined="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE_MISMATCHES+x} ]; then echo "${VIASH_PAR_BARCODE_MISMATCHES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_barcode_mismatches='&'#" ; else echo "# par_barcode_mismatches="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "\\$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="\\$tmpdir/bcl" + mkdir -p "\\$input_dir" + tar -xzf "\\$par_input" -C "\\$input_dir" --strip-components=1 +else + input_dir="\\$par_input" +fi + + +if [ ! -z "\\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\\`python -c "print(int('\\$meta_memory_gb') - 2)"\\` +fi + + +echo "Running cellranger-atac mkfastq" + +id=myoutput + +IFS="," +cellranger-atac mkfastq \\\\ + --id "\\$id" \\\\ + --csv "\\$par_csv" \\\\ + --run "\\$par_input" \\\\ + --disable-ui \\\\ + --output-dir "\\$par_output" \\\\ + \\${meta_cpus:+--localcores=\\$meta_cpus} \\\\ + \\${memory_gb:+--localmem=\\$memory_gb} \\\\ + \\${par_lanes:+--lanes=\\${par_lanes[*]}} \\\\ + \\${par_use_bases_mask:+--use-bases-mask=\\${par_use_bases_mask[*]}} \\\\ + \\${par_delete_undetermined:+--delete-undetermined} \\\\ + \\${par_barcode_mismatches:+--barcode-mismatches=\\$par_barcode_mismatches} +unset IFS + +# Move reports to their own output location +if [ ! -z "\\$par_reports" ]; then + echo "Moving reports its own location" + mv "\\$par_output"/Reports "\\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/demux/cellranger_atac_mkfastq", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/nextflow.config b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow.config new file mode 100644 index 00000000..cf922ece --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'demux/cellranger_atac_mkfastq' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Demultiplex raw sequencing data for ATAC experiments' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_labels.config b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_schema.json b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_schema.json new file mode 100644 index 00000000..cdd93b60 --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/nextflow_schema.json @@ -0,0 +1,91 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_atac_mkfastq", + "description": "Demultiplex raw sequencing data for ATAC experiments", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path of Illumina BCL run folder.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/bcl\"`. " + }, + "csv": { + "type": "string", + "format": "path", + "exists": true, + "description": "The path to the simple layout sample sheet.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"SampleSheet.csv\"`. " + }, + "lanes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "bcl2fastq option", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1,3\"]`. " + }, + "use_bases_mask": { + "type": "array", + "items": { + "type": "string" + }, + "description": "bcl2fastq option", + "help_text": "Type: `string`, multiple: `True`, example: `[\"y50n,I6n,Y50n\"]`. " + }, + "delete_undetermined": { + "type": "boolean", + "description": "bcl2fastq option", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "barcode_mismatches": { + "type": "integer", + "description": "bcl2fastq option", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "output": { + "type": "string", + "format": "path", + "description": "The folder to store the demux results", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"fastqs\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "fastqs" + }, + "reports": { + "type": "string", + "format": "path", + "description": "Reports directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.reports\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.reports" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/demux/cellranger_atac_mkfastq/setup_logger.py b/target/nextflow/demux/cellranger_atac_mkfastq/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/demux/cellranger_atac_mkfastq/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/demux/cellranger_mkfastq/.config.vsh.yaml b/target/nextflow/demux/cellranger_mkfastq/.config.vsh.yaml new file mode 100644 index 00000000..54bd5f59 --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/.config.vsh.yaml @@ -0,0 +1,277 @@ +name: "cellranger_mkfastq" +namespace: "demux" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Path to the (untarred) BCL files. Expects 'RunParameters.xml' at\ + \ './'." + info: null + example: + - "/path/to/bcl" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + description: "The path to the sample sheet." + info: null + example: + - "SampleSheet.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "The folder to store the demux results" + info: null + example: + - "/path/to/output" + default: + - "fastqs" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reports" + description: "Reports directory" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demultiplex raw sequencing data" +usage: "cellranger_mkfastq \\\n --input /path/to/bcl \\\n --sample_sheet SampleSheet.csv\ + \ \\\n --output /path/to/output\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_bcl" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/demux/cellranger_mkfastq/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/demux/cellranger_mkfastq" + executable: "target/nextflow/demux/cellranger_mkfastq/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/demux/cellranger_mkfastq/main.nf b/target/nextflow/demux/cellranger_mkfastq/main.nf new file mode 100644 index 00000000..717d6497 --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/main.nf @@ -0,0 +1,3986 @@ +// cellranger_mkfastq main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Samuel D'Souza (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_mkfastq", + "namespace" : "demux", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Samuel D'Souza", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "srdsam", + "linkedin" : "samuel-d-souza-887023150/" + }, + "organizations" : [ + { + "name" : "Chan Zuckerberg Biohub", + "href" : "https://www.czbiohub.org", + "role" : "Data Engineer" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the (untarred) BCL files. Expects 'RunParameters.xml' at './'.", + "example" : [ + "/path/to/bcl" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_sheet", + "description" : "The path to the sample sheet.", + "example" : [ + "SampleSheet.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "The folder to store the demux results", + "example" : [ + "/path/to/output" + ], + "default" : [ + "fastqs" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reports", + "description" : "Reports directory", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Demultiplex raw sequencing data", + "usage" : "cellranger_mkfastq \\\\\n --input /path/to/bcl \\\\\n --sample_sheet SampleSheet.csv \\\\\n --output /path/to/output\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_bcl" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger:9.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/demux/cellranger_mkfastq/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/demux/cellranger_mkfastq", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_SHEET+x} ]; then echo "${VIASH_PAR_SAMPLE_SHEET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sample_sheet='&'#" ; else echo "# par_sample_sheet="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORTS+x} ]; then echo "${VIASH_PAR_REPORTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reports='&'#" ; else echo "# par_reports="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# if par_input not is a folder, untar first +if [ ! -d "\\$par_input" ]; then + echo "Assuming input is a tar.gz, untarring" + input_dir="\\$tmpdir/bcl" + mkdir -p "\\$input_dir" + tar -xzf "\\$par_input" -C "\\$input_dir" --strip-components=1 +else + input_dir="\\$par_input" +fi + + +# add additional params +extra_params=( ) + +if [ ! -z "\\$meta_cpus" ]; then + extra_params+=( "--localcores=\\$meta_cpus" ) +fi +if [ ! -z "\\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\\`python -c "print(int('\\$meta_memory_gb') - 2)"\\` + extra_params+=( "--localmem=\\$memory_gb" ) +fi + + +echo "Running cellranger demux" + +id=myoutput + +cellranger mkfastq \\\\ + --id "\\$id" \\\\ + --csv "\\$par_sample_sheet" \\\\ + --run "\\$par_input" \\\\ + "\\${extra_params[@]}" \\\\ + --disable-ui \\\\ + --output-dir "\\$par_output" + +# Move reports to their own output location +if [ ! -z "\\$par_reports" ]; then + echo "Moving reports its own location" + mv "\\$par_output"/Reports "\\$par_reports" +else + echo "Leaving reports alone" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/demux/cellranger_mkfastq", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/demux/cellranger_mkfastq/nextflow.config b/target/nextflow/demux/cellranger_mkfastq/nextflow.config new file mode 100644 index 00000000..20b8db61 --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'demux/cellranger_mkfastq' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Demultiplex raw sequencing data' + author = 'Angela Oliveira Pisco, Samuel D\'Souza, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/demux/cellranger_mkfastq/nextflow_labels.config b/target/nextflow/demux/cellranger_mkfastq/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/demux/cellranger_mkfastq/nextflow_schema.json b/target/nextflow/demux/cellranger_mkfastq/nextflow_schema.json new file mode 100644 index 00000000..f75ca40c --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/nextflow_schema.json @@ -0,0 +1,63 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_mkfastq", + "description": "Demultiplex raw sequencing data", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the (untarred) BCL files", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/bcl\"`. " + }, + "sample_sheet": { + "type": "string", + "format": "path", + "exists": true, + "description": "The path to the sample sheet.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"SampleSheet.csv\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "The folder to store the demux results", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"fastqs\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "fastqs" + }, + "reports": { + "type": "string", + "format": "path", + "description": "Reports directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.reports\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.reports" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/demux/cellranger_mkfastq/setup_logger.py b/target/nextflow/demux/cellranger_mkfastq/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/demux/cellranger_mkfastq/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dimred/densmap/.config.vsh.yaml b/target/nextflow/dimred/densmap/.config.vsh.yaml new file mode 100644 index 00000000..2b4cca1b --- /dev/null +++ b/target/nextflow/dimred/densmap/.config.vsh.yaml @@ -0,0 +1,442 @@ +name: "densmap" +namespace: "dimred" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_neighbors" + description: "The `.uns` neighbors slot as output by the `find_neighbors` component." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_pca" + description: "The slot in `.obsm` where the PCA results are stored.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The .obsm key to use for storing the densMAP results.." + info: null + default: + - "X_densmap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments UMAP" + arguments: + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result \nin a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn \ncloser together, while larger values will result\ + \ on a more even dispersal of points. \nThe value should be set relative to\ + \ the spread value, which determines the scale at \nwhich embedded points will\ + \ be spread out. \n" + info: null + default: + - 0.5 + required: false + min: 0.0 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--spread" + description: "The effective scale of embedded points. In combination with `min_dist`\ + \ this \ndetermines how clustered/clumped the embedded points are.\n" + info: null + default: + - 1.0 + required: false + min: 0.0 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "The number of dimensions of the embedding." + info: null + default: + - 2 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "The number of iterations (epochs) of the optimization. Called `n_epochs`\ + \ \nin the original UMAP. Default is set to 500 if \nneighbors['connectivities'].shape[0]\ + \ <= 10000, else 200.\n" + info: null + default: + - 0 + required: false + min: 0 + max: 1000 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "The initial learning rate for the embedding optimization." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--gamma" + description: "Weighting applied to negative samples in low dimensional embedding\ + \ optimization. \nValues higher than one will result in greater weight being\ + \ given to negative samples.\n" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--negative_sample_rate" + description: "The number of negative samples to select per positive sample\nin\ + \ the optimization process. Increasing this value will result\nin greater repulsive\ + \ force being applied, greater optimization\ncost, but slightly more accuracy.\n" + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--init_pos" + description: "How to initialize the low dimensional embedding. Called `init` in\ + \ the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions\ + \ from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`:\ + \ assign initial embedding positions at random.\n" + info: null + default: + - "spectral" + required: false + choices: + - "paga" + - "spectral" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments densMAP" + arguments: + - type: "double" + name: "--lambda" + description: "Controls the regularization weight of the density correlation term\ + \ in densMAP. \nHigher values prioritize density preservation over the UMAP\ + \ objective, and vice versa \nfor values closer to zero. Setting this parameter\ + \ to zero is equivalent to running \nthe original UMAP algorithm.\n" + info: null + default: + - 2.0 + required: false + min: 0.01 + max: 10.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--fraction" + description: "Controls the fraction of epochs (between 0 and 1) where the density-augmented\ + \ objective \nis used in densMAP. The first (1 - dens_frac) fraction of epochs\ + \ optimize the original \nUMAP objective before introducing the density correlation\ + \ term.\n" + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--var_shift" + description: "A small constant added to the variance of local radii in the embedding\ + \ when calculating \nthe density correlation objective to prevent numerical\ + \ instability from dividing by a \nsmall number.\n" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A modification of UMAP that adds an extra cost term in order to preserve\ + \ information \nabout the relative local density of the data. It is performed on\ + \ the same inputs as UMAP.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "umap-learn" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/densmap/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dimred/densmap" + executable: "target/nextflow/dimred/densmap/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dimred/densmap/compress_h5mu.py b/target/nextflow/dimred/densmap/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dimred/densmap/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dimred/densmap/main.nf b/target/nextflow/dimred/densmap/main.nf new file mode 100644 index 00000000..45d34cb0 --- /dev/null +++ b/target/nextflow/dimred/densmap/main.nf @@ -0,0 +1,4203 @@ +// densmap main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "densmap", + "namespace" : "dimred", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "The `.uns` neighbors slot as output by the `find_neighbors` component.", + "default" : [ + "neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_pca", + "description" : "The slot in `.obsm` where the PCA results are stored.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "The .obsm key to use for storing the densMAP results..", + "default" : [ + "X_densmap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments UMAP", + "arguments" : [ + { + "type" : "double", + "name" : "--min_dist", + "description" : "The effective minimum distance between embedded points. Smaller values will result \nin a more clustered/clumped embedding where nearby points on the manifold are drawn \ncloser together, while larger values will result on a more even dispersal of points. \nThe value should be set relative to the spread value, which determines the scale at \nwhich embedded points will be spread out. \n", + "default" : [ + 0.5 + ], + "required" : false, + "min" : 0.0, + "max" : 10.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--spread", + "description" : "The effective scale of embedded points. In combination with `min_dist` this \ndetermines how clustered/clumped the embedded points are.\n", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "max" : 10.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_components", + "description" : "The number of dimensions of the embedding.", + "default" : [ + 2 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_iter", + "description" : "The number of iterations (epochs) of the optimization. Called `n_epochs` \nin the original UMAP. Default is set to 500 if \nneighbors['connectivities'].shape[0] <= 10000, else 200.\n", + "default" : [ + 0 + ], + "required" : false, + "min" : 0, + "max" : 1000, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alpha", + "description" : "The initial learning rate for the embedding optimization.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--gamma", + "description" : "Weighting applied to negative samples in low dimensional embedding optimization. \nValues higher than one will result in greater weight being given to negative samples.\n", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--negative_sample_rate", + "description" : "The number of negative samples to select per positive sample\nin the optimization process. Increasing this value will result\nin greater repulsive force being applied, greater optimization\ncost, but slightly more accuracy.\n", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--init_pos", + "description" : "How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`: assign initial embedding positions at random.\n", + "default" : [ + "spectral" + ], + "required" : false, + "choices" : [ + "paga", + "spectral", + "random" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments densMAP", + "arguments" : [ + { + "type" : "double", + "name" : "--lambda", + "description" : "Controls the regularization weight of the density correlation term in densMAP. \nHigher values prioritize density preservation over the UMAP objective, and vice versa \nfor values closer to zero. Setting this parameter to zero is equivalent to running \nthe original UMAP algorithm.\n", + "default" : [ + 2.0 + ], + "required" : false, + "min" : 0.01, + "max" : 10.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--fraction", + "description" : "Controls the fraction of epochs (between 0 and 1) where the density-augmented objective \nis used in densMAP. The first (1 - dens_frac) fraction of epochs optimize the original \nUMAP objective before introducing the density correlation term.\n", + "default" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--var_shift", + "description" : "A small constant added to the variance of local radii in the embedding when calculating \nthe density correlation objective to prevent numerical instability from dividing by a \nsmall number.\n", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A modification of UMAP that adds an extra cost term in order to preserve information \nabout the relative local density of the data. It is performed on the same inputs as UMAP.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "umap-learn" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dimred/densmap/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dimred/densmap", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from umap import UMAP +import mudata as mu +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_neighbors': $( if [ ! -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then echo "r'${VIASH_PAR_UNS_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_pca': $( if [ ! -z ${VIASH_PAR_OBSM_PCA+x} ]; then echo "r'${VIASH_PAR_OBSM_PCA//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'spread': $( if [ ! -z ${VIASH_PAR_SPREAD+x} ]; then echo "float(r'${VIASH_PAR_SPREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gamma': $( if [ ! -z ${VIASH_PAR_GAMMA+x} ]; then echo "float(r'${VIASH_PAR_GAMMA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'negative_sample_rate': $( if [ ! -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then echo "int(r'${VIASH_PAR_NEGATIVE_SAMPLE_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'init_pos': $( if [ ! -z ${VIASH_PAR_INIT_POS+x} ]; then echo "r'${VIASH_PAR_INIT_POS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'lambda': $( if [ ! -z ${VIASH_PAR_LAMBDA+x} ]; then echo "float(r'${VIASH_PAR_LAMBDA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'fraction': $( if [ ! -z ${VIASH_PAR_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'var_shift': $( if [ ! -z ${VIASH_PAR_VAR_SHIFT+x} ]; then echo "float(r'${VIASH_PAR_VAR_SHIFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +try: + data = mu.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' not found in the input data." + ) from e + +logger.info("Computing densMAP for modality '%s'", par["modality"]) + +neigh_key = par["uns_neighbors"] + +if neigh_key not in data.uns: + raise ValueError( + f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first." + ) + +temp_uns = {neigh_key: data.uns[neigh_key]} + +if "use_rep" not in temp_uns[neigh_key]["params"]: + raise ValueError( + f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first." + ) + + +X_densmap = UMAP( + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + n_epochs=par["max_iter"], + learning_rate=par["alpha"], + repulsion_strength=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init=par["init_pos"], + metric=data.uns["neighbors"].get("metric", "euclidean"), + metric_kwds=data.uns["neighbors"].get("metric_kwds", {}), + densmap=True, + dens_lambda=par["lambda"], + dens_frac=par["fraction"], + dens_var_shift=par["var_shift"], +).fit_transform(data.obsm[par["obsm_pca"]]) + +logger.info( + f"Writing densMAP embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = X_densmap + +logger.info(f"Writing densMAP metadata to .mod[{par['modality']}].uns['densmap']") +data.uns["densmap"] = { + "params": { + "min_dist": par["min_dist"], + "spread": par["spread"], + "n_components": par["num_components"], + "n_epochs": par["max_iter"], + "learning_rate": par["alpha"], + "repulsion_strength": par["gamma"], + "negative_sample_rate": par["negative_sample_rate"], + "init": par["init_pos"], + "metric": data.uns["neighbors"].get("metric", "euclidean"), + "metric_kwds": data.uns["neighbors"].get("metric_kwds", {}), + "dens_lambda": par["lambda"], + "dens_frac": par["fraction"], + "dens_var_shift": par["var_shift"], + } +} + +logger.info("Writing to %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dimred/densmap", + "tag" : "main" + }, + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dimred/densmap/nextflow.config b/target/nextflow/dimred/densmap/nextflow.config new file mode 100644 index 00000000..dc15b358 --- /dev/null +++ b/target/nextflow/dimred/densmap/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dimred/densmap' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A modification of UMAP that adds an extra cost term in order to preserve information \nabout the relative local density of the data. It is performed on the same inputs as UMAP.\n' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dimred/densmap/nextflow_labels.config b/target/nextflow/dimred/densmap/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dimred/densmap/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dimred/densmap/nextflow_schema.json b/target/nextflow/dimred/densmap/nextflow_schema.json new file mode 100644 index 00000000..eff1e201 --- /dev/null +++ b/target/nextflow/dimred/densmap/nextflow_schema.json @@ -0,0 +1,182 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "densmap", + "description": "A modification of UMAP that adds an extra cost term in order to preserve information \nabout the relative local density of the data. It is performed on the same inputs as UMAP.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "uns_neighbors": { + "type": "string", + "description": "The `.uns` neighbors slot as output by the `find_neighbors` component.", + "help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ", + "default": "neighbors" + }, + "obsm_pca": { + "type": "string", + "description": "The slot in `.obsm` where the PCA results are stored.\n", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_output": { + "type": "string", + "description": "The .obsm key to use for storing the densMAP results..", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_densmap\"`. ", + "default": "X_densmap" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments umap": { + "title": "Arguments UMAP", + "type": "object", + "description": "No description", + "properties": { + "min_dist": { + "type": "number", + "description": "The effective minimum distance between embedded points", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "spread": { + "type": "number", + "description": "The effective scale of embedded points", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "num_components": { + "type": "integer", + "description": "The number of dimensions of the embedding.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "max_iter": { + "type": "integer", + "description": "The number of iterations (epochs) of the optimization", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "alpha": { + "type": "number", + "description": "The initial learning rate for the embedding optimization.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "gamma": { + "type": "number", + "description": "Weighting applied to negative samples in low dimensional embedding optimization", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "negative_sample_rate": { + "type": "integer", + "description": "The number of negative samples to select per positive sample\nin the optimization process", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "init_pos": { + "type": "string", + "description": "How to initialize the low dimensional embedding", + "help_text": "Type: `string`, multiple: `False`, default: `\"spectral\"`, choices: ``paga`, `spectral`, `random``. ", + "enum": [ + "paga", + "spectral", + "random" + ], + "default": "spectral" + } + } + }, + "arguments densmap": { + "title": "Arguments densMAP", + "type": "object", + "description": "No description", + "properties": { + "lambda": { + "type": "number", + "description": "Controls the regularization weight of the density correlation term in densMAP", + "help_text": "Type: `double`, multiple: `False`, default: `2.0`. ", + "default": 2.0 + }, + "fraction": { + "type": "number", + "description": "Controls the fraction of epochs (between 0 and 1) where the density-augmented objective \nis used in densMAP", + "help_text": "Type: `double`, multiple: `False`, default: `0.3`. ", + "default": 0.3 + }, + "var_shift": { + "type": "number", + "description": "A small constant added to the variance of local radii in the embedding when calculating \nthe density correlation objective to prevent numerical instability from dividing by a \nsmall number.\n", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments umap" + }, + { + "$ref": "#/$defs/arguments densmap" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dimred/densmap/setup_logger.py b/target/nextflow/dimred/densmap/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dimred/densmap/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dimred/lsi/.config.vsh.yaml b/target/nextflow/dimred/lsi/.config.vsh.yaml new file mode 100644 index 00000000..d2af6f38 --- /dev/null +++ b/target/nextflow/dimred/lsi/.config.vsh.yaml @@ -0,0 +1,361 @@ +name: "lsi" +namespace: "dimred" +version: "main" +authors: +- name: "Sarah Ouologuem" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "SarahOuologuem" + orcid: "0009-0005-3398-1700" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +- name: "Vladimir Shitov" + roles: + - "contributor" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "On which modality to run LSI on." + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Use specified layer for expression values. If not specified, uses\ + \ adata.X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "Column name in .var matrix that will be used to select which genes\ + \ to run the LSI on. If not specified, uses all features." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "LSI options" + arguments: + - type: "integer" + name: "--num_components" + description: "Number of components to compute." + info: null + default: + - 50 + required: false + min: 2 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--scale_embeddings" + description: "Scale embeddings to zero mean and unit variance." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting embedding." + info: null + default: + - "X_lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_output" + description: "In which .varm slot to store the resulting loadings matrix." + info: null + default: + - "lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "In which .uns slot to store the stdev." + info: null + default: + - "lsi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite" + description: "Allow overwriting .obsm, .varm and .uns slots." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings\ + \ and singular values. Uses the implementation of scipy.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "concat_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "muon~=0.1.6" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/lsi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dimred/lsi" + executable: "target/nextflow/dimred/lsi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dimred/lsi/compress_h5mu.py b/target/nextflow/dimred/lsi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dimred/lsi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dimred/lsi/main.nf b/target/nextflow/dimred/lsi/main.nf new file mode 100644 index 00000000..610d1c54 --- /dev/null +++ b/target/nextflow/dimred/lsi/main.nf @@ -0,0 +1,4150 @@ +// lsi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Sarah Ouologuem (contributor) +// * Vladimir Shitov (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "lsi", + "namespace" : "dimred", + "version" : "main", + "authors" : [ + { + "name" : "Sarah Ouologuem", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "SarahOuologuem", + "orcid" : "0009-0005-3398-1700" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + }, + { + "name" : "Vladimir Shitov", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "On which modality to run LSI on.", + "default" : [ + "atac" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Use specified layer for expression values. If not specified, uses adata.X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "Column name in .var matrix that will be used to select which genes to run the LSI on. If not specified, uses all features.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "LSI options", + "arguments" : [ + { + "type" : "integer", + "name" : "--num_components", + "description" : "Number of components to compute.", + "default" : [ + 50 + ], + "required" : false, + "min" : 2, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--scale_embeddings", + "description" : "Scale embeddings to zero mean and unit variance.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting embedding.", + "default" : [ + "X_lsi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--varm_output", + "description" : "In which .varm slot to store the resulting loadings matrix.", + "default" : [ + "lsi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_output", + "description" : "In which .uns slot to store the stdev.", + "default" : [ + "lsi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--overwrite", + "description" : "Allow overwriting .obsm, .varm and .uns slots.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values. Uses the implementation of scipy.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "../../../resources_test/concat_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "pkg-config", + "libhdf5-dev", + "gcc" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "muon~=0.1.6" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dimred/lsi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dimred/lsi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import muon as mu +import mudata as md +from anndata import AnnData +import numpy as np +import sys + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scale_embeddings': $( if [ ! -z ${VIASH_PAR_SCALE_EMBEDDINGS+x} ]; then echo "r'${VIASH_PAR_SCALE_EMBEDDINGS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varm_output': $( if [ ! -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_VARM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'overwrite': $( if [ ! -z ${VIASH_PAR_OVERWRITE+x} ]; then echo "r'${VIASH_PAR_OVERWRITE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from subset_vars import subset_vars + +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from setup_logger import setup_logger + +logger = setup_logger() + + +# 1.read in adata +logger.info("Reading %s.", par["input"]) +try: + adata = md.read_h5ad(par["input"], mod=par["modality"]) +except KeyError as e: + raise ValueError( + f"Modality '{par['modality']}' was not found in mudata {par['input']}." + ) from e + +# 3. Specify layer +if par["layer"] and par["layer"] not in adata.layers: + raise ValueError( + f"Layer '{par['layer']}' was not found in modality '{par['modality']}'." + ) +layer = adata.X if not par["layer"] else adata.layers[par["layer"]] +adata_input_layer = AnnData(layer, var=adata.var) + + +if not par["layer"]: + logger.info("Using modality '%s' and adata.X for LSI computation", par["modality"]) +else: + logger.info( + "Using modality '%s' and layer '%s' for LSI computation", + par["modality"], + par["layer"], + ) + + +# 4. Subset on highly variable features if applicable +if par["var_input"]: + adata_input_layer = subset_vars(adata_input_layer, par["var_input"]) + + +# 5. Run LSI +logger.info( + "Computing %s LSI components on %s features", + par["num_components"], + adata_input_layer.X.shape[1], +) +mu.atac.tl.lsi( + adata_input_layer, + scale_embeddings=par["scale_embeddings"], + n_comps=par["num_components"], +) + + +# 6. Store output in object +check_exist_dict = { + "obsm_output": ("obsm"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(adata, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(adata, field)[par[parameter_name]] + +adata.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_lsi"] +adata.uns[par["uns_output"]] = adata_input_layer.uns["lsi"] +if par["var_input"]: + adata.varm[par["varm_output"]] = np.zeros( + shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]) + ) + adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = ( + adata_input_layer.varm["LSI"] + ) +else: + adata.varm[par["varm_output"]] = adata_input_layer.varm["LSI"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dimred/lsi", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dimred/lsi/nextflow.config b/target/nextflow/dimred/lsi/nextflow.config new file mode 100644 index 00000000..2e659962 --- /dev/null +++ b/target/nextflow/dimred/lsi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dimred/lsi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values. Uses the implementation of scipy.\n' + author = 'Sarah Ouologuem, Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dimred/lsi/nextflow_labels.config b/target/nextflow/dimred/lsi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dimred/lsi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dimred/lsi/nextflow_schema.json b/target/nextflow/dimred/lsi/nextflow_schema.json new file mode 100644 index 00000000..9d90010f --- /dev/null +++ b/target/nextflow/dimred/lsi/nextflow_schema.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "lsi", + "description": "Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values. Uses the implementation of scipy.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "On which modality to run LSI on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"atac\"`. ", + "default": "atac" + }, + "layer": { + "type": "string", + "description": "Use specified layer for expression values", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": "Column name in .var matrix that will be used to select which genes to run the LSI on", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_lsi\"`. ", + "default": "X_lsi" + }, + "varm_output": { + "type": "string", + "description": "In which .varm slot to store the resulting loadings matrix.", + "help_text": "Type: `string`, multiple: `False`, default: `\"lsi\"`. ", + "default": "lsi" + }, + "uns_output": { + "type": "string", + "description": "In which .uns slot to store the stdev.", + "help_text": "Type: `string`, multiple: `False`, default: `\"lsi\"`. ", + "default": "lsi" + }, + "overwrite": { + "type": "boolean", + "description": "Allow overwriting .obsm, .varm and .uns slots.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "lsi options": { + "title": "LSI options", + "type": "object", + "description": "No description", + "properties": { + "num_components": { + "type": "integer", + "description": "Number of components to compute.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "scale_embeddings": { + "type": "boolean", + "description": "Scale embeddings to zero mean and unit variance.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/lsi options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dimred/lsi/setup_logger.py b/target/nextflow/dimred/lsi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dimred/lsi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dimred/lsi/subset_vars.py b/target/nextflow/dimred/lsi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/dimred/lsi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/dimred/pca/.config.vsh.yaml b/target/nextflow/dimred/pca/.config.vsh.yaml new file mode 100644 index 00000000..28dcb633 --- /dev/null +++ b/target/nextflow/dimred/pca/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "pca" +namespace: "dimred" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "Column name in .var matrix that will be used to select which genes\ + \ to run the PCA on." + info: null + example: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_output" + description: "In which .varm slot to store the resulting loadings matrix." + info: null + default: + - "pca_loadings" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "In which .uns slot to store the resulting variance objects." + info: null + default: + - "pca_variance" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "Number of principal components to compute. Defaults to 50, or 1\ + \ - minimum dimension size of selected representation." + info: null + example: + - 25 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite" + description: "Allow overwriting .obsm, .varm and .uns slots." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Computes PCA coordinates, loadings and variance decomposition. Uses\ + \ the implementation of scikit-learn [Pedregosa11].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/pca/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dimred/pca" + executable: "target/nextflow/dimred/pca/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dimred/pca/compress_h5mu.py b/target/nextflow/dimred/pca/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dimred/pca/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dimred/pca/main.nf b/target/nextflow/dimred/pca/main.nf new file mode 100644 index 00000000..eeb444e6 --- /dev/null +++ b/target/nextflow/dimred/pca/main.nf @@ -0,0 +1,4060 @@ +// pca main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "pca", + "namespace" : "dimred", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Use specified layer for expression values instead of the .X object from the modality.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "Column name in .var matrix that will be used to select which genes to run the PCA on.", + "example" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting embedding.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--varm_output", + "description" : "In which .varm slot to store the resulting loadings matrix.", + "default" : [ + "pca_loadings" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_output", + "description" : "In which .uns slot to store the resulting variance objects.", + "default" : [ + "pca_variance" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_components", + "description" : "Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation.", + "example" : [ + 25 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--overwrite", + "description" : "Allow overwriting .obsm, .varm and .uns slots.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "highmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dimred/pca/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dimred/pca", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import sys +from anndata import AnnData + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varm_output': $( if [ ! -z ${VIASH_PAR_VARM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_VARM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'overwrite': $( if [ ! -z ${VIASH_PAR_OVERWRITE+x} ]; then echo "r'${VIASH_PAR_OVERWRITE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing PCA components for modality '%s'", par["modality"]) +if par["layer"] and par["layer"] not in data.layers: + raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.") +layer = data.X if not par["layer"] else data.layers[par["layer"]] +adata_input_layer = AnnData(layer) +adata_input_layer.var.index = data.var.index + +use_highly_variable = False +if par["var_input"]: + if par["var_input"] not in data.var.columns: + raise ValueError( + f"Requested to use .var column {par['var_input']} " + "as a selection of genes to run the PCA on, " + f"but the column is not available for modality {par['modality']}" + ) + use_highly_variable = True + adata_input_layer.var["highly_variable"] = data.var[par["var_input"]] + +# run pca +output_adata = sc.tl.pca( + adata_input_layer, + n_comps=par["num_components"], + copy=True, + use_highly_variable=use_highly_variable, +) + +# store output in specific objects + +check_exist_dict = { + "obsm_output": ("obs"), + "varm_output": ("varm"), + "uns_output": ("uns"), +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(data, field): + if not par["overwrite"]: + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) + del getattr(data, field)[par[parameter_name]] + +data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"] +data.varm[par["varm_output"]] = output_adata.varm["PCs"] +data.uns[par["uns_output"]] = { + "variance": output_adata.uns["pca"]["variance"], + "variance_ratio": output_adata.uns["pca"]["variance_ratio"], +} + + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dimred/pca", + "tag" : "main" + }, + "label" : [ + "highcpu", + "highmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dimred/pca/nextflow.config b/target/nextflow/dimred/pca/nextflow.config new file mode 100644 index 00000000..41a6f739 --- /dev/null +++ b/target/nextflow/dimred/pca/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dimred/pca' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n' + author = 'Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dimred/pca/nextflow_labels.config b/target/nextflow/dimred/pca/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dimred/pca/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dimred/pca/nextflow_schema.json b/target/nextflow/dimred/pca/nextflow_schema.json new file mode 100644 index 00000000..c9eb0931 --- /dev/null +++ b/target/nextflow/dimred/pca/nextflow_schema.json @@ -0,0 +1,103 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "pca", + "description": "Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": "Column name in .var matrix that will be used to select which genes to run the PCA on.", + "help_text": "Type: `string`, multiple: `False`, example: `\"filter_with_hvg\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "varm_output": { + "type": "string", + "description": "In which .varm slot to store the resulting loadings matrix.", + "help_text": "Type: `string`, multiple: `False`, default: `\"pca_loadings\"`. ", + "default": "pca_loadings" + }, + "uns_output": { + "type": "string", + "description": "In which .uns slot to store the resulting variance objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"pca_variance\"`. ", + "default": "pca_variance" + }, + "num_components": { + "type": "integer", + "description": "Number of principal components to compute", + "help_text": "Type: `integer`, multiple: `False`, example: `25`. " + }, + "overwrite": { + "type": "boolean", + "description": "Allow overwriting .obsm, .varm and .uns slots.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dimred/pca/setup_logger.py b/target/nextflow/dimred/pca/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dimred/pca/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dimred/tsne/.config.vsh.yaml b/target/nextflow/dimred/tsne/.config.vsh.yaml new file mode 100644 index 00000000..af172c49 --- /dev/null +++ b/target/nextflow/dimred/tsne/.config.vsh.yaml @@ -0,0 +1,370 @@ +name: "tsne" +namespace: "dimred" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--use_rep" + description: "The `.obsm` slot to use as input for the tSNE computation." + info: null + example: + - "X_pca" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The .obsm key to use for storing the tSNE results." + info: null + default: + - "X_tsne" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--n_pcs" + description: "The number of principal components to use for the tSNE computation." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--perplexity" + description: "The perplexity is related to the number of nearest neighbors that\ + \ is used in other manifold learning algorithms. Larger datasets usually require\ + \ a larger perplexity. Consider selecting a value between 5 and 50. Different\ + \ values can result in significantly different results." + info: null + default: + - 30.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result in a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn closer together, while larger values will result\ + \ on a more even dispersal of points. The value should be set relative to the\ + \ spread value, which determines the scale at which embedded points will be\ + \ spread out." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--metric" + description: "Distance metric to calculate neighbors on." + info: null + default: + - "euclidean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_exaggeration" + description: "Controls how tight natural clusters in the original space are in\ + \ the embedded space and how much space will be between them. For larger values,\ + \ the space between natural clusters will be larger in the embedded space. Again,\ + \ the choice of this parameter is not very critical. If the cost function increases\ + \ during initial optimization, the early exaggeration factor or the learning\ + \ rate might be too high." + info: null + default: + - 12.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--learning_rate" + description: "The learning rate for t-SNE optimization. Typical values range between\ + \ 10.0 and 1000.0." + info: null + default: + - 1000.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed to use for the tSNE computation." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality\ + \ reduction technique used to visualize high-dimensional data in a low-dimensional\ + \ space, revealing patterns and clusters by preserving local data similarities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/tsne/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dimred/tsne" + executable: "target/nextflow/dimred/tsne/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dimred/tsne/compress_h5mu.py b/target/nextflow/dimred/tsne/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dimred/tsne/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dimred/tsne/main.nf b/target/nextflow/dimred/tsne/main.nf new file mode 100644 index 00000000..74bbbb68 --- /dev/null +++ b/target/nextflow/dimred/tsne/main.nf @@ -0,0 +1,4091 @@ +// tsne main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "tsne", + "namespace" : "dimred", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--use_rep", + "description" : "The `.obsm` slot to use as input for the tSNE computation.", + "example" : [ + "X_pca" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "The .obsm key to use for storing the tSNE results.", + "default" : [ + "X_tsne" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_pcs", + "description" : "The number of principal components to use for the tSNE computation.", + "default" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--perplexity", + "description" : "The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. Different values can result in significantly different results.", + "default" : [ + 30.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_dist", + "description" : "The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--metric", + "description" : "Distance metric to calculate neighbors on.", + "default" : [ + "euclidean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_exaggeration", + "description" : "Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high.", + "default" : [ + 12.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--learning_rate", + "description" : "The learning rate for t-SNE optimization. Typical values range between 10.0 and 1000.0.", + "default" : [ + 1000.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--random_state", + "description" : "The random seed to use for the tSNE computation.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique used to visualize high-dimensional data in a low-dimensional space, revealing patterns and clusters by preserving local data similarities.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dimred/tsne/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dimred/tsne", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'use_rep': $( if [ ! -z ${VIASH_PAR_USE_REP+x} ]; then echo "r'${VIASH_PAR_USE_REP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_pcs': $( if [ ! -z ${VIASH_PAR_N_PCS+x} ]; then echo "int(r'${VIASH_PAR_N_PCS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'perplexity': $( if [ ! -z ${VIASH_PAR_PERPLEXITY+x} ]; then echo "float(r'${VIASH_PAR_PERPLEXITY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'metric': $( if [ ! -z ${VIASH_PAR_METRIC+x} ]; then echo "r'${VIASH_PAR_METRIC//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_exaggeration': $( if [ ! -z ${VIASH_PAR_EARLY_EXAGGERATION+x} ]; then echo "float(r'${VIASH_PAR_EARLY_EXAGGERATION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing tSNE for modality '%s'", par["modality"]) +if par["use_rep"] not in data.obsm.keys(): + raise ValueError( + f"'{par['use_rep']}' was not found in .mod['{par['modality']}'].obsm. No precomputed PCA provided. Please run PCA first." + ) +temp_obsm = {par["use_rep"]: data.obsm[par["use_rep"]]} + +temp_adata = ad.AnnData(obsm=temp_obsm, shape=data.shape) + +sc.tl.tsne( + adata=temp_adata, + n_pcs=par["n_pcs"], + use_rep=par["use_rep"], + perplexity=par["perplexity"], + metric=par["metric"], + early_exaggeration=par["early_exaggeration"], + learning_rate=par["learning_rate"], + random_state=par["random_state"], + n_jobs=meta["cpus"], +) + +logger.info( + f"Writing tSNE embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_tsne"] + +logger.info(f"Writing tSNE metadata to .mod[{par['modality']}].uns['tsne']") +data.uns["tsne"] = temp_adata.uns["tsne"] + +logger.info( + "Writing to %s with compression %s.", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dimred/tsne", + "tag" : "main" + }, + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dimred/tsne/nextflow.config b/target/nextflow/dimred/tsne/nextflow.config new file mode 100644 index 00000000..93f5f2d1 --- /dev/null +++ b/target/nextflow/dimred/tsne/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dimred/tsne' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 't-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique used to visualize high-dimensional data in a low-dimensional space, revealing patterns and clusters by preserving local data similarities.\n' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dimred/tsne/nextflow_labels.config b/target/nextflow/dimred/tsne/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dimred/tsne/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dimred/tsne/nextflow_schema.json b/target/nextflow/dimred/tsne/nextflow_schema.json new file mode 100644 index 00000000..3df75c7f --- /dev/null +++ b/target/nextflow/dimred/tsne/nextflow_schema.json @@ -0,0 +1,137 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "tsne", + "description": "t-SNE (t-Distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique used to visualize high-dimensional data in a low-dimensional space, revealing patterns and clusters by preserving local data similarities.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, required, default: `\"rna\"`. ", + "default": "rna" + }, + "use_rep": { + "type": "string", + "description": "The `.obsm` slot to use as input for the tSNE computation.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"X_pca\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_output": { + "type": "string", + "description": "The .obsm key to use for storing the tSNE results.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_tsne\"`. ", + "default": "X_tsne" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "n_pcs": { + "type": "integer", + "description": "The number of principal components to use for the tSNE computation.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "perplexity": { + "type": "number", + "description": "The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + }, + "min_dist": { + "type": "number", + "description": "The effective minimum distance between embedded points", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "metric": { + "type": "string", + "description": "Distance metric to calculate neighbors on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"euclidean\"`. ", + "default": "euclidean" + }, + "early_exaggeration": { + "type": "number", + "description": "Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them", + "help_text": "Type: `double`, multiple: `False`, default: `12.0`. ", + "default": 12.0 + }, + "learning_rate": { + "type": "number", + "description": "The learning rate for t-SNE optimization", + "help_text": "Type: `double`, multiple: `False`, default: `1000.0`. ", + "default": 1000.0 + }, + "random_state": { + "type": "integer", + "description": "The random seed to use for the tSNE computation.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dimred/tsne/setup_logger.py b/target/nextflow/dimred/tsne/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dimred/tsne/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/dimred/umap/.config.vsh.yaml b/target/nextflow/dimred/umap/.config.vsh.yaml new file mode 100644 index 00000000..7dee67ef --- /dev/null +++ b/target/nextflow/dimred/umap/.config.vsh.yaml @@ -0,0 +1,377 @@ +name: "umap" +namespace: "dimred" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_neighbors" + description: "The `.uns` neighbors slot as output by the `find_neighbors` component." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The pre/postfix under which to store the UMAP results." + info: null + default: + - "umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "double" + name: "--min_dist" + description: "The effective minimum distance between embedded points. Smaller\ + \ values will result in a more clustered/clumped embedding where nearby points\ + \ on the manifold are drawn closer together, while larger values will result\ + \ on a more even dispersal of points. The value should be set relative to the\ + \ spread value, which determines the scale at which embedded points will be\ + \ spread out." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--spread" + description: "The effective scale of embedded points. In combination with `min_dist`\ + \ this determines how clustered/clumped the embedded points are." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_components" + description: "The number of dimensions of the embedding." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_iter" + description: "The number of iterations (epochs) of the optimization. Called `n_epochs`\ + \ in the original UMAP. Default is set to 500 if neighbors['connectivities'].shape[0]\ + \ <= 10000, else 200." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "The initial learning rate for the embedding optimization." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--gamma" + description: "Weighting applied to negative samples in low dimensional embedding\ + \ optimization. Values higher than one will result in greater weight being given\ + \ to negative samples." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--negative_sample_rate" + description: "The number of negative edge/1-simplex samples to use per positive\ + \ edge/1-simplex sample in optimizing the low dimensional embedding." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--init_pos" + description: "How to initialize the low dimensional embedding. Called `init` in\ + \ the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions\ + \ from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`:\ + \ assign initial embedding positions at random.\n" + info: null + default: + - "spectral" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning\ + \ technique suitable for visualizing high-dimensional data. Besides tending to be\ + \ faster than tSNE, it optimizes the embedding such that it best reflects the topology\ + \ of the data, which we represent throughout Scanpy using a neighborhood graph.\ + \ tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in\ + \ the embedding such that these best match the distribution of distances in the\ + \ high-dimensional space. We use the implementation of umap-learn [McInnes18]. For\ + \ a few comparisons of UMAP with tSNE, see this preprint.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/dimred/umap/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/dimred/umap" + executable: "target/nextflow/dimred/umap/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/dimred/umap/compress_h5mu.py b/target/nextflow/dimred/umap/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/dimred/umap/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/dimred/umap/main.nf b/target/nextflow/dimred/umap/main.nf new file mode 100644 index 00000000..6d1b0766 --- /dev/null +++ b/target/nextflow/dimred/umap/main.nf @@ -0,0 +1,4098 @@ +// umap main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "umap", + "namespace" : "dimred", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "The `.uns` neighbors slot as output by the `find_neighbors` component.", + "default" : [ + "neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "The pre/postfix under which to store the UMAP results.", + "default" : [ + "umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "double", + "name" : "--min_dist", + "description" : "The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--spread", + "description" : "The effective scale of embedded points. In combination with `min_dist` this determines how clustered/clumped the embedded points are.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_components", + "description" : "The number of dimensions of the embedding.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_iter", + "description" : "The number of iterations (epochs) of the optimization. Called `n_epochs` in the original UMAP. Default is set to 500 if neighbors['connectivities'].shape[0] <= 10000, else 200.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alpha", + "description" : "The initial learning rate for the embedding optimization.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--gamma", + "description" : "Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--negative_sample_rate", + "description" : "The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--init_pos", + "description" : "How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`: assign initial embedding positions at random.\n", + "default" : [ + "spectral" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/dimred/umap/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/dimred/umap", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import sys +import anndata as ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_neighbors': $( if [ ! -z ${VIASH_PAR_UNS_NEIGHBORS+x} ]; then echo "r'${VIASH_PAR_UNS_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_dist': $( if [ ! -z ${VIASH_PAR_MIN_DIST+x} ]; then echo "float(r'${VIASH_PAR_MIN_DIST//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'spread': $( if [ ! -z ${VIASH_PAR_SPREAD+x} ]; then echo "float(r'${VIASH_PAR_SPREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'num_components': $( if [ ! -z ${VIASH_PAR_NUM_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_iter': $( if [ ! -z ${VIASH_PAR_MAX_ITER+x} ]; then echo "int(r'${VIASH_PAR_MAX_ITER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gamma': $( if [ ! -z ${VIASH_PAR_GAMMA+x} ]; then echo "float(r'${VIASH_PAR_GAMMA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'negative_sample_rate': $( if [ ! -z ${VIASH_PAR_NEGATIVE_SAMPLE_RATE+x} ]; then echo "int(r'${VIASH_PAR_NEGATIVE_SAMPLE_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'init_pos': $( if [ ! -z ${VIASH_PAR_INIT_POS+x} ]; then echo "r'${VIASH_PAR_INIT_POS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing UMAP for modality '%s'", par["modality"]) + +if par["uns_neighbors"] not in data.uns: + raise ValueError( + f"'{par['uns_neighbors']}' was not found in .mod['{par['modality']}'].uns." + ) + +# create temporary AnnData +# ... because sc.tl.umap doesn't allow to choose +# the obsm output slot +# ... also we can see scanpy is a data format dependency hell +neigh_key = par["uns_neighbors"] +temp_uns = {neigh_key: data.uns[neigh_key]} +conn_key = temp_uns[neigh_key]["connectivities_key"] +dist_key = temp_uns[neigh_key]["distances_key"] +temp_obsp = { + conn_key: data.obsp[conn_key], + dist_key: data.obsp[dist_key], +} +pca_key = temp_uns[neigh_key]["params"]["use_rep"] +temp_obsm = {pca_key: data.obsm[pca_key]} + +temp_adata = ad.AnnData(obsm=temp_obsm, obsp=temp_obsp, uns=temp_uns, shape=data.shape) + +sc.tl.umap( + temp_adata, + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + maxiter=par["max_iter"], + alpha=par["alpha"], + gamma=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init_pos=par["init_pos"], + neighbors_key=neigh_key, +) + +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_umap"] + +logger.info("Writing to %s, compression %s", par["output"], par["output_compression"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/dimred/umap", + "tag" : "main" + }, + "label" : [ + "highcpu", + "midmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/dimred/umap/nextflow.config b/target/nextflow/dimred/umap/nextflow.config new file mode 100644 index 00000000..70c2b621 --- /dev/null +++ b/target/nextflow/dimred/umap/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'dimred/umap' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint.\n' + author = 'Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/dimred/umap/nextflow_labels.config b/target/nextflow/dimred/umap/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/dimred/umap/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/dimred/umap/nextflow_schema.json b/target/nextflow/dimred/umap/nextflow_schema.json new file mode 100644 index 00000000..d7159538 --- /dev/null +++ b/target/nextflow/dimred/umap/nextflow_schema.json @@ -0,0 +1,143 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "umap", + "description": "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "uns_neighbors": { + "type": "string", + "description": "The `.uns` neighbors slot as output by the `find_neighbors` component.", + "help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ", + "default": "neighbors" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_output": { + "type": "string", + "description": "The pre/postfix under which to store the UMAP results.", + "help_text": "Type: `string`, multiple: `False`, default: `\"umap\"`. ", + "default": "umap" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "min_dist": { + "type": "number", + "description": "The effective minimum distance between embedded points", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "spread": { + "type": "number", + "description": "The effective scale of embedded points", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "num_components": { + "type": "integer", + "description": "The number of dimensions of the embedding.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "max_iter": { + "type": "integer", + "description": "The number of iterations (epochs) of the optimization", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "alpha": { + "type": "number", + "description": "The initial learning rate for the embedding optimization.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "gamma": { + "type": "number", + "description": "Weighting applied to negative samples in low dimensional embedding optimization", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "negative_sample_rate": { + "type": "integer", + "description": "The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "init_pos": { + "type": "string", + "description": "How to initialize the low dimensional embedding", + "help_text": "Type: `string`, multiple: `False`, default: `\"spectral\"`. ", + "default": "spectral" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/dimred/umap/setup_logger.py b/target/nextflow/dimred/umap/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/dimred/umap/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/download/download_file/.config.vsh.yaml b/target/nextflow/download/download_file/.config.vsh.yaml new file mode 100644 index 00000000..0c104274 --- /dev/null +++ b/target/nextflow/download/download_file/.config.vsh.yaml @@ -0,0 +1,208 @@ +name: "download_file" +namespace: "download" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--input" + description: "URL to a file to download." + info: null + example: + - "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Path where to store output." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--verbose" + alternatives: + - "-v" + description: "Increase verbosity" + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Download a file.\n" +usage: "download_file \\\n --input https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5\ + \ \\\n --output output_rna.h5\n" +test_resources: +- type: "bash_script" + path: "run_test.sh" + is_executable: true +info: null +status: "deprecated" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bash:5.1.16" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/download/download_file/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/download/download_file" + executable: "target/nextflow/download/download_file/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/download/download_file/main.nf b/target/nextflow/download/download_file/main.nf new file mode 100644 index 00000000..87572811 --- /dev/null +++ b/target/nextflow/download/download_file/main.nf @@ -0,0 +1,3835 @@ +// download_file main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "download_file", + "namespace" : "download", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--input", + "description" : "URL to a file to download.", + "example" : [ + "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Path where to store output.", + "example" : [ + "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--verbose", + "alternatives" : [ + "-v" + ], + "description" : "Increase verbosity", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Download a file.\n", + "usage" : "download_file \\\\\n --input https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \\\\\n --output output_rna.h5\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "run_test.sh", + "is_executable" : true + } + ], + "status" : "deprecated", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "bash:5.1.16", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/download/download_file/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/download/download_file", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo "${VIASH_PAR_VERBOSE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_verbose='&'#" ; else echo "# par_verbose="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=() + +if [ "\\$par_verbose" != "true" ]; then + extra_params+=("--quiet") +fi + +wget "\\$par_input" -O "\\$par_output" "\\${extra_params[@]}" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/download/download_file", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/download/download_file/nextflow.config b/target/nextflow/download/download_file/nextflow.config new file mode 100644 index 00000000..49cf6a53 --- /dev/null +++ b/target/nextflow/download/download_file/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'download/download_file' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Download a file.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/download/download_file/nextflow_labels.config b/target/nextflow/download/download_file/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/download/download_file/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/download/download_file/nextflow_schema.json b/target/nextflow/download/download_file/nextflow_schema.json new file mode 100644 index 00000000..f9b9497e --- /dev/null +++ b/target/nextflow/download/download_file/nextflow_schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "download_file", + "description": "Download a file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "description": "URL to a file to download.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path where to store output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5\"`, direction: `output`, example: `\"pbmc_1k_protein_v3_raw_feature_bc_matrix.h5\"`. ", + "default": "$id.$key.output.h5" + }, + "verbose": { + "type": "boolean", + "description": "Increase verbosity", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/feature_annotation/align_query_reference/.config.vsh.yaml b/target/nextflow/feature_annotation/align_query_reference/.config.vsh.yaml new file mode 100644 index 00000000..ea73c57b --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/.config.vsh.yaml @@ -0,0 +1,564 @@ +name: "align_query_reference" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The input (query) data to be labeled. Should be a .h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process. Note that the query and reference modalities\ + \ should be the same." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer in the input (query) data containing raw counts if .X\ + \ is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer_lognormalized" + description: "The layer in the input (query) data containing log normalized counts\ + \ if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the .var column in the input (query) data containing\ + \ gene names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "The name of the .obs column in the input (query) data containing\ + \ batch information.\n" + info: null + example: + - "sample_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "The name of the .obs column in the input (query) data containing\ + \ cell type labels. If not provided, the --unkown_celltype_label will be assigned\ + \ to all observations.\n" + info: null + example: + - "cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "Meta id value to be assigned to the --output_obs_id .obs field of\ + \ the aligned input (query) data.\n" + info: null + default: + - "query" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + description: "Arguments related to the reference dataset." + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train the CellTypist classifiers on. Only\ + \ required if a pre-trained --model is not provided." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer in the reference data containing raw counts if .X is not\ + \ to be used. Data are expected to be processed in the same way as the --input\ + \ query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer_lognormalized" + description: "The layer in the reference data containing log normalized counts\ + \ if .X is not to be used. Data are expected to be processed in the same way\ + \ as the --input query dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The name of the .var column in the reference data containing gene\ + \ names; when no gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch" + description: "The name of the .obs column in the reference data containing batch\ + \ information.\n" + info: null + example: + - "sample_id" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_label" + description: "The name of the .obs column in the reference data containing cell\ + \ type labels. If not provided, the --unkown_celltype_label will be assigned\ + \ to all observations.\n" + info: null + example: + - "cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_id" + description: "Meta id value to be assigned to the --output_obs_id .obs field of\ + \ the aligned reference data.\n" + info: null + default: + - "reference" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output_query" + description: "Aligned query data." + info: null + example: + - "output_query.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_reference" + description: "Aligned reference data." + info: null + example: + - "output_reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Name of the aligned layer containing raw counts in the output query\ + \ and reference datasets." + info: null + default: + - "_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer_lognormalized" + description: "Name of the aligned layer containing log normalized counts in the\ + \ output query and reference datasets." + info: null + default: + - "_log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_gene_names" + description: "Name of the .var column in the output query and reference datasets\ + \ containing the gene names." + info: null + default: + - "_gene_names" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_batch" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the batch information." + info: null + default: + - "_sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_label" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the cell type labels." + info: null + default: + - "_cell_type" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_id" + description: "Name of the .obs column in the output query and reference datasets\ + \ containing the dataset id." + info: null + default: + - "_dataset" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_index" + description: "Name of the .var column to which the .var index of the --input and\ + \ --reference datasets is stored. Only relevant if \"--preserve_var_index\"\ + \ is False." + info: null + default: + - "_ori_var_index" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_common_genes" + description: "Name of the .var column in the output query and reference datasets\ + \ containing the boolean array indicating the common variables." + info: null + default: + - "_common_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + description: "Arguments related to the alignment of the input and reference datasets." + arguments: + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--align_layers_raw_counts" + description: "Whether to align the query and reference layers containing raw counts." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--align_layers_lognormalized_counts" + description: "Whether to align the query and reference layers containing log normalized\ + \ counts." + info: null + direction: "input" + - type: "string" + name: "--unkown_celltype_label" + description: "The label to assign to cells with an unknown cell type.\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite_existing_key" + description: "If set to true and the layer, obs or var key already exists in the\ + \ query/reference file, the key will be overwritten." + info: null + direction: "input" + - type: "boolean_true" + name: "--preserve_var_index" + description: "If set to true, the .var index of the --input and --reference datasets\ + \ will be preserved.\nIf set to false (default behavior), the original .var\ + \ index will be stored in the --output_var_index .var column and the .var index\ + \ will be replaced with the sanitized & aligned gene names.\n" + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "cross_check_genes.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Alignment of a query and reference dataset by:\n* Alignment of layers\n\ + * Harmonization of .obs field names for batch and cell type labels\n* Harmonization\ + \ of .var field name for gene names\n* Sanitation of gene names\n* Cross-checking\ + \ of genes\n* Assignment of an id to the query and reference datasets\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/align_query_reference/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/feature_annotation/align_query_reference" + executable: "target/nextflow/feature_annotation/align_query_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/feature_annotation/align_query_reference/compress_h5mu.py b/target/nextflow/feature_annotation/align_query_reference/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/feature_annotation/align_query_reference/cross_check_genes.py b/target/nextflow/feature_annotation/align_query_reference/cross_check_genes.py new file mode 100644 index 00000000..b4d5779d --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/cross_check_genes.py @@ -0,0 +1,26 @@ +from typing import List + + +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: + """Cross check the overlap between two lists of genes + + Parameters + ---------- + query_genes : List[str] + List of gene names + reference_genes : List[str] + List of gene names + + Returns + ------- + List[str] + List of overlapping genes + """ + common_ens_ids = list(set(reference_genes).intersection(set(query_genes))) + assert len(common_ens_ids) >= min_gene_overlap, ( + f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}." + ) + + return common_ens_ids diff --git a/target/nextflow/feature_annotation/align_query_reference/main.nf b/target/nextflow/feature_annotation/align_query_reference/main.nf new file mode 100644 index 00000000..78fbed28 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/main.nf @@ -0,0 +1,4525 @@ +// align_query_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "align_query_reference", + "namespace" : "feature_annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input (query) data to be labeled. Should be a .h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process. Note that the query and reference modalities should be the same.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer in the input (query) data containing raw counts if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer_lognormalized", + "description" : "The layer in the input (query) data containing log normalized counts if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the .var column in the input (query) data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch", + "description" : "The name of the .obs column in the input (query) data containing batch information.\n", + "example" : [ + "sample_id" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_label", + "description" : "The name of the .obs column in the input (query) data containing cell type labels. If not provided, the --unkown_celltype_label will be assigned to all observations.\n", + "example" : [ + "cell_type" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_id", + "description" : "Meta id value to be assigned to the --output_obs_id .obs field of the aligned input (query) data.\n", + "default" : [ + "query" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "description" : "Arguments related to the reference dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer in the reference data containing raw counts if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer_lognormalized", + "description" : "The layer in the reference data containing log normalized counts if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The name of the .var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_batch", + "description" : "The name of the .obs column in the reference data containing batch information.\n", + "example" : [ + "sample_id" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_label", + "description" : "The name of the .obs column in the reference data containing cell type labels. If not provided, the --unkown_celltype_label will be assigned to all observations.\n", + "example" : [ + "cell_type" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_id", + "description" : "Meta id value to be assigned to the --output_obs_id .obs field of the aligned reference data.\n", + "default" : [ + "reference" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output_query", + "description" : "Aligned query data.", + "example" : [ + "output_query.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_reference", + "description" : "Aligned reference data.", + "example" : [ + "output_reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Name of the aligned layer containing raw counts in the output query and reference datasets.", + "default" : [ + "_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer_lognormalized", + "description" : "Name of the aligned layer containing log normalized counts in the output query and reference datasets.", + "default" : [ + "_log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_gene_names", + "description" : "Name of the .var column in the output query and reference datasets containing the gene names.", + "default" : [ + "_gene_names" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_batch", + "description" : "Name of the .obs column in the output query and reference datasets containing the batch information.", + "default" : [ + "_sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_label", + "description" : "Name of the .obs column in the output query and reference datasets containing the cell type labels.", + "default" : [ + "_cell_type" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_id", + "description" : "Name of the .obs column in the output query and reference datasets containing the dataset id.", + "default" : [ + "_dataset" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_index", + "description" : "Name of the .var column to which the .var index of the --input and --reference datasets is stored. Only relevant if \\"--preserve_var_index\\" is False.", + "default" : [ + "_ori_var_index" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_common_genes", + "description" : "Name of the .var column in the output query and reference datasets containing the boolean array indicating the common variables.", + "default" : [ + "_common_vars" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "description" : "Arguments related to the alignment of the input and reference datasets.", + "arguments" : [ + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--align_layers_raw_counts", + "description" : "Whether to align the query and reference layers containing raw counts.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--align_layers_lognormalized_counts", + "description" : "Whether to align the query and reference layers containing log normalized counts.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--unkown_celltype_label", + "description" : "The label to assign to cells with an unknown cell type.\n", + "default" : [ + "Unknown" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--overwrite_existing_key", + "description" : "If set to true and the layer, obs or var key already exists in the query/reference file, the key will be overwritten.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--preserve_var_index", + "description" : "If set to true, the .var index of the --input and --reference datasets will be preserved.\nIf set to false (default behavior), the original .var index will be stored in the --output_var_index .var column and the .var index will be replaced with the sanitized & aligned gene names.\n", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/cross_check_genes.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Alignment of a query and reference dataset by:\n* Alignment of layers\n* Harmonization of .obs field names for batch and cell type labels\n* Harmonization of .var field name for gene names\n* Sanitation of gene names\n* Cross-checking of genes\n* Assignment of an id to the query and reference datasets\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/feature_annotation/align_query_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/feature_annotation/align_query_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER_LOGNORMALIZED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER_LOGNORMALIZED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_var_gene_names': $( if [ ! -z ${VIASH_PAR_REFERENCE_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_batch': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_label': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_id': $( if [ ! -z ${VIASH_PAR_REFERENCE_ID+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_query': $( if [ ! -z ${VIASH_PAR_OUTPUT_QUERY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_QUERY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_reference': $( if [ ! -z ${VIASH_PAR_OUTPUT_REFERENCE+x} ]; then echo "r'${VIASH_PAR_OUTPUT_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer_lognormalized': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER_LOGNORMALIZED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_gene_names': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_batch': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_label': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_id': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_ID+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_index': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_INDEX+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_common_genes': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_COMMON_GENES+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_COMMON_GENES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_reference_gene_overlap': $( if [ ! -z ${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP+x} ]; then echo "int(r'${VIASH_PAR_INPUT_REFERENCE_GENE_OVERLAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'align_layers_raw_counts': $( if [ ! -z ${VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS+x} ]; then echo "r'${VIASH_PAR_ALIGN_LAYERS_RAW_COUNTS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'align_layers_lognormalized_counts': $( if [ ! -z ${VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS+x} ]; then echo "r'${VIASH_PAR_ALIGN_LAYERS_LOGNORMALIZED_COUNTS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'unkown_celltype_label': $( if [ ! -z ${VIASH_PAR_UNKOWN_CELLTYPE_LABEL+x} ]; then echo "r'${VIASH_PAR_UNKOWN_CELLTYPE_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'overwrite_existing_key': $( if [ ! -z ${VIASH_PAR_OVERWRITE_EXISTING_KEY+x} ]; then echo "r'${VIASH_PAR_OVERWRITE_EXISTING_KEY//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'preserve_var_index': $( if [ ! -z ${VIASH_PAR_PRESERVE_VAR_INDEX+x} ]; then echo "r'${VIASH_PAR_PRESERVE_VAR_INDEX//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression +from cross_check_genes import cross_check_genes + +logger = setup_logger() + + +def copy_layer(adata, input_layer, output_layer, overwrite=False): + if output_layer not in adata.layers.keys() and input_layer: + logger.info(f"Copying layer from \\`{input_layer}\\` to \\`{output_layer}\\`") + adata.layers[output_layer] = adata.layers[input_layer].copy() + + elif output_layer not in adata.layers.keys() and not input_layer: + logger.info(f"Copying layer from .X to \\`{output_layer}\\`") + adata.layers[output_layer] = adata.X.copy() + + else: + if not overwrite: + raise ValueError( + f"Layer \\`{output_layer}\\` already exists. Data can not be copied." + ) + logger.warning(f"Layer \\`{output_layer}\\` already exists. Overwriting data.") + adata.layers[output_layer] = ( + adata.layers[input_layer].copy() if input_layer else adata.X.copy() + ) + + return adata + + +def copy_obs(adata, input_obs_key, output_obs_key, overwrite=False, fill_value=None): + if output_obs_key not in adata.obs.keys() and input_obs_key: + logger.info(f"Copying .obs field from \\`{input_obs_key}\\` to \\`{output_obs_key}\\`") + adata.obs[output_obs_key] = adata.obs[input_obs_key].copy() + + elif output_obs_key not in adata.obs.keys() and not input_obs_key: + logger.info(f"Assigning fill value \\`{fill_value}\\` to .obs {output_obs_key}") + adata.obs[output_obs_key] = fill_value + + else: + if not overwrite: + raise ValueError( + f".obs key \\`{output_obs_key}\\` already exists. Data can not be copied." + ) + + logger.warning(f".obs key \\`{output_obs_key}\\` already exists. Overwriting data.") + adata.obs[output_obs_key] = ( + adata.obs[input_obs_key].copy() if input_obs_key else fill_value + ) + + return adata + + +def copy_and_sanitize_var_gene_names( + adata, + input_var_key, + output_var_key, + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field="ori_var_index", +): + if output_var_key not in adata.var.keys() and input_var_key: + logger.info(f"Copying .var field from \\`{input_var_key}\\` to \\`{output_var_key}\\`") + adata.var[output_var_key] = adata.var[input_var_key].str.replace( + "\\\\\\\\.[0-9]+\\$", "", regex=True + ) + + elif output_var_key not in adata.var.keys() and not input_var_key: + logger.info(f"Copying .var index to \\`{output_var_key}\\`") + adata.var[output_var_key] = adata.var.index.str.replace( + "\\\\\\\\.[0-9]+\\$", "", regex=True + ) + + else: + if not overwrite: + raise ValueError( + f".var key \\`{output_var_key}\\` already exists. Data can not be copied." + ) + + logger.warning(f".var key \\`{output_var_key}\\` already exists. Overwriting data.") + adata.var[output_var_key] = ( + adata.var[input_var_key].str.replace("\\\\\\\\.[0-9]+\\$", "", regex=True) + if input_var_key + else adata.var.index.str.replace("\\\\\\\\.[0-9]+\\$", "", regex=True) + ) + + if not preserve_index: + logger.info("Replacing .var index with sanitized gene names...") + adata.var[var_index_field] = adata.var.index + adata.var.index = list(adata.var[output_var_key]) + + return adata + + +def main(): + logger.info("Reading query and reference data") + input_adata = mu.read_h5ad(par["input"], mod=par["modality"]) + input_modality = input_adata.copy() + + reference_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + reference_modality = reference_adata.copy() + + # Aligning layers + logger.info("### Aligning layers") + + if par["align_layers_lognormalized_counts"] and par["align_layers_raw_counts"]: + if par["input_layer"] == par["input_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the query data can not be identical." + ) + + if par["reference_layer"] == par["reference_layer_lognormalized"]: + raise ValueError( + "Layer names for raw and lognormalized counts in the reference data can not be identical." + ) + + if par["align_layers_raw_counts"]: + logger.info("## Copying query layer raw counts...") + input_modality = copy_layer( + input_modality, + par["input_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layer raw counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer"], + par["output_layer"], + overwrite=par["overwrite_existing_key"], + ) + + if par["align_layers_lognormalized_counts"]: + logger.info("## Copying query layer lognormalized counts...") + input_modality = copy_layer( + input_modality, + par["input_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + logger.info("## Copying reference layerlognormalized counts...") + reference_modality = copy_layer( + reference_modality, + par["reference_layer_lognormalized"], + par["output_layer_lognormalized"], + overwrite=par["overwrite_existing_key"], + ) + + # Aligning batch labels + logger.info("### Aligning batch labels") + logger.info("## Copying query .obs batch field...") + input_modality = copy_obs( + input_modality, + par["input_obs_batch"], + par["output_obs_batch"], + par["overwrite_existing_key"], + fill_value=None, + ) + logger.info("## Copying reference .obs batch field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_batch"], + par["output_obs_batch"], + overwrite=par["overwrite_existing_key"], + fill_value=None, + ) + + # Aligning cell type labels + logger.info("### Aligning cell type labels") + logger.info("## Copying query .obs cell type label field...") + input_modality = copy_obs( + input_modality, + par["input_obs_label"], + par["output_obs_label"], + par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + logger.info("## Copying reference .obs cell type label field...") + reference_modality = copy_obs( + reference_modality, + par["reference_obs_label"], + par["output_obs_label"], + overwrite=par["overwrite_existing_key"], + fill_value=par["unkown_celltype_label"], + ) + + # Aligning and sanitizing gene names + logger.info("### Aligning and sanitizing gene names") + logger.info("## Copying query .var gene names field...") + input_modality = copy_and_sanitize_var_gene_names( + input_modality, + par["input_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + logger.info("## Copying reference .var gene names field...") + reference_modality = copy_and_sanitize_var_gene_names( + reference_modality, + par["reference_var_gene_names"], + par["output_var_gene_names"], + overwrite=par["overwrite_existing_key"], + preserve_index=par["preserve_var_index"], + var_index_field=par["output_var_index"], + ) + + # Cross check genes + logger.info("### Cross checking genes") + common_vars = cross_check_genes( + input_modality.var[par["output_var_gene_names"]], + reference_modality.var[par["output_var_gene_names"]], + min_gene_overlap=par["input_reference_gene_overlap"], + ) + + # Add common vars to the output + input_modality.var[par["output_var_common_genes"]] = input_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + reference_modality.var[par["output_var_common_genes"]] = reference_modality.var[ + par["output_var_gene_names"] + ].isin(common_vars) + + # Adding an id to the query and reference datasets + logger.info("### Adding an id to the query and reference datasets") + logger.info( + f"## Adding \\`{par['input_id']}\\` to the .obs \\`{par['output_obs_id']}\\` field of the query dataset..." + ) + input_modality.obs[par["output_obs_id"]] = par["input_id"] + logger.info( + f"## Adding \\`{par['reference_id']}\\` to the .obs \\`{par['output_obs_id']}\\` field of the reference dataset..." + ) + reference_modality.obs[par["output_obs_id"]] = par["reference_id"] + + # Writing output data + logger.info("Writing output data") + write_h5ad_to_h5mu_with_compression( + par["output_query"], + par["input"], + par["modality"], + input_modality, + par["output_compression"], + ) + + write_h5ad_to_h5mu_with_compression( + par["output_reference"], + par["reference"], + par["modality"], + reference_modality, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/feature_annotation/align_query_reference", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/feature_annotation/align_query_reference/nextflow.config b/target/nextflow/feature_annotation/align_query_reference/nextflow.config new file mode 100644 index 00000000..8e32c4d9 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'feature_annotation/align_query_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Alignment of a query and reference dataset by:\n* Alignment of layers\n* Harmonization of .obs field names for batch and cell type labels\n* Harmonization of .var field name for gene names\n* Sanitation of gene names\n* Cross-checking of genes\n* Assignment of an id to the query and reference datasets\n' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/feature_annotation/align_query_reference/nextflow_labels.config b/target/nextflow/feature_annotation/align_query_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/feature_annotation/align_query_reference/nextflow_schema.json b/target/nextflow/feature_annotation/align_query_reference/nextflow_schema.json new file mode 100644 index 00000000..57d08fa8 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/nextflow_schema.json @@ -0,0 +1,253 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "align_query_reference", + "description": "Alignment of a query and reference dataset by:\n* Alignment of layers\n* Harmonization of .obs field names for batch and cell type labels\n* Harmonization of .var field name for gene names\n* Sanitation of gene names\n* Cross-checking of genes\n* Assignment of an id to the query and reference datasets\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Input dataset (query) arguments", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input (query) data to be labeled", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer in the input (query) data containing raw counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_layer_lognormalized": { + "type": "string", + "description": "The layer in the input (query) data containing log normalized counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the .var column in the input (query) data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_batch": { + "type": "string", + "description": "The name of the .obs column in the input (query) data containing batch information.\n", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample_id\"`. " + }, + "input_obs_label": { + "type": "string", + "description": "The name of the .obs column in the input (query) data containing cell type labels", + "help_text": "Type: `string`, multiple: `False`, example: `\"cell_type\"`. " + }, + "input_id": { + "type": "string", + "description": "Meta id value to be assigned to the --output_obs_id .obs field of the aligned input (query) data.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"query\"`. ", + "default": "query" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output_query": { + "type": "string", + "format": "path", + "description": "Aligned query data.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_query.h5mu\"`, direction: `output`, example: `\"output_query.h5mu\"`. ", + "default": "$id.$key.output_query.h5mu" + }, + "output_reference": { + "type": "string", + "format": "path", + "description": "Aligned reference data.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_reference.h5mu\"`, direction: `output`, example: `\"output_reference.h5mu\"`. ", + "default": "$id.$key.output_reference.h5mu" + }, + "output_layer": { + "type": "string", + "description": "Name of the aligned layer containing raw counts in the output query and reference datasets.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_counts\"`. ", + "default": "_counts" + }, + "output_layer_lognormalized": { + "type": "string", + "description": "Name of the aligned layer containing log normalized counts in the output query and reference datasets.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_log_normalized\"`. ", + "default": "_log_normalized" + }, + "output_var_gene_names": { + "type": "string", + "description": "Name of the .var column in the output query and reference datasets containing the gene names.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_gene_names\"`. ", + "default": "_gene_names" + }, + "output_obs_batch": { + "type": "string", + "description": "Name of the .obs column in the output query and reference datasets containing the batch information.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_sample_id\"`. ", + "default": "_sample_id" + }, + "output_obs_label": { + "type": "string", + "description": "Name of the .obs column in the output query and reference datasets containing the cell type labels.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_cell_type\"`. ", + "default": "_cell_type" + }, + "output_obs_id": { + "type": "string", + "description": "Name of the .obs column in the output query and reference datasets containing the dataset id.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_dataset\"`. ", + "default": "_dataset" + }, + "output_var_index": { + "type": "string", + "description": "Name of the .var column to which the .var index of the --input and --reference datasets is stored", + "help_text": "Type: `string`, multiple: `False`, default: `\"_ori_var_index\"`. ", + "default": "_ori_var_index" + }, + "output_var_common_genes": { + "type": "string", + "description": "Name of the .var column in the output query and reference datasets containing the boolean array indicating the common variables.", + "help_text": "Type: `string`, multiple: `False`, default: `\"_common_vars\"`. ", + "default": "_common_vars" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "Arguments related to the alignment of the input and reference datasets.", + "properties": { + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "align_layers_raw_counts": { + "type": "boolean", + "description": "Whether to align the query and reference layers containing raw counts.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "align_layers_lognormalized_counts": { + "type": "boolean", + "description": "Whether to align the query and reference layers containing log normalized counts.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "unkown_celltype_label": { + "type": "string", + "description": "The label to assign to cells with an unknown cell type.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"Unknown\"`. ", + "default": "Unknown" + }, + "overwrite_existing_key": { + "type": "boolean", + "description": "If set to true and the layer, obs or var key already exists in the query/reference file, the key will be overwritten.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "preserve_var_index": { + "type": "boolean", + "description": "If set to true, the .var index of the --input and --reference datasets will be preserved.\nIf set to false (default behavior), the original .var index will be stored in the --output_var_index .var column and the .var index will be replaced with the sanitized & aligned gene names.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "Arguments related to the reference dataset.", + "properties": { + "reference": { + "type": "string", + "format": "path", + "description": "The reference data to train the CellTypist classifiers on", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer in the reference data containing raw counts if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_layer_lognormalized": { + "type": "string", + "description": "The layer in the reference data containing log normalized counts if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The name of the .var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_batch": { + "type": "string", + "description": "The name of the .obs column in the reference data containing batch information.\n", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample_id\"`. " + }, + "reference_obs_label": { + "type": "string", + "description": "The name of the .obs column in the reference data containing cell type labels", + "help_text": "Type: `string`, multiple: `False`, example: `\"cell_type\"`. " + }, + "reference_id": { + "type": "string", + "description": "Meta id value to be assigned to the --output_obs_id .obs field of the aligned reference data.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"reference\"`. ", + "default": "reference" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/feature_annotation/align_query_reference/setup_logger.py b/target/nextflow/feature_annotation/align_query_reference/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/feature_annotation/align_query_reference/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml b/target/nextflow/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml new file mode 100644 index 00000000..c045445c --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "highly_variable_features_scanpy" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use adata.layers[layer] for expression values instead of adata.X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "If specified, use boolean array in adata.var[var_input] to calculate\ + \ hvg on subset of vars.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ observations should be filtered out." + info: null + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--varm_name" + description: "In which .varm slot to store additional metadata." + info: null + default: + - "hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--flavor" + description: "Choose the flavor for identifying highly variable features. For\ + \ the dispersion based methods\nin their default workflows, Seurat passes the\ + \ cutoffs whereas Cell Ranger passes n_top_features.\n" + info: null + default: + - "seurat" + required: false + choices: + - "seurat" + - "cell_ranger" + - "seurat_v3" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_top_features" + description: "Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_mean" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 0.0125 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_mean" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 3.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_disp" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_disp" + description: "If n_top_features is defined, this and all other cutoffs for the\ + \ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.\ + \ Default is +inf." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--span" + description: "The fraction of the data (cells) used when estimating the variance\ + \ in the loess model fit if flavor='seurat_v3'." + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_bins" + description: "Number of bins for binning the mean feature expression. Normalization\ + \ is done with respect to each bin. If just a single feature falls into a bin,\ + \ the normalized dispersion is artificially set to 1." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_key" + description: "If specified, highly-variable features are selected within each\ + \ batch separately and merged. This simple \nprocess avoids the selection of\ + \ batch-specific features and acts as a lightweight batch correction method.\ + \ \nFor all flavors, features are first sorted by how many batches they are\ + \ a HVG. For dispersion-based flavors \nties are broken by normalized dispersion.\ + \ If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank\ + \ based on within-batch normalized variance.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\ + \nExpects logarithmized data, except when flavor='seurat_v3' in which count data\ + \ is expected.\n\nDepending on flavor, this reproduces the R-implementations of\ + \ Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the\ + \ dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion\ + \ is obtained by scaling with the mean and standard deviation of the dispersions\ + \ for features falling into a given bin for mean expression of features. This means\ + \ that for each bin of mean expression, highly variable features are selected.\n\ + \nFor [Stuart19], a normalized variance for each feature is computed. First, the\ + \ data are standardized (i.e., z-score normalization per feature) with a regularized\ + \ standard deviation. Next, the normalized variance is computed as the variance\ + \ of each feature after the transformation. Features are ranked by the normalized\ + \ variance.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scikit-misc" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/feature_annotation/highly_variable_features_scanpy" + executable: "target/nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py b/target/nextflow/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/main.nf b/target/nextflow/feature_annotation/highly_variable_features_scanpy/main.nf new file mode 100644 index 00000000..297de14e --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/main.nf @@ -0,0 +1,4236 @@ +// highly_variable_features_scanpy main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (contributor) +// * Robrecht Cannoodt (maintainer, contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "highly_variable_features_scanpy", + "namespace" : "feature_annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use adata.layers[layer] for expression values instead of adata.X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "If specified, use boolean array in adata.var[var_input] to calculate hvg on subset of vars.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_filter", + "description" : "In which .var slot to store a boolean array corresponding to which observations should be filtered out.", + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--varm_name", + "description" : "In which .varm slot to store additional metadata.", + "default" : [ + "hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--flavor", + "description" : "Choose the flavor for identifying highly variable features. For the dispersion based methods\nin their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features.\n", + "default" : [ + "seurat" + ], + "required" : false, + "choices" : [ + "seurat", + "cell_ranger", + "seurat_v3" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_top_features", + "description" : "Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_mean", + "description" : "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.", + "default" : [ + 0.0125 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_mean", + "description" : "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.", + "default" : [ + 3.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_disp", + "description" : "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_disp", + "description" : "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. Default is +inf.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--span", + "description" : "The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'.", + "default" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_bins", + "description" : "Number of bins for binning the mean feature expression. Normalization is done with respect to each bin. If just a single feature falls into a bin, the normalized dispersion is artificially set to 1.", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch_key", + "description" : "If specified, highly-variable features are selected within each batch separately and merged. This simple \nprocess avoids the selection of batch-specific features and acts as a lightweight batch correction method. \nFor all flavors, features are first sorted by how many batches they are a HVG. For dispersion-based flavors \nties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank based on within-batch normalized variance.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\nExpects logarithmized data, except when flavor='seurat_v3' in which count data is expected.\n\nDepending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.\n\nFor [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "scikit-misc" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/feature_annotation/highly_variable_features_scanpy", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys +import re + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_name_filter': $( if [ ! -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_NAME_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varm_name': $( if [ ! -z ${VIASH_PAR_VARM_NAME+x} ]; then echo "r'${VIASH_PAR_VARM_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'flavor': $( if [ ! -z ${VIASH_PAR_FLAVOR+x} ]; then echo "r'${VIASH_PAR_FLAVOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_top_features': $( if [ ! -z ${VIASH_PAR_N_TOP_FEATURES+x} ]; then echo "int(r'${VIASH_PAR_N_TOP_FEATURES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_mean': $( if [ ! -z ${VIASH_PAR_MIN_MEAN+x} ]; then echo "float(r'${VIASH_PAR_MIN_MEAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_mean': $( if [ ! -z ${VIASH_PAR_MAX_MEAN+x} ]; then echo "float(r'${VIASH_PAR_MAX_MEAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_disp': $( if [ ! -z ${VIASH_PAR_MIN_DISP+x} ]; then echo "float(r'${VIASH_PAR_MIN_DISP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_disp': $( if [ ! -z ${VIASH_PAR_MAX_DISP+x} ]; then echo "float(r'${VIASH_PAR_MAX_DISP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'span': $( if [ ! -z ${VIASH_PAR_SPAN+x} ]; then echo "float(r'${VIASH_PAR_SPAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_bins': $( if [ ! -z ${VIASH_PAR_N_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_BINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'obs_batch_key': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_KEY+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() +modality_name = par["modality"] +data = mu.read_h5ad(par["input"], mod=modality_name) +assert data.var_names.is_unique, "Expected var_names of input modality to be unique." + +logger.info("Processing modality '%s'", modality_name) + +if par["layer"] and par["layer"] not in data.layers: + raise ValueError( + f"Layer '{par['layer']}' not found in layers for modality '{modality_name}'. " + f"Found layers are: {','.join(data.layers)}" + ) + +# input layer argument does not work when batch_key is specified because +# it still uses .X to filter out genes with 0 counts, even if .X might not exist. +# So create a custom anndata as input that always uses .X +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] +obs = pd.DataFrame(index=data.obs_names.copy()) +var = pd.DataFrame(index=data.var_names.copy()) +if par["obs_batch_key"]: + obs = data.obs.loc[:, par["obs_batch_key"]].to_frame() +input_anndata = ad.AnnData(X=input_layer.copy(), obs=obs, var=var) +if "log1p" in data.uns: + input_anndata.uns["log1p"] = data.uns["log1p"] + +# Workaround for issue +# https://github.com/scverse/scanpy/issues/2239 +# https://github.com/scverse/scanpy/issues/2181 +if par["flavor"] != "seurat_v3": + # This component requires log normalized data when flavor is not seurat_v3 + # We assume that the data is correctly normalized but scanpy will look at + # .uns to check the transformations performed on the data. + # To prevent scanpy from automatically tranforming the counts when they are + # already transformed, we set the appropriate values to .uns. + if "log1p" not in input_anndata.uns: + logger.warning( + "When flavor is not set to 'seurat_v3', " + "the input data for this component must be log-transformed. " + "However, the 'log1p' dictionairy in .uns has not been set. " + "This is fine if you did not log transform your data with scanpy." + "Otherwise, please check if you are providing log transformed " + "data using --layer." + ) + input_anndata.uns["log1p"] = {"base": None} + elif "log1p" in input_anndata.uns and "base" not in input_anndata.uns["log1p"]: + input_anndata.uns["log1p"]["base"] = None + +# Enable calculating the HVG only on a subset of vars +# e.g for cell type annotation, only calculate HVG on variables that are common between query and reference +if par["var_input"]: + input_anndata.var[par["var_input"]] = data.var[par["var_input"]] + input_anndata = subset_vars(input_anndata, par["var_input"]) + +logger.info("\\\\tUnfiltered data: %s", data) + +logger.info("\\\\tComputing hvg") +# construct arguments +hvg_args = { + "adata": input_anndata, + "n_top_genes": par["n_top_features"], + "min_mean": par["min_mean"], + "max_mean": par["max_mean"], + "min_disp": par["min_disp"], + "span": par["span"], + "n_bins": par["n_bins"], + "flavor": par["flavor"], + "subset": False, + "inplace": False, + "layer": None, # Always uses .X because the input layer was already handled +} + +optional_parameters = { + "max_disp": "max_disp", + "obs_batch_key": "batch_key", + "n_top_genes": "n_top_features", +} +# only add parameter if it's passed +for par_name, dest_name in optional_parameters.items(): + if par.get(par_name): + hvg_args[dest_name] = par[par_name] + +# scanpy does not do this check, although it is stated in the documentation +if par["flavor"] == "seurat_v3" and not par["n_top_features"]: + raise ValueError( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'." + ) + +# call function +try: + out = sc.pp.highly_variable_genes(**hvg_args) + if par["var_input"] is not None: + out.index = input_anndata.var.index + out = out.reindex(index=data.var.index, method=None) + out.highly_variable = out.highly_variable.fillna(False) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + elif par["obs_batch_key"] is not None: + out = out.reindex(index=data.var.index, method=None) + assert (out.index == data.var.index).all(), ( + "Expected output index values to be equivalent to the input index" + ) + +except ValueError as err: + if str(err) == "cannot specify integer \\`bins\\` when input data contains infinity": + err.args = ( + "Cannot specify integer \\`bins\\` when input data contains infinity. " + "Perhaps input data has not been log normalized?", + ) + if re.search("Bin edges must be unique:", str(err)): + raise RuntimeError( + "Scanpy failed to calculate hvg. The error " + "returned by scanpy (see above) could be the " + "result from trying to use this component on unfiltered data." + ) from err + raise err + +out.index = data.var.index +logger.info("\\\\tStoring output into .var") +if par.get("var_name_filter", None) is not None: + data.var[par["var_name_filter"]] = out["highly_variable"] + +if par.get("varm_name", None) is not None and "mean_bin" in out: + # drop mean_bin as mudata/anndata doesn't support tuples + data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) + +logger.info("Writing h5mu to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], modality_name, data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/feature_annotation/highly_variable_features_scanpy", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow.config b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow.config new file mode 100644 index 00000000..609ef1de --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'feature_annotation/highly_variable_features_scanpy' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\nExpects logarithmized data, except when flavor=\'seurat_v3\' in which count data is expected.\n\nDepending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.\n\nFor [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_schema.json b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_schema.json new file mode 100644 index 00000000..83dab75b --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/nextflow_schema.json @@ -0,0 +1,142 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "highly_variable_features_scanpy", + "description": "Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\nExpects logarithmized data, except when flavor='seurat_v3' in which count data is expected.\n\nDepending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.\n\nFor [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "use adata.layers[layer] for expression values instead of adata.X.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": "If specified, use boolean array in adata.var[var_input] to calculate hvg on subset of vars.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "var_name_filter": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to which observations should be filtered out.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_hvg\"`. ", + "default": "filter_with_hvg" + }, + "varm_name": { + "type": "string", + "description": "In which .varm slot to store additional metadata.", + "help_text": "Type: `string`, multiple: `False`, default: `\"hvg\"`. ", + "default": "hvg" + }, + "flavor": { + "type": "string", + "description": "Choose the flavor for identifying highly variable features", + "help_text": "Type: `string`, multiple: `False`, default: `\"seurat\"`, choices: ``seurat`, `cell_ranger`, `seurat_v3``. ", + "enum": [ + "seurat", + "cell_ranger", + "seurat_v3" + ], + "default": "seurat" + }, + "n_top_features": { + "type": "integer", + "description": "Number of highly-variable features to keep", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_mean": { + "type": "number", + "description": "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored", + "help_text": "Type: `double`, multiple: `False`, default: `0.0125`. ", + "default": 0.0125 + }, + "max_mean": { + "type": "number", + "description": "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored", + "help_text": "Type: `double`, multiple: `False`, default: `3.0`. ", + "default": 3.0 + }, + "min_disp": { + "type": "number", + "description": "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "max_disp": { + "type": "number", + "description": "If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored", + "help_text": "Type: `double`, multiple: `False`. " + }, + "span": { + "type": "number", + "description": "The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'.", + "help_text": "Type: `double`, multiple: `False`, default: `0.3`. ", + "default": 0.3 + }, + "n_bins": { + "type": "integer", + "description": "Number of bins for binning the mean feature expression", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "obs_batch_key": { + "type": "string", + "description": "If specified, highly-variable features are selected within each batch separately and merged", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/setup_logger.py b/target/nextflow/feature_annotation/highly_variable_features_scanpy/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/feature_annotation/highly_variable_features_scanpy/subset_vars.py b/target/nextflow/feature_annotation/highly_variable_features_scanpy/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/feature_annotation/highly_variable_features_scanpy/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml new file mode 100644 index 00000000..9ed0c3f8 --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/.config.vsh.yaml @@ -0,0 +1,446 @@ +name: "score_genes_cell_cycle_scanpy" +namespace: "feature_annotation" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object containing normalized expression values.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the column in the var attribute of the adata object\ + \ that contains the gene names (symbols).\nIf not provided, the index of the\ + \ var attribute will be used.\n" + info: null + example: + - "gene_names" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Gene list inputs" + description: "The gene list inputs can be provided as a list of gene symbols or\ + \ as a file containing a list of gene symbols. The gene list file should be formatted\ + \ as a single column with gene symbols.\n\nMake sure that the gene list inputs\ + \ are consistent with the gene names in the adata object as provided by the --var_gene_names\ + \ argument.\n" + arguments: + - type: "string" + name: "--s_genes" + description: "List of gene symbols for scoring s phase genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--s_genes_file" + description: "Path to a .txt file containing the gene list of s phase genes to\ + \ be scored. \nThe gene list file should be formatted as a single column with\ + \ gene symbols.\n" + info: null + example: + - "s_gene_list.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--g2m_genes" + description: "List of gene symbols for scoring g2m phase genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--g2m_genes_file" + description: "Path to a .txt file containing the gene list of g2m phase genes\ + \ to be scored. \nThe gene list file should be formatted as a single column\ + \ with gene symbols.\n" + info: null + example: + - "g2m_gene_list.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_pool" + description: "List of gene symbols for sampling the reference set. Default is\ + \ all genes.\n" + info: null + example: + - "gene1" + - "gene2" + - "gene3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_pool_file" + description: "File with genes for sampling the reference set. Default is all genes.\ + \ \nThe gene pool file should be formatted as a single column with gene symbols.\n" + info: null + example: + - "gene_pool.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file\n" + info: null + example: + - "output_file.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_phase" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the cell cycle phase annotation.\n" + info: null + default: + - "phase" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_s_score" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the s phase score.\n" + info: null + default: + - "S_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_g2m_score" + description: "The name of the column in the obs attribute of the adata object\ + \ that will store the g2m phase score.\n" + info: null + default: + - "G2M_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--n_bins" + description: "Number of expression level bins for sampling.\n" + info: null + default: + - 25 + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--random_state" + description: "The random seed for sampling.\n" + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--allow_missing_genes" + description: "If true, missing genes in the gene list will be ignored.\n" + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Calculates the score associated to S phase and G2M phase and annotates\ + \ the cell cycle phase for each cell, as implemented by scanpy. \nThe score is the\ + \ average expression of a set of genes subtracted with the average expression of\ + \ a reference set of genes.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy" + executable: "target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/helper.py b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/helper.py new file mode 100644 index 00000000..fe559658 --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/helper.py @@ -0,0 +1,42 @@ +from typing import List, Dict, Any, Optional + + +def read_gene_list( + par: Dict[str, Any], + gene_names: List[str], + list_key: str, + file_key: str, + required: bool = True, +) -> Optional[List[str]]: + """ + Reads a gene list from the parameters and returns it as a list of strings. + """ + + # check whether one or the other was provided, if required + if required and not par[list_key] and not par[file_key]: + raise ValueError(f"Either --{list_key} or --{file_key} must be set") + + # read gene list from parameters + list_of_genes = par[list_key] if par[list_key] else [] + + # read gene list from file + if par[file_key]: + with open(par[file_key]) as file: + file_genes = [x.strip() for x in file] + list_of_genes.extend(file_genes) + + # check for missing genes + if not par["allow_missing_genes"] and list_of_genes: + missing = set(list_of_genes).difference(gene_names) + if missing: + raise ValueError( + f"The follow genes are missing from the input dataset: {missing}" + ) + + # return gene list + if list_of_genes: + return list_of_genes + elif required: + raise ValueError(f"No genes detected in --{list_key} or --{file_key}") + else: + return None diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/main.nf b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/main.nf new file mode 100644 index 00000000..1870a87b --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/main.nf @@ -0,0 +1,4207 @@ +// score_genes_cell_cycle_scanpy main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author) +// * Dorien Roosen (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "score_genes_cell_cycle_scanpy", + "namespace" : "feature_annotation", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input_file.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the adata object containing normalized expression values.\nIf not provided, the X attribute of the adata object will be used.\n", + "example" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : "The name of the column in the var attribute of the adata object that contains the gene names (symbols).\nIf not provided, the index of the var attribute will be used.\n", + "example" : [ + "gene_names" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Gene list inputs", + "description" : "The gene list inputs can be provided as a list of gene symbols or as a file containing a list of gene symbols. The gene list file should be formatted as a single column with gene symbols.\n\nMake sure that the gene list inputs are consistent with the gene names in the adata object as provided by the --var_gene_names argument.\n", + "arguments" : [ + { + "type" : "string", + "name" : "--s_genes", + "description" : "List of gene symbols for scoring s phase genes.\n", + "example" : [ + "gene1", + "gene2", + "gene3" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--s_genes_file", + "description" : "Path to a .txt file containing the gene list of s phase genes to be scored. \nThe gene list file should be formatted as a single column with gene symbols.\n", + "example" : [ + "s_gene_list.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--g2m_genes", + "description" : "List of gene symbols for scoring g2m phase genes.\n", + "example" : [ + "gene1", + "gene2", + "gene3" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--g2m_genes_file", + "description" : "Path to a .txt file containing the gene list of g2m phase genes to be scored. \nThe gene list file should be formatted as a single column with gene symbols.\n", + "example" : [ + "g2m_gene_list.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gene_pool", + "description" : "List of gene symbols for sampling the reference set. Default is all genes.\n", + "example" : [ + "gene1", + "gene2", + "gene3" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--gene_pool_file", + "description" : "File with genes for sampling the reference set. Default is all genes. \nThe gene pool file should be formatted as a single column with gene symbols.\n", + "example" : [ + "gene_pool.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file\n", + "example" : [ + "output_file.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_phase", + "description" : "The name of the column in the obs attribute of the adata object that will store the cell cycle phase annotation.\n", + "default" : [ + "phase" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_s_score", + "description" : "The name of the column in the obs attribute of the adata object that will store the s phase score.\n", + "default" : [ + "S_score" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_g2m_score", + "description" : "The name of the column in the obs attribute of the adata object that will store the g2m phase score.\n", + "default" : [ + "G2M_score" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_bins", + "description" : "Number of expression level bins for sampling.\n", + "default" : [ + 25 + ], + "required" : false, + "min" : 0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--random_state", + "description" : "The random seed for sampling.\n", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--allow_missing_genes", + "description" : "If true, missing genes in the gene list will be ignored.\n", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../score_genes_scanpy/helper.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Calculates the score associated to S phase and G2M phase and annotates the cell cycle phase for each cell, as implemented by scanpy. \nThe score is the average expression of a set of genes subtracted with the average expression of a reference set of genes.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + }, + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy~=1.10.4", + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/feature_annotation/score_genes_cell_cycle_scanpy/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import anndata as ad +import pandas as pd +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 's_genes': $( if [ ! -z ${VIASH_PAR_S_GENES+x} ]; then echo "r'${VIASH_PAR_S_GENES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 's_genes_file': $( if [ ! -z ${VIASH_PAR_S_GENES_FILE+x} ]; then echo "r'${VIASH_PAR_S_GENES_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'g2m_genes': $( if [ ! -z ${VIASH_PAR_G2M_GENES+x} ]; then echo "r'${VIASH_PAR_G2M_GENES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'g2m_genes_file': $( if [ ! -z ${VIASH_PAR_G2M_GENES_FILE+x} ]; then echo "r'${VIASH_PAR_G2M_GENES_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'gene_pool': $( if [ ! -z ${VIASH_PAR_GENE_POOL+x} ]; then echo "r'${VIASH_PAR_GENE_POOL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'gene_pool_file': $( if [ ! -z ${VIASH_PAR_GENE_POOL_FILE+x} ]; then echo "r'${VIASH_PAR_GENE_POOL_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_phase': $( if [ ! -z ${VIASH_PAR_OBS_PHASE+x} ]; then echo "r'${VIASH_PAR_OBS_PHASE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_s_score': $( if [ ! -z ${VIASH_PAR_OBS_S_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_S_SCORE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_g2m_score': $( if [ ! -z ${VIASH_PAR_OBS_G2M_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_G2M_SCORE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_bins': $( if [ ! -z ${VIASH_PAR_N_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_BINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'random_state': $( if [ ! -z ${VIASH_PAR_RANDOM_STATE+x} ]; then echo "int(r'${VIASH_PAR_RANDOM_STATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'allow_missing_genes': $( if [ ! -z ${VIASH_PAR_ALLOW_MISSING_GENES+x} ]; then echo "r'${VIASH_PAR_ALLOW_MISSING_GENES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# import helper functions +sys.path.append(meta["resources_dir"]) +from helper import read_gene_list + +# read data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] + +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) +gene_names = pd.Series(input_adata.var_names, index=gene_names_index) + +# check if var index is unique +# input.var[par["var_gene_names"]] is mapped to var index, but may not contain unique values +if not input_adata.var.index.is_unique: + raise ValueError("var index is not unique") + +# read gene lists +s_genes = read_gene_list(par, gene_names.index, "s_genes", "s_genes_file") +g2m_genes = read_gene_list(par, gene_names.index, "g2m_genes", "g2m_genes_file") +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) + +# find matching index names for given genes +g2m_index = gene_names.loc[g2m_genes].tolist() +s_index = gene_names.loc[s_genes].tolist() +gene_pool_index = gene_names.loc[gene_pool].tolist() if gene_pool else None + +# create input data for scanpy +if par["input_layer"]: + X_data = input_adata.layers[par["input_layer"]].copy() +else: + X_data = input_adata.X.copy() +adata_scanpy = ad.AnnData( + X=X_data, + obs=pd.DataFrame(index=input_adata.obs.index), + var=pd.DataFrame(index=input_adata.var.index), +) + +# run score_genes_cell_cycle +sc.tl.score_genes_cell_cycle( + adata_scanpy, + s_genes=s_index, + g2m_genes=g2m_index, + gene_pool=gene_pool_index, + n_bins=par["n_bins"], + random_state=par["random_state"], +) + +# copy results to mudata +output_slot_mapping = { + par["obs_s_score"]: "S_score", + par["obs_g2m_score"]: "G2M_score", + par["obs_phase"]: "phase", +} +assert all(adata_scanpy.obs.index == input_adata.obs.index), ( + "index mismatch between input adata and scanpy output adata" +) +for dest, orig in output_slot_mapping.items(): + input_adata.obs[dest] = adata_scanpy.obs[orig] + +# write output to mudata +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/feature_annotation/score_genes_cell_cycle_scanpy", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow.config b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow.config new file mode 100644 index 00000000..02f134f1 --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'feature_annotation/score_genes_cell_cycle_scanpy' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Calculates the score associated to S phase and G2M phase and annotates the cell cycle phase for each cell, as implemented by scanpy. \nThe score is the average expression of a set of genes subtracted with the average expression of a reference set of genes.\n' + author = 'Robrecht Cannoodt, Dorien Roosen, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_schema.json b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_schema.json new file mode 100644 index 00000000..2a3b351f --- /dev/null +++ b/target/nextflow/feature_annotation/score_genes_cell_cycle_scanpy/nextflow_schema.json @@ -0,0 +1,182 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "score_genes_cell_cycle_scanpy", + "description": "Calculates the score associated to S phase and G2M phase and annotates the cell cycle phase for each cell, as implemented by scanpy. \nThe score is the average expression of a set of genes subtracted with the average expression of a reference set of genes.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input_file.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer of the adata object containing normalized expression values.\nIf not provided, the X attribute of the adata object will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"log_normalized\"`. " + }, + "var_gene_names": { + "type": "string", + "description": "The name of the column in the var attribute of the adata object that contains the gene names (symbols).\nIf not provided, the index of the var attribute will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_names\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output_file.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obs_phase": { + "type": "string", + "description": "The name of the column in the obs attribute of the adata object that will store the cell cycle phase annotation.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"phase\"`. ", + "default": "phase" + }, + "obs_s_score": { + "type": "string", + "description": "The name of the column in the obs attribute of the adata object that will store the s phase score.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"S_score\"`. ", + "default": "S_score" + }, + "obs_g2m_score": { + "type": "string", + "description": "The name of the column in the obs attribute of the adata object that will store the g2m phase score.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"G2M_score\"`. ", + "default": "G2M_score" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "n_bins": { + "type": "integer", + "description": "Number of expression level bins for sampling.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `25`. ", + "default": 25 + }, + "random_state": { + "type": "integer", + "description": "The random seed for sampling.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "allow_missing_genes": { + "type": "boolean", + "description": "If true, missing genes in the gene list will be ignored.\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "gene list inputs": { + "title": "Gene list inputs", + "type": "object", + "description": "The gene list inputs can be provided as a list of gene symbols or as a file containing a list of gene symbols. The gene list file should be formatted as a single column with gene symbols.\n\nMake sure that the gene list inputs are consistent with the gene names in the adata object as provided by the --var_gene_names argument.\n", + "properties": { + "s_genes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of gene symbols for scoring s phase genes.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene1\";\"gene2\";\"gene3\"]`. " + }, + "s_genes_file": { + "type": "string", + "format": "path", + "description": "Path to a .txt file containing the gene list of s phase genes to be scored", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"s_gene_list.txt\"`. " + }, + "g2m_genes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of gene symbols for scoring g2m phase genes.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene1\";\"gene2\";\"gene3\"]`. " + }, + "g2m_genes_file": { + "type": "string", + "format": "path", + "description": "Path to a .txt file containing the gene list of g2m phase genes to be scored", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"g2m_gene_list.txt\"`. " + }, + "gene_pool": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of gene symbols for sampling the reference set", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene1\";\"gene2\";\"gene3\"]`. " + }, + "gene_pool_file": { + "type": "string", + "format": "path", + "description": "File with genes for sampling the reference set", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"gene_pool.txt\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/gene list inputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/files/make_params/.config.vsh.yaml b/target/nextflow/files/make_params/.config.vsh.yaml new file mode 100644 index 00000000..d63981a8 --- /dev/null +++ b/target/nextflow/files/make_params/.config.vsh.yaml @@ -0,0 +1,285 @@ +name: "make_params" +namespace: "files" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--base_dir" + description: "Base directory to search recursively" + info: null + example: + - "/path/to/dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pattern" + description: "An optional regular expression. Only file names which match the\ + \ regular expression will be matched." + info: null + example: + - "*.fastq.gz" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_dirname_drop" + description: "For every matched file, the parent directory will be traversed N\ + \ times." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_basename_id" + description: "The unique identifiers will consist of at least N dirnames." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_name" + description: "The name for storing the identifier field in the yaml." + info: null + default: + - "id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--path_name" + description: "The name for storing the path field in the yaml." + info: null + default: + - "path" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_name" + description: "Top level name for the group of entries." + info: null + example: + - "param_list" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output YAML file." + info: null + example: + - "params.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Looks for files in a directory and turn it in a params file." +test_resources: +- type: "bash_script" + path: "test_make_params.sh" + is_executable: true +- type: "file" + path: "src" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/randpy:r4.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/files/make_params/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/files/make_params" + executable: "target/nextflow/files/make_params/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/files/make_params/main.nf b/target/nextflow/files/make_params/main.nf new file mode 100644 index 00000000..af38c1b2 --- /dev/null +++ b/target/nextflow/files/make_params/main.nf @@ -0,0 +1,4008 @@ +// make_params main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (maintainer, author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "make_params", + "namespace" : "files", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--base_dir", + "description" : "Base directory to search recursively", + "example" : [ + "/path/to/dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pattern", + "description" : "An optional regular expression. Only file names which match the regular expression will be matched.", + "example" : [ + "*.fastq.gz" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_dirname_drop", + "description" : "For every matched file, the parent directory will be traversed N times.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_basename_id", + "description" : "The unique identifiers will consist of at least N dirnames.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--id_name", + "description" : "The name for storing the identifier field in the yaml.", + "default" : [ + "id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--path_name", + "description" : "The name for storing the path field in the yaml.", + "default" : [ + "path" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--group_name", + "description" : "Top level name for the group of entries.", + "example" : [ + "param_list" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output YAML file.", + "example" : [ + "params.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Looks for files in a directory and turn it in a params file.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test_make_params.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../src" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/randpy:r4.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/files/make_params/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/files/make_params", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(dplyr) +library(purrr) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "base_dir" = $( if [ ! -z ${VIASH_PAR_BASE_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_BASE_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "pattern" = $( if [ ! -z ${VIASH_PAR_PATTERN+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PATTERN" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "n_dirname_drop" = $( if [ ! -z ${VIASH_PAR_N_DIRNAME_DROP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_N_DIRNAME_DROP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "n_basename_id" = $( if [ ! -z ${VIASH_PAR_N_BASENAME_ID+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_N_BASENAME_ID" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "id_name" = $( if [ ! -z ${VIASH_PAR_ID_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ID_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "path_name" = $( if [ ! -z ${VIASH_PAR_PATH_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PATH_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "group_name" = $( if [ ! -z ${VIASH_PAR_GROUP_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +cat("> Listing files of base dir ", par\\$base_dir, "\\\\n", sep = "") +paths <- list.files( + normalizePath(par\\$base_dir), + pattern = par\\$pattern, + recursive = TRUE, + full.names = TRUE +) + +cat("> Traversing up ", par\\$n_dirname_apply, " times\\\\n", sep = "") +for (i in seq_len(par\\$n_dirname_drop)) { + paths <- dirname(paths) %>% unique() +} + +# removing /viash_automount in case we're inside a docker container +paths <- gsub("^/viash_automount", "", paths) + +cat("> Checking whether basenames are unique\\\\n") +i <- par\\$n_basename_id +maxi <- strsplit(paths, "/") %>% + map_int(length) %>% + max() + +regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)\\$") +ids <- gsub("/", "_", gsub(regex, "\\\\\\\\1", paths)) + +cat("> Printing first five rows\\\\n") +print(tibble(id = ids, path = paths) %>% head(5)) +cat("\\\\n") + +while (i < maxi && any(duplicated(ids))) { + i <- i + 1 + cat( + "Duplicated ids detected, combining with ", i, + " dirnames in an attempt to get unique ids.\\\\n" + ) + regex <- paste0(".*/(", paste(rep("[^/]+/", i), collapse = ""), "[^/]*)\\$") + ids <- gsub("/", "_", gsub(regex, "\\\\\\\\1", paths)) + + cat("> Printing first five rows\\\\n") + print(tibble(id = ids, path = paths) %>% head(5)) + cat("\\\\n") +} + +cat("> Transforming into list of items\\\\n") +par_list <- map2( + ids, paths, + function(id, input) { + setNames(list(id, input), c(par\\$id_name, par\\$path_name)) + } +) + +if (!is.null(par\\$group_name)) { + par_list <- setNames(list(par_list), par\\$group_name) +} + +cat("> Writing as YAML\\\\n") +yaml::write_yaml(par_list, par\\$output) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/files/make_params", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/files/make_params/nextflow.config b/target/nextflow/files/make_params/nextflow.config new file mode 100644 index 00000000..1d584c57 --- /dev/null +++ b/target/nextflow/files/make_params/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'files/make_params' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Looks for files in a directory and turn it in a params file.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/files/make_params/nextflow_labels.config b/target/nextflow/files/make_params/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/files/make_params/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/files/make_params/nextflow_schema.json b/target/nextflow/files/make_params/nextflow_schema.json new file mode 100644 index 00000000..6b20ffe8 --- /dev/null +++ b/target/nextflow/files/make_params/nextflow_schema.json @@ -0,0 +1,83 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "make_params", + "description": "Looks for files in a directory and turn it in a params file.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "base_dir": { + "type": "string", + "format": "path", + "exists": true, + "description": "Base directory to search recursively", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/dir\"`. " + }, + "pattern": { + "type": "string", + "description": "An optional regular expression", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"*.fastq.gz\"`. " + }, + "n_dirname_drop": { + "type": "integer", + "description": "For every matched file, the parent directory will be traversed N times.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "n_basename_id": { + "type": "integer", + "description": "The unique identifiers will consist of at least N dirnames.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "id_name": { + "type": "string", + "description": "The name for storing the identifier field in the yaml.", + "help_text": "Type: `string`, multiple: `False`, default: `\"id\"`. ", + "default": "id" + }, + "path_name": { + "type": "string", + "description": "The name for storing the path field in the yaml.", + "help_text": "Type: `string`, multiple: `False`, default: `\"path\"`. ", + "default": "path" + }, + "group_name": { + "type": "string", + "description": "Top level name for the group of entries.", + "help_text": "Type: `string`, multiple: `False`, example: `\"param_list\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output YAML file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.yaml\"`, direction: `output`, example: `\"params.yaml\"`. ", + "default": "$id.$key.output.yaml" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/delimit_fraction/.config.vsh.yaml b/target/nextflow/filter/delimit_fraction/.config.vsh.yaml new file mode 100644 index 00000000..140a05a4 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/.config.vsh.yaml @@ -0,0 +1,307 @@ +name: "delimit_fraction" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_fraction_column" + description: "Name of column from .var dataframe selecting\na column that contains\ + \ floating point values between 0 and 1.\n" + info: null + example: + - "fraction_mitochondrial" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be removed." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "double" + name: "--min_fraction" + description: "Min fraction for an observation to be retained (True in output)." + info: null + default: + - 0.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_fraction" + description: "Max fraction for an observation to be retained (True in output)." + info: null + default: + - 1.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Turns a column containing values between 0 and 1 into a boolean column\ + \ based on thresholds.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/delimit_fraction/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/delimit_fraction" + executable: "target/nextflow/filter/delimit_fraction/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/delimit_fraction/compress_h5mu.py b/target/nextflow/filter/delimit_fraction/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/delimit_fraction/main.nf b/target/nextflow/filter/delimit_fraction/main.nf new file mode 100644 index 00000000..acb98668 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/main.nf @@ -0,0 +1,4054 @@ +// delimit_fraction main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "delimit_fraction", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_fraction_column", + "description" : "Name of column from .var dataframe selecting\na column that contains floating point values between 0 and 1.\n", + "example" : [ + "fraction_mitochondrial" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_filter", + "description" : "In which .obs slot to store a boolean array corresponding to which observations should be removed.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "double", + "name" : "--min_fraction", + "description" : "Min fraction for an observation to be retained (True in output).", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_fraction", + "description" : "Max fraction for an observation to be retained (True in output).", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Turns a column containing values between 0 and 1 into a boolean column based on thresholds.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/delimit_fraction/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/delimit_fraction", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import numpy as np +import sys +from operator import le, ge +from pandas.api.types import is_float_dtype + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_fraction_column': $( if [ ! -z ${VIASH_PAR_OBS_FRACTION_COLUMN+x} ]; then echo "r'${VIASH_PAR_OBS_FRACTION_COLUMN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_fraction': $( if [ ! -z ${VIASH_PAR_MIN_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_MIN_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_fraction': $( if [ ! -z ${VIASH_PAR_MAX_FRACTION+x} ]; then echo "float(r'${VIASH_PAR_MAX_FRACTION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +assert data.var_names.is_unique, ( + "The var_names of the input modality must be be unique." +) + +logger.info("\\\\tUnfiltered data: %s", data) +logger.info("\\\\tComputing aggregations.") + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +try: + fraction = data.obs[par["obs_fraction_column"]] +except KeyError: + raise ValueError(f"Could not find column '{par['obs_fraction_column']}'") +if not is_float_dtype(fraction): + raise ValueError( + f"Column '{par['obs_fraction_column']}' does not contain float datatype." + ) +if fraction.max() > 1: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values > 1.") +if fraction.min() < 0: + raise ValueError(f"Column '{par['obs_fraction_column']}' contains values < 0.") + + +# Filter cells +filters = ( + ( + "min_fraction", + fraction, + ge, + "\\\\tRemoving %s cells with <%s percentage mitochondrial reads.", + ), + ( + "max_fraction", + fraction, + le, + "\\\\tRemoving %s cells with >%s percentage mitochondrial reads.", + ), +) + +keep_cells = np.repeat(True, data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +data.obs[par["obs_name_filter"]] = keep_cells + +logger.info("\\\\tFiltered data: %s", data) +logger.info("Writing output data to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/delimit_fraction", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/delimit_fraction/nextflow.config b/target/nextflow/filter/delimit_fraction/nextflow.config new file mode 100644 index 00000000..85de6b22 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/delimit_fraction' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Turns a column containing values between 0 and 1 into a boolean column based on thresholds.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/delimit_fraction/nextflow_labels.config b/target/nextflow/filter/delimit_fraction/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/delimit_fraction/nextflow_schema.json b/target/nextflow/filter/delimit_fraction/nextflow_schema.json new file mode 100644 index 00000000..52b00a1b --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/nextflow_schema.json @@ -0,0 +1,106 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "delimit_fraction", + "description": "Turns a column containing values between 0 and 1 into a boolean column based on thresholds.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_fraction_column": { + "type": "string", + "description": "Name of column from .var dataframe selecting\na column that contains floating point values between 0 and 1.\n", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"fraction_mitochondrial\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obs_name_filter": { + "type": "string", + "description": "In which .obs slot to store a boolean array corresponding to which observations should be removed.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "min_fraction": { + "type": "number", + "description": "Min fraction for an observation to be retained (True in output).", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "max_fraction": { + "type": "number", + "description": "Max fraction for an observation to be retained (True in output).", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/delimit_fraction/setup_logger.py b/target/nextflow/filter/delimit_fraction/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/delimit_fraction/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/do_filter/.config.vsh.yaml b/target/nextflow/filter/do_filter/.config.vsh.yaml new file mode 100644 index 00000000..36550dfd --- /dev/null +++ b/target/nextflow/filter/do_filter/.config.vsh.yaml @@ -0,0 +1,275 @@ +name: "do_filter" +namespace: "filter" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_filter" + description: "Which .obs columns to use to filter the observations by." + info: null + example: + - "filter_with_x" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--var_filter" + description: "Which .var columns to use to filter the observations by." + info: null + example: + - "filter_with_x" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Remove observations and variables based on specified .obs and .var columns.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/do_filter/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/do_filter" + executable: "target/nextflow/filter/do_filter/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/do_filter/compress_h5mu.py b/target/nextflow/filter/do_filter/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/do_filter/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/do_filter/main.nf b/target/nextflow/filter/do_filter/main.nf new file mode 100644 index 00000000..57ac78af --- /dev/null +++ b/target/nextflow/filter/do_filter/main.nf @@ -0,0 +1,3977 @@ +// do_filter main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer, contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "do_filter", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_filter", + "description" : "Which .obs columns to use to filter the observations by.", + "example" : [ + "filter_with_x" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_filter", + "description" : "Which .var columns to use to filter the observations by.", + "example" : [ + "filter_with_x" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Remove observations and variables based on specified .obs and .var columns.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/do_filter/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/do_filter", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import numpy as np +import sys + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_filter': $( if [ ! -z ${VIASH_PAR_OBS_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_FILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'var_filter': $( if [ ! -z ${VIASH_PAR_VAR_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_FILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading %s, modality: %s", par["input"], par["modality"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +obs_filt = np.repeat(True, adata.n_obs) +var_filt = np.repeat(True, adata.n_vars) + +par["obs_filter"] = par["obs_filter"] if par["obs_filter"] else [] +par["var_filter"] = par["var_filter"] if par["var_filter"] else [] + +for obs_name in par["obs_filter"]: + logger.info( + "Filtering modality '%s' observations by .obs['%s']", par["modality"], obs_name + ) + if obs_name not in adata.obs: + raise ValueError(f".mod[{par['modality']}].obs[{obs_name}] does not exist.") + if obs_name in adata.obs: + obs_filt &= adata.obs[obs_name] + +for var_name in par["var_filter"]: + logger.info( + "Filtering modality '%s' variables by .var['%s']", par["modality"], var_name + ) + if var_name not in adata.var: + raise ValueError(f".mod[{par['modality']}].var[{var_name}] does not exist.") + if var_name in adata.var: + var_filt &= adata.var[var_name] + +adata_filtered = adata[obs_filt, var_filt] + +logger.info("Writing h5mu to file %s.", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + adata_filtered, + par["output_compression"], +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/do_filter", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/do_filter/nextflow.config b/target/nextflow/filter/do_filter/nextflow.config new file mode 100644 index 00000000..9e99b152 --- /dev/null +++ b/target/nextflow/filter/do_filter/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/do_filter' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Remove observations and variables based on specified .obs and .var columns.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/do_filter/nextflow_labels.config b/target/nextflow/filter/do_filter/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/do_filter/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/do_filter/nextflow_schema.json b/target/nextflow/filter/do_filter/nextflow_schema.json new file mode 100644 index 00000000..f63c9c3e --- /dev/null +++ b/target/nextflow/filter/do_filter/nextflow_schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "do_filter", + "description": "Remove observations and variables based on specified .obs and .var columns.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_filter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Which .obs columns to use to filter the observations by.", + "help_text": "Type: `string`, multiple: `True`, example: `[\"filter_with_x\"]`. " + }, + "var_filter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Which .var columns to use to filter the observations by.", + "help_text": "Type: `string`, multiple: `True`, example: `[\"filter_with_x\"]`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/do_filter/setup_logger.py b/target/nextflow/filter/do_filter/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/do_filter/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/filter_with_counts/.config.vsh.yaml b/target/nextflow/filter/filter_with_counts/.config.vsh.yaml new file mode 100644 index 00000000..4e23e894 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/.config.vsh.yaml @@ -0,0 +1,362 @@ +name: "filter_with_counts" +namespace: "filter" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Location of the count matrix. If specified, will be used to select\ + \ a key from .layers,\notherwise .X is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--do_subset" + description: "Whether to subset before storing the output." + info: null + direction: "input" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be removed." + info: null + default: + - "filter_with_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ variables should be removed." + info: null + default: + - "filter_with_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_genes_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_genes_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_gene" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter scRNA-seq data based on the primary QC metrics. \nThis is based\ + \ on both the UMI counts, the gene counts \nand the mitochondrial genes (genes starting\ + \ with mt/MT).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/filter_with_counts/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/filter_with_counts" + executable: "target/nextflow/filter/filter_with_counts/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/filter_with_counts/compress_h5mu.py b/target/nextflow/filter/filter_with_counts/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/filter_with_counts/main.nf b/target/nextflow/filter/filter_with_counts/main.nf new file mode 100644 index 00000000..334e49e2 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/main.nf @@ -0,0 +1,4153 @@ +// filter_with_counts main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Robrecht Cannoodt (maintainer, author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "filter_with_counts", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Location of the count matrix. If specified, will be used to select a key from .layers,\notherwise .X is used.\n", + "example" : [ + "raw_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--do_subset", + "description" : "Whether to subset before storing the output.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--obs_name_filter", + "description" : "In which .obs slot to store a boolean array corresponding to which observations should be removed.", + "default" : [ + "filter_with_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_filter", + "description" : "In which .var slot to store a boolean array corresponding to which variables should be removed.", + "default" : [ + "filter_with_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of counts captured per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_counts", + "description" : "Maximum number of counts captured per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_genes_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_genes_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 1500000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_per_gene", + "description" : "Minimum of non-zero values per gene.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Filter scRNA-seq data based on the primary QC metrics. \nThis is based on both the UMI counts, the gene counts \nand the mitochondrial genes (genes starting with mt/MT).\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/filter_with_counts/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/filter_with_counts", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import numpy as np +import sys +from operator import le, ge, gt + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'do_subset': $( if [ ! -z ${VIASH_PAR_DO_SUBSET+x} ]; then echo "r'${VIASH_PAR_DO_SUBSET//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_name_filter': $( if [ ! -z ${VIASH_PAR_VAR_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_VAR_NAME_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_counts': $( if [ ! -z ${VIASH_PAR_MAX_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MAX_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_genes_per_cell': $( if [ ! -z ${VIASH_PAR_MIN_GENES_PER_CELL+x} ]; then echo "int(r'${VIASH_PAR_MIN_GENES_PER_CELL//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_genes_per_cell': $( if [ ! -z ${VIASH_PAR_MAX_GENES_PER_CELL+x} ]; then echo "int(r'${VIASH_PAR_MAX_GENES_PER_CELL//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_cells_per_gene': $( if [ ! -z ${VIASH_PAR_MIN_CELLS_PER_GENE+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS_PER_GENE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input data from %s, modality %s", par["input"], par["modality"]) +modality_data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("\\\\tUnfiltered data: %s", modality_data) + +logger.info("Selecting input layer %s", "X" if par["layer"] else par["layer"]) +input_layer = ( + modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] +) + +logger.info("\\\\tComputing aggregations.") +n_counts_per_cell = np.ravel(np.sum(input_layer, axis=1)) +n_cells_per_gene = np.sum(input_layer > 0, axis=0) +n_genes_per_cell = np.sum(input_layer > 0, axis=1) + + +def apply_filter_to_mask(mask, base, filter, comparator): + new_filt = np.ravel(comparator(base, filter)) + num_removed = np.sum(np.invert(new_filt) & mask) + mask &= new_filt + return num_removed, mask + + +# Filter genes +keep_genes = np.repeat(True, modality_data.n_vars) +if par["min_cells_per_gene"] is not None: + num_removed, keep_genes = apply_filter_to_mask( + keep_genes, n_cells_per_gene, par["min_cells_per_gene"], ge + ) + logger.info( + "\\\\tRemoving %s genes with non-zero values in <%s cells.", + num_removed, + par["min_cells_per_gene"], + ) + +# Filter cells +filters = ( + ( + "min_genes_per_cell", + n_genes_per_cell, + ge, + "\\\\tRemoving %s cells with non-zero values in <%s genes.", + ), + ( + "max_genes_per_cell", + n_genes_per_cell, + le, + "\\\\tRemoving %s cells with non-zero values in >%s genes.", + ), + ("min_counts", n_counts_per_cell, ge, "\\\\tRemoving %s cells with <%s total counts."), + ("max_counts", n_counts_per_cell, le, "\\\\tRemoving %s cells with >%s total counts."), + ( + 0, + np.sum(input_layer[:, keep_genes], axis=1), + gt, + "\\\\tRemoving %s cells with %s counts", + ), +) + +keep_cells = np.repeat(True, modality_data.n_obs) +for filter_name_or_value, base, comparator, message in filters: + try: + filter = par[filter_name_or_value] + except KeyError: + filter = filter_name_or_value + if filter is not None: + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) + logger.info(message, num_removed, filter) + +if par["obs_name_filter"] is not None: + modality_data.obs[par["obs_name_filter"]] = keep_cells +if par["var_name_filter"] is not None: + modality_data.var[par["var_name_filter"]] = keep_genes + +if par["do_subset"]: + modality_data = modality_data[keep_cells, keep_genes] + +logger.info("\\\\tFiltered data: %s", modality_data) +logger.info( + "Writing output data to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], +) + + +logger.info("Finished") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/filter_with_counts", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/filter_with_counts/nextflow.config b/target/nextflow/filter/filter_with_counts/nextflow.config new file mode 100644 index 00000000..442e98b9 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/filter_with_counts' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Filter scRNA-seq data based on the primary QC metrics. \nThis is based on both the UMI counts, the gene counts \nand the mitochondrial genes (genes starting with mt/MT).\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/filter_with_counts/nextflow_labels.config b/target/nextflow/filter/filter_with_counts/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/filter_with_counts/nextflow_schema.json b/target/nextflow/filter/filter_with_counts/nextflow_schema.json new file mode 100644 index 00000000..4e586c5d --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/nextflow_schema.json @@ -0,0 +1,132 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "filter_with_counts", + "description": "Filter scRNA-seq data based on the primary QC metrics. \nThis is based on both the UMI counts, the gene counts \nand the mitochondrial genes (genes starting with mt/MT).\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Location of the count matrix", + "help_text": "Type: `string`, multiple: `False`, example: `\"raw_counts\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "do_subset": { + "type": "boolean", + "description": "Whether to subset before storing the output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "obs_name_filter": { + "type": "string", + "description": "In which .obs slot to store a boolean array corresponding to which observations should be removed.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_counts\"`. ", + "default": "filter_with_counts" + }, + "var_name_filter": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to which variables should be removed.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_counts\"`. ", + "default": "filter_with_counts" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "min_counts": { + "type": "integer", + "description": "Minimum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_counts": { + "type": "integer", + "description": "Maximum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "min_genes_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_genes_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `1500000`. " + }, + "min_cells_per_gene": { + "type": "integer", + "description": "Minimum of non-zero values per gene.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/filter_with_counts/setup_logger.py b/target/nextflow/filter/filter_with_counts/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/filter_with_counts/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/filter_with_scrublet/.config.vsh.yaml b/target/nextflow/filter/filter_with_scrublet/.config.vsh.yaml new file mode 100644 index 00000000..21ec5f45 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/.config.vsh.yaml @@ -0,0 +1,425 @@ +name: "filter_with_scrublet" +namespace: "filter" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to use as data for calculating doublets. .X is used\ + \ not specified." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_filter" + description: "In which .obs slot to store a boolean array corresponding to which\ + \ observations should be filtered out." + info: null + default: + - "filter_with_scrublet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--do_subset" + description: "Whether to subset before storing the output." + info: null + direction: "input" + - type: "string" + name: "--obs_name_doublet_score" + description: "Name of the doublet scores column in the obs slot of the returned\ + \ object." + info: null + default: + - "scrublet_doublet_score" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--expected_doublet_rate" + description: "The estimated fraction of doublets as from the experimental setup.\n" + info: null + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--stdev_doublet_rate" + description: "Uncertainty in the expected doublet rate." + info: null + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "Number of neighbors used to construct the KNN classifier of observed\ + \ transcriptomes\nand simulated doublets.\n" + info: null + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--sim_doublet_ratio" + description: "Number of doublets to simulate relative to the number of observed\n\ + transcriptomes.\n" + info: null + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts" + description: "The number of minimal UMI counts per cell that have to be present\ + \ for initial cell detection." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "The number of cells in which UMIs for a gene were detected." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_gene_variablity_percent" + description: "Used for gene filtering prior to PCA. Keep the most highly variable\ + \ genes (in the top min_gene_variability_pctl percentile), as measured by the\ + \ v-statistic [Klein et al., Cell 2015]." + info: null + default: + - 85.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_pca_components" + description: "Number of principal components to use during PCA dimensionality\ + \ reduction." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--distance_metric" + description: "The distance metric used for computing similarities." + info: null + default: + - "euclidean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--allow_automatic_threshold_detection_fail" + description: "When scrublet fails to automatically determine the double score\ + \ threshold, \nallow the component to continue and set the output columns to\ + \ NA.\n" + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Doublet detection using the Scrublet method (Wolock, Lopez and Klein,\ + \ 2019).\nThe method tests for potential doublets by using the expression profiles\ + \ of\ncells to generate synthetic potential doubles which are tested against cells.\ + \ \nThe method returns a \"doublet score\" on which it calls for potential doublets.\n\ + \nFor the source code please visit https://github.com/AllonKleinLab/scrublet.\n\n\ + For 10x we expect the doublet rates to be:\n Multiplet Rate (%) - # of Cells Loaded\ + \ - # of Cells Recovered\n ~0.4% ~800 ~500\n ~0.8% ~1,600 ~1,000\n ~1.6% ~3,200\ + \ ~2,000\n ~2.3% ~4,800 ~3,000\n ~3.1% ~6,400 ~4,000\n ~3.9% ~8,000 ~5,000\n\ + \ ~4.6% ~9,600 ~6,000\n ~5.4% ~11,200 ~7,000\n ~6.1% ~12,800 ~8,000\n ~6.9%\ + \ ~14,400 ~9,000\n ~7.6% ~16,000 ~10,000\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" + docker_run_args: + - "--env NUMBA_CACHE_DIR=/tmp" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scrublet" + - "annoy==1.17.3" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/filter_with_scrublet/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/filter_with_scrublet" + executable: "target/nextflow/filter/filter_with_scrublet/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/filter_with_scrublet/compress_h5mu.py b/target/nextflow/filter/filter_with_scrublet/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/filter_with_scrublet/main.nf b/target/nextflow/filter/filter_with_scrublet/main.nf new file mode 100644 index 00000000..3bfc0d71 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/main.nf @@ -0,0 +1,4191 @@ +// filter_with_scrublet main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (contributor) +// * Robrecht Cannoodt (maintainer, contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "filter_with_scrublet", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to use as data for calculating doublets. .X is used not specified.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_filter", + "description" : "In which .obs slot to store a boolean array corresponding to which observations should be filtered out.", + "default" : [ + "filter_with_scrublet" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--do_subset", + "description" : "Whether to subset before storing the output.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--obs_name_doublet_score", + "description" : "Name of the doublet scores column in the obs slot of the returned object.", + "default" : [ + "scrublet_doublet_score" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--expected_doublet_rate", + "description" : "The estimated fraction of doublets as from the experimental setup.\n", + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--stdev_doublet_rate", + "description" : "Uncertainty in the expected doublet rate.", + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_neighbors", + "description" : "Number of neighbors used to construct the KNN classifier of observed transcriptomes\nand simulated doublets.\n", + "required" : false, + "min" : 0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--sim_doublet_ratio", + "description" : "Number of doublets to simulate relative to the number of observed\ntranscriptomes.\n", + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_counts", + "description" : "The number of minimal UMI counts per cell that have to be present for initial cell detection.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells", + "description" : "The number of cells in which UMIs for a gene were detected.", + "default" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_gene_variablity_percent", + "description" : "Used for gene filtering prior to PCA. Keep the most highly variable genes (in the top min_gene_variability_pctl percentile), as measured by the v-statistic [Klein et al., Cell 2015].", + "default" : [ + 85.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_pca_components", + "description" : "Number of principal components to use during PCA dimensionality reduction.", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--distance_metric", + "description" : "The distance metric used for computing similarities.", + "default" : [ + "euclidean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--allow_automatic_threshold_detection_fail", + "description" : "When scrublet fails to automatically determine the double score threshold, \nallow the component to continue and set the output columns to NA.\n", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Doublet detection using the Scrublet method (Wolock, Lopez and Klein, 2019).\nThe method tests for potential doublets by using the expression profiles of\ncells to generate synthetic potential doubles which are tested against cells. \nThe method returns a \\"doublet score\\" on which it calls for potential doublets.\n\nFor the source code please visit https://github.com/AllonKleinLab/scrublet.\n\nFor 10x we expect the doublet rates to be:\n Multiplet Rate (%) - # of Cells Loaded - # of Cells Recovered\n ~0.4% ~800 ~500\n ~0.8% ~1,600 ~1,000\n ~1.6% ~3,200 ~2,000\n ~2.3% ~4,800 ~3,000\n ~3.1% ~6,400 ~4,000\n ~3.9% ~8,000 ~5,000\n ~4.6% ~9,600 ~6,000\n ~5.4% ~11,200 ~7,000\n ~6.1% ~12,800 ~8,000\n ~6.9% ~14,400 ~9,000\n ~7.6% ~16,000 ~10,000\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild", + "docker_run_args" : [ + "--env NUMBA_CACHE_DIR=/tmp" + ] + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highcpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "build-essential" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "scrublet", + "annoy==1.17.3" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/filter_with_scrublet/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/filter_with_scrublet", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scrublet as scr +import mudata as mu +import numpy as np +import sys +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_name_filter': $( if [ ! -z ${VIASH_PAR_OBS_NAME_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'do_subset': $( if [ ! -z ${VIASH_PAR_DO_SUBSET+x} ]; then echo "r'${VIASH_PAR_DO_SUBSET//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'obs_name_doublet_score': $( if [ ! -z ${VIASH_PAR_OBS_NAME_DOUBLET_SCORE+x} ]; then echo "r'${VIASH_PAR_OBS_NAME_DOUBLET_SCORE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'expected_doublet_rate': $( if [ ! -z ${VIASH_PAR_EXPECTED_DOUBLET_RATE+x} ]; then echo "float(r'${VIASH_PAR_EXPECTED_DOUBLET_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'stdev_doublet_rate': $( if [ ! -z ${VIASH_PAR_STDEV_DOUBLET_RATE+x} ]; then echo "float(r'${VIASH_PAR_STDEV_DOUBLET_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sim_doublet_ratio': $( if [ ! -z ${VIASH_PAR_SIM_DOUBLET_RATIO+x} ]; then echo "float(r'${VIASH_PAR_SIM_DOUBLET_RATIO//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_gene_variablity_percent': $( if [ ! -z ${VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT+x} ]; then echo "float(r'${VIASH_PAR_MIN_GENE_VARIABLITY_PERCENT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'num_pca_components': $( if [ ! -z ${VIASH_PAR_NUM_PCA_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_NUM_PCA_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'distance_metric': $( if [ ! -z ${VIASH_PAR_DISTANCE_METRIC+x} ]; then echo "r'${VIASH_PAR_DISTANCE_METRIC//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'allow_automatic_threshold_detection_fail': $( if [ ! -z ${VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL+x} ]; then echo "r'${VIASH_PAR_ALLOW_AUTOMATIC_THRESHOLD_DETECTION_FAIL//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +logger = setup_logger() + +logger.info("Reading %s, modality %s", par["input"], par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"]) +input_layer = data.X if not par["layer"] else data.layers[par["layer"]] + +if 0 in input_layer.shape: + raise ValueError( + f"Modality {par['modality']} of input Mudata {par['input']} appears " + f"to be empty (shape: {input_layer.shape})." + ) + +logger.info("\\\\tRunning scrublet") +initializer_args = { + arg: par[arg] + for arg in ( + "expected_doublet_rate", + "stdev_doublet_rate", + "n_neighbors", + "sim_doublet_ratio", + ) + if par[arg] is not None +} +scrub = scr.Scrublet(input_layer, **initializer_args) + +doublet_scores, predicted_doublets = scrub.scrub_doublets( + min_counts=par["min_counts"], + min_cells=par["min_cells"], + min_gene_variability_pctl=par["min_gene_variablity_percent"], + n_prin_comps=par["num_pca_components"], + distance_metric=par["distance_metric"], + use_approx_neighbors=False, +) + +try: + keep_cells = np.invert(predicted_doublets) +except TypeError: + if par["allow_automatic_threshold_detection_fail"]: + # Scrublet might not throw an error and return None if it fails to detect doublets... + logger.info( + "\\\\tScrublet could not automatically detect the doublet score threshold. Setting output columns to NA." + ) + keep_cells = np.nan + doublet_scores = np.nan + else: + raise RuntimeError( + "Scrublet could not automatically detect the doublet score threshold. " + "--allow_automatic_threshold_detection_fail can be used to ignore this failure " + "and set the corresponding output columns to NA." + ) + +logger.info("\\\\tStoring output into .obs") +if par["obs_name_doublet_score"] is not None: + data.obs[par["obs_name_doublet_score"]] = doublet_scores + data.obs[par["obs_name_doublet_score"]] = data.obs[ + par["obs_name_doublet_score"] + ].astype("float64") +if par["obs_name_filter"] is not None: + data.obs[par["obs_name_filter"]] = keep_cells + data.obs[par["obs_name_filter"]] = data.obs[par["obs_name_filter"]].astype( + pd.BooleanDtype() + ) + +if par["do_subset"]: + if pd.api.types.is_scalar(keep_cells) and pd.isna(keep_cells): + logger.warning("Not subsetting beacuse doublets were not predicted") + else: + data = data[keep_cells, :] + +logger.info( + "Writing h5mu to %s, with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/filter_with_scrublet", + "tag" : "main" + }, + "label" : [ + "highcpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/filter_with_scrublet/nextflow.config b/target/nextflow/filter/filter_with_scrublet/nextflow.config new file mode 100644 index 00000000..05baf7b2 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/filter_with_scrublet' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Doublet detection using the Scrublet method (Wolock, Lopez and Klein, 2019).\nThe method tests for potential doublets by using the expression profiles of\ncells to generate synthetic potential doubles which are tested against cells. \nThe method returns a "doublet score" on which it calls for potential doublets.\n\nFor the source code please visit https://github.com/AllonKleinLab/scrublet.\n\nFor 10x we expect the doublet rates to be:\n Multiplet Rate (%) - # of Cells Loaded - # of Cells Recovered\n ~0.4% ~800 ~500\n ~0.8% ~1,600 ~1,000\n ~1.6% ~3,200 ~2,000\n ~2.3% ~4,800 ~3,000\n ~3.1% ~6,400 ~4,000\n ~3.9% ~8,000 ~5,000\n ~4.6% ~9,600 ~6,000\n ~5.4% ~11,200 ~7,000\n ~6.1% ~12,800 ~8,000\n ~6.9% ~14,400 ~9,000\n ~7.6% ~16,000 ~10,000\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/filter_with_scrublet/nextflow_labels.config b/target/nextflow/filter/filter_with_scrublet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/filter_with_scrublet/nextflow_schema.json b/target/nextflow/filter/filter_with_scrublet/nextflow_schema.json new file mode 100644 index 00000000..dae9f36a --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/nextflow_schema.json @@ -0,0 +1,143 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "filter_with_scrublet", + "description": "Doublet detection using the Scrublet method (Wolock, Lopez and Klein, 2019).\nThe method tests for potential doublets by using the expression profiles of\ncells to generate synthetic potential doubles which are tested against cells. \nThe method returns a \"doublet score\" on which it calls for potential doublets.\n\nFor the source code please visit https://github.com/AllonKleinLab/scrublet.\n\nFor 10x we expect the doublet rates to be:\n Multiplet Rate (%) - # of Cells Loaded - # of Cells Recovered\n ~0.4% ~800 ~500\n ~0.8% ~1,600 ~1,000\n ~1.6% ~3,200 ~2,000\n ~2.3% ~4,800 ~3,000\n ~3.1% ~6,400 ~4,000\n ~3.9% ~8,000 ~5,000\n ~4.6% ~9,600 ~6,000\n ~5.4% ~11,200 ~7,000\n ~6.1% ~12,800 ~8,000\n ~6.9% ~14,400 ~9,000\n ~7.6% ~16,000 ~10,000\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Input layer to use as data for calculating doublets", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obs_name_filter": { + "type": "string", + "description": "In which .obs slot to store a boolean array corresponding to which observations should be filtered out.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_scrublet\"`. ", + "default": "filter_with_scrublet" + }, + "do_subset": { + "type": "boolean", + "description": "Whether to subset before storing the output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "obs_name_doublet_score": { + "type": "string", + "description": "Name of the doublet scores column in the obs slot of the returned object.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scrublet_doublet_score\"`. ", + "default": "scrublet_doublet_score" + }, + "expected_doublet_rate": { + "type": "number", + "description": "The estimated fraction of doublets as from the experimental setup.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "stdev_doublet_rate": { + "type": "number", + "description": "Uncertainty in the expected doublet rate.", + "help_text": "Type: `double`, multiple: `False`. " + }, + "n_neighbors": { + "type": "integer", + "description": "Number of neighbors used to construct the KNN classifier of observed transcriptomes\nand simulated doublets.\n", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "sim_doublet_ratio": { + "type": "number", + "description": "Number of doublets to simulate relative to the number of observed\ntranscriptomes.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "min_counts": { + "type": "integer", + "description": "The number of minimal UMI counts per cell that have to be present for initial cell detection.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "min_cells": { + "type": "integer", + "description": "The number of cells in which UMIs for a gene were detected.", + "help_text": "Type: `integer`, multiple: `False`, default: `3`. ", + "default": 3 + }, + "min_gene_variablity_percent": { + "type": "number", + "description": "Used for gene filtering prior to PCA", + "help_text": "Type: `double`, multiple: `False`, default: `85.0`. ", + "default": 85.0 + }, + "num_pca_components": { + "type": "integer", + "description": "Number of principal components to use during PCA dimensionality reduction.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + }, + "distance_metric": { + "type": "string", + "description": "The distance metric used for computing similarities.", + "help_text": "Type: `string`, multiple: `False`, default: `\"euclidean\"`. ", + "default": "euclidean" + }, + "allow_automatic_threshold_detection_fail": { + "type": "boolean", + "description": "When scrublet fails to automatically determine the double score threshold, \nallow the component to continue and set the output columns to NA.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/filter_with_scrublet/setup_logger.py b/target/nextflow/filter/filter_with_scrublet/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/filter_with_scrublet/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/intersect_obs/.config.vsh.yaml b/target/nextflow/filter/intersect_obs/.config.vsh.yaml new file mode 100644 index 00000000..e9297cdd --- /dev/null +++ b/target/nextflow/filter/intersect_obs/.config.vsh.yaml @@ -0,0 +1,265 @@ +name: "intersect_obs" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Isabelle Bergiers" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "Isabelle-b" + orcid: "0000-0001-9622-7960" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Scientist OMICS Technology" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modalities" + description: "Which modalities from the input MuData file to process.\n" + info: null + example: + - "rna" + - "prot" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create an intersection between two or more modalities.\n\nThis component\ + \ removes any observations which are not present in all modalities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/intersect_obs/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/intersect_obs" + executable: "target/nextflow/filter/intersect_obs/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/intersect_obs/compress_h5mu.py b/target/nextflow/filter/intersect_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/intersect_obs/main.nf b/target/nextflow/filter/intersect_obs/main.nf new file mode 100644 index 00000000..675c7215 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/main.nf @@ -0,0 +1,3983 @@ +// intersect_obs main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Isabelle Bergiers (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "intersect_obs", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Isabelle Bergiers", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "Isabelle-b", + "orcid" : "0000-0001-9622-7960" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Scientist OMICS Technology" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modalities", + "description" : "Which modalities from the input MuData file to process.\n", + "example" : [ + "rna", + "prot" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Create an intersection between two or more modalities.\n\nThis component removes any observations which are not present in all modalities.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/intersect_obs/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/intersect_obs", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import anndata as ad +import sys +from pathlib import Path +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modalities': $( if [ ! -z ${VIASH_PAR_MODALITIES+x} ]; then echo "r'${VIASH_PAR_MODALITIES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + + +def main(): + modality_names = par["modalities"] + + if len(modality_names) < 2: + raise ValueError("Please provide two more more modalities.") + + obs_names = {} + for mod_name in par["modalities"]: + try: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + except KeyError: + raise ValueError( + f"Modality {mod_name} does not exist for file {par['input']}." + ) + + obs_names[mod_name] = modality.obs_names.copy() + del modality + + intersected_index = None + for mod_name, mod_index in obs_names.items(): + if intersected_index is None: + intersected_index = mod_index + continue + intersected_index = intersected_index.intersection(mod_index) + + output_file = Path(par["output"]) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) + output_file_uncompressed.touch() + + mdata = mu.MuData({modality: ad.AnnData() for modality in modality_names}) + mdata.write(output_file_uncompressed, compression=par["output_compression"]) + + for mod_name in modality_names: + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) + intersected_modality = modality[intersected_index] + mu.write_h5ad(output_file_uncompressed, data=intersected_modality, mod=mod_name) + + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, output_file, compression=par["output_compression"] + ) + output_file_uncompressed.unlink() + else: + shutil.move(output_file_uncompressed, output_file) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/intersect_obs", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/intersect_obs/nextflow.config b/target/nextflow/filter/intersect_obs/nextflow.config new file mode 100644 index 00000000..6d38f4b1 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/intersect_obs' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Create an intersection between two or more modalities.\n\nThis component removes any observations which are not present in all modalities.\n' + author = 'Dries Schaumont, Isabelle Bergiers' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/intersect_obs/nextflow_labels.config b/target/nextflow/filter/intersect_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/intersect_obs/nextflow_schema.json b/target/nextflow/filter/intersect_obs/nextflow_schema.json new file mode 100644 index 00000000..8af22a00 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/nextflow_schema.json @@ -0,0 +1,66 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "intersect_obs", + "description": "Create an intersection between two or more modalities.\n\nThis component removes any observations which are not present in all modalities.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modalities": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Which modalities from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"rna\";\"prot\"]`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/intersect_obs/setup_logger.py b/target/nextflow/filter/intersect_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/intersect_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/remove_modality/.config.vsh.yaml b/target/nextflow/filter/remove_modality/.config.vsh.yaml new file mode 100644 index 00000000..5d65b512 --- /dev/null +++ b/target/nextflow/filter/remove_modality/.config.vsh.yaml @@ -0,0 +1,245 @@ +name: "remove_modality" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Name(s) of the modality to remove\n" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Remove a modality from a .h5mu file\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/remove_modality/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/remove_modality" + executable: "target/nextflow/filter/remove_modality/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/remove_modality/main.nf b/target/nextflow/filter/remove_modality/main.nf new file mode 100644 index 00000000..42700f2f --- /dev/null +++ b/target/nextflow/filter/remove_modality/main.nf @@ -0,0 +1,3900 @@ +// remove_modality main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "remove_modality", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Name(s) of the modality to remove\n", + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Remove a modality from a .h5mu file\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/remove_modality/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/remove_modality", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from mudata import read_h5mu, MuData + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + + +input_mudata = read_h5mu(par["input"]) +new_mods = { + mod_name: mod + for mod_name, mod in input_mudata.mod.items() + if mod_name not in par["modality"] +} + +new_mudata = MuData(new_mods) +new_mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/remove_modality", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/remove_modality/nextflow.config b/target/nextflow/filter/remove_modality/nextflow.config new file mode 100644 index 00000000..a33eca7a --- /dev/null +++ b/target/nextflow/filter/remove_modality/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/remove_modality' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Remove a modality from a .h5mu file\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/remove_modality/nextflow_labels.config b/target/nextflow/filter/remove_modality/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/remove_modality/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/remove_modality/nextflow_schema.json b/target/nextflow/filter/remove_modality/nextflow_schema.json new file mode 100644 index 00000000..5ce623d9 --- /dev/null +++ b/target/nextflow/filter/remove_modality/nextflow_schema.json @@ -0,0 +1,66 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "remove_modality", + "description": "Remove a modality from a .h5mu file\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Name(s) of the modality to remove\n", + "help_text": "Type: `string`, multiple: `True`, required. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/subset_h5mu/.config.vsh.yaml b/target/nextflow/filter/subset_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..973dcd25 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/.config.vsh.yaml @@ -0,0 +1,259 @@ +name: "subset_h5mu" +namespace: "filter" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--number_of_observations" + description: "Number of observations to be selected from the h5mu file." + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a subset of a mudata file by selecting the first number of observations\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/subset_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/subset_h5mu" + executable: "target/nextflow/filter/subset_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/subset_h5mu/main.nf b/target/nextflow/filter/subset_h5mu/main.nf new file mode 100644 index 00000000..1ee01651 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/main.nf @@ -0,0 +1,3923 @@ +// subset_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "subset_h5mu", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--number_of_observations", + "description" : "Number of observations to be selected from the h5mu file.", + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Create a subset of a mudata file by selecting the first number of observations\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/subset_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/subset_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'number_of_observations': $( if [ ! -z ${VIASH_PAR_NUMBER_OF_OBSERVATIONS+x} ]; then echo "int(r'${VIASH_PAR_NUMBER_OF_OBSERVATIONS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +if __name__ == "__main__": + # read data + data = mudata.read(par["input"]) + + # subset data + if par["modality"]: + data.mod[par["modality"]] = data.mod[par["modality"]][ + : par["number_of_observations"] + ] + else: + data = data[: par["number_of_observations"]] + + # write data + data.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/subset_h5mu", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/subset_h5mu/nextflow.config b/target/nextflow/filter/subset_h5mu/nextflow.config new file mode 100644 index 00000000..8aef7543 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/subset_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Create a subset of a mudata file by selecting the first number of observations\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/subset_h5mu/nextflow_labels.config b/target/nextflow/filter/subset_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/subset_h5mu/nextflow_schema.json b/target/nextflow/filter/subset_h5mu/nextflow_schema.json new file mode 100644 index 00000000..3faac976 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/nextflow_schema.json @@ -0,0 +1,69 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "subset_h5mu", + "description": "Create a subset of a mudata file by selecting the first number of observations\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "number_of_observations": { + "type": "integer", + "description": "Number of observations to be selected from the h5mu file.", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/subset_h5mu/setup_logger.py b/target/nextflow/filter/subset_h5mu/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/subset_h5mu/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/filter/subset_obsp/.config.vsh.yaml b/target/nextflow/filter/subset_obsp/.config.vsh.yaml new file mode 100644 index 00000000..fdd78850 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "subset_obsp" +namespace: "filter" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsp_key" + description: "The .obsp field to be filtered." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_key" + description: "The .obs column to filter on." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_value" + description: "The value to filter on in the .obs column." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_key" + description: "The .obsm key to store the subset in." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a subset of an .obsp field in a mudata file, by filtering the\ + \ columns based on the values of an .obs column. The resulting subset is moved to\ + \ an .obsm slot.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/filter/subset_obsp/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/filter/subset_obsp" + executable: "target/nextflow/filter/subset_obsp/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/filter/subset_obsp/compress_h5mu.py b/target/nextflow/filter/subset_obsp/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/filter/subset_obsp/main.nf b/target/nextflow/filter/subset_obsp/main.nf new file mode 100644 index 00000000..b7e9176c --- /dev/null +++ b/target/nextflow/filter/subset_obsp/main.nf @@ -0,0 +1,3998 @@ +// subset_obsp main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "subset_obsp", + "namespace" : "filter", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obsp_key", + "description" : "The .obsp field to be filtered.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_key", + "description" : "The .obs column to filter on.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_value", + "description" : "The value to filter on in the .obs column.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_key", + "description" : "The .obsm key to store the subset in.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/filter/subset_obsp/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/filter/subset_obsp", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obsp_key': $( if [ ! -z ${VIASH_PAR_INPUT_OBSP_KEY+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSP_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_key': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_KEY+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_value': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_VALUE+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_VALUE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obsm_key': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_KEY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBSM_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s, modality", par["input"], par["modality"]) + adata = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info( + f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}" + ) + # .obsp, .obs and .obsm index and .obsp columns all have a dimension length of \\`n_obs\\` + # the index dimensions remain unaltered, but .obsp columns will be subset + obsp = adata.obsp[par["input_obsp_key"]] + idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"] + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + idx = idx.to_numpy(dtype="bool", na_value=False) + obsm_subset = obsp[:, idx] + + logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}") + adata.obsm[par["output_obsm_key"]] = obsm_subset + + logger.info( + "Writing output to %s, modality %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/filter/subset_obsp", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/filter/subset_obsp/nextflow.config b/target/nextflow/filter/subset_obsp/nextflow.config new file mode 100644 index 00000000..76f19a99 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'filter/subset_obsp' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.\n' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/filter/subset_obsp/nextflow_labels.config b/target/nextflow/filter/subset_obsp/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/filter/subset_obsp/nextflow_schema.json b/target/nextflow/filter/subset_obsp/nextflow_schema.json new file mode 100644 index 00000000..68eb4f46 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/nextflow_schema.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "subset_obsp", + "description": "Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_obsp_key": { + "type": "string", + "description": "The .obsp field to be filtered.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "input_obs_key": { + "type": "string", + "description": "The .obs column to filter on.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "input_obs_value": { + "type": "string", + "description": "The value to filter on in the .obs column.", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obsm_key": { + "type": "string", + "description": "The .obsm key to store the subset in.", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/filter/subset_obsp/setup_logger.py b/target/nextflow/filter/subset_obsp/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/filter/subset_obsp/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/genetic_demux/bcftools/.config.vsh.yaml b/target/nextflow/genetic_demux/bcftools/.config.vsh.yaml new file mode 100644 index 00000000..bef0bb7c --- /dev/null +++ b/target/nextflow/genetic_demux/bcftools/.config.vsh.yaml @@ -0,0 +1,244 @@ +name: "bcftools" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--vcf" + description: "VCF files, must have the same sample columns appearing in the same\ + \ order." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--concat" + description: "Concatenate or combine VCFs and sort them." + info: null + direction: "input" + - type: "boolean_true" + name: "--filter" + description: "Filter VCFs." + info: null + direction: "input" + - type: "integer" + name: "--filter_qual" + description: "Filter VCFs with specified QUAL threshold." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "bcftools output directory" + info: null + example: + - "bcftools_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter the variants called by freebayes or cellSNP" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "bzip2" + - "gcc" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "libncursesw5-dev" + - "liblzma-dev" + - "autoconf" + - "automake" + - "perl" + - "libcurl4-gnutls-dev" + - "libssl-dev" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/bcftools/releases/download/1.16/bcftools-1.16.tar.bz2\ + \ -O bcftools.tar.bz2 && tar -xjvf bcftools.tar.bz2 && cd bcftools-1.16 && make\ + \ prefix=/usr/local install" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/bcftools/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/bcftools" + executable: "target/nextflow/genetic_demux/bcftools/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/bcftools/main.nf b/target/nextflow/genetic_demux/bcftools/main.nf new file mode 100644 index 00000000..3d6b70d4 --- /dev/null +++ b/target/nextflow/genetic_demux/bcftools/main.nf @@ -0,0 +1,3892 @@ +// bcftools main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bcftools", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--vcf", + "description" : "VCF files, must have the same sample columns appearing in the same order.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--concat", + "description" : "Concatenate or combine VCFs and sort them.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--filter", + "description" : "Filter VCFs.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--filter_qual", + "description" : "Filter VCFs with specified QUAL threshold.", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "bcftools output directory", + "example" : [ + "bcftools_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Filter the variants called by freebayes or cellSNP", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "wget", + "bzip2", + "gcc", + "make", + "libbz2-dev", + "zlib1g-dev", + "libncurses5-dev", + "libncursesw5-dev", + "liblzma-dev", + "autoconf", + "automake", + "perl", + "libcurl4-gnutls-dev", + "libssl-dev" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "wget https://github.com/samtools/bcftools/releases/download/1.16/bcftools-1.16.tar.bz2 -O bcftools.tar.bz2 && tar -xjvf bcftools.tar.bz2 && cd bcftools-1.16 && make prefix=/usr/local install" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/bcftools/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/bcftools", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_CONCAT+x} ]; then echo "${VIASH_PAR_CONCAT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_concat='&'#" ; else echo "# par_concat="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTER+x} ]; then echo "${VIASH_PAR_FILTER}" | sed "s#'#'\\"'\\"'#g;s#.*#par_filter='&'#" ; else echo "# par_filter="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTER_QUAL+x} ]; then echo "${VIASH_PAR_FILTER_QUAL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_filter_qual='&'#" ; else echo "# par_filter_qual="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +if [ ! -d "\\$par_output" ]; then + mkdir -p \\$par_output +fi + +IFS=";" read -a vcf_list <<< \\$par_vcf + + +if [ "\\$par_concat" = true ] && [ "\\$par_filter" = true ] ; then + bcftools concat -o "\\$par_output/concated_chroms.vcf" \\${vcf_list[@]} + bcftools sort "\\$par_output/concated_chroms.vcf" -o "\\$par_output/sorted_concated_chroms.vcf" + bcftools filter -i "QUAL>\\$par_filter_qual" "\\$par_output/sorted_concated_chroms.vcf" -o "\\$par_output/filtered_sorted_concated_chroms.vcf" + +elif [ "\\$par_filter" = true ] ; then + bcftools filter -i "QUAL>\\$par_filter_qual" \\${vcf_list[@]} -o "\\$par_output/filtered.vcf" + +else + bcftools concat -o "\\$par_output/concated_chroms.vcf" \\${vcf_list[@]} + bcftools sort "\\$par_output/concated_chroms.vcf" -o "\\$par_output/sorted_concated_chroms.vcf" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/bcftools", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/bcftools/nextflow.config b/target/nextflow/genetic_demux/bcftools/nextflow.config new file mode 100644 index 00000000..f454cb46 --- /dev/null +++ b/target/nextflow/genetic_demux/bcftools/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/bcftools' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Filter the variants called by freebayes or cellSNP' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/bcftools/nextflow_labels.config b/target/nextflow/genetic_demux/bcftools/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/bcftools/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/bcftools/nextflow_schema.json b/target/nextflow/genetic_demux/bcftools/nextflow_schema.json new file mode 100644 index 00000000..73abe799 --- /dev/null +++ b/target/nextflow/genetic_demux/bcftools/nextflow_schema.json @@ -0,0 +1,70 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bcftools", + "description": "Filter the variants called by freebayes or cellSNP", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "vcf": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "VCF files, must have the same sample columns appearing in the same order.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "concat": { + "type": "boolean", + "description": "Concatenate or combine VCFs and sort them.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "filter": { + "type": "boolean", + "description": "Filter VCFs.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "filter_qual": { + "type": "integer", + "description": "Filter VCFs with specified QUAL threshold.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + }, + "output": { + "type": "string", + "format": "path", + "description": "bcftools output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"bcftools_out\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/cellsnp/.config.vsh.yaml b/target/nextflow/genetic_demux/cellsnp/.config.vsh.yaml new file mode 100644 index 00000000..5a6e0c50 --- /dev/null +++ b/target/nextflow/genetic_demux/cellsnp/.config.vsh.yaml @@ -0,0 +1,418 @@ +name: "cellsnp" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam_file" + description: "Indexed sam/bam file(s), comma separated multiple samples. Mode\ + \ 1a & 2a: one sam/bam file with single cell. Mode 1b & 2b: one or multiple\ + \ bulk sam/bam files, no barcodes needed, but sample ids and regionsVCF." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sam_index_file" + description: "Input SAM/BAM Index file, problem with samFileList." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sam_fileList" + description: "A list file containing bam files, each per line, for Mode 1b & 2b." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--regions_vcf" + description: "A vcf file listing all candidate SNPs, for fetch each variants.\ + \ If None, pileup the genome. Needed for bulk samples." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targets_vcf" + description: "Similar as --regions_vcf, but the next position is accessed by streaming\ + \ rather than indexing/jumping (like -T in samtools/bcftools mpileup)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode_file" + description: "A plain file listing all effective cell barcode." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_list" + description: "A list file containing sample IDs, each per line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sample_ids" + description: "Comma separated sample ids." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--genotype" + description: "If use, do genotyping in addition to counting." + info: null + direction: "input" + - type: "boolean_true" + name: "--gzip" + description: "If use, the output files will be zipped into BGZF format." + info: null + direction: "input" + - type: "boolean_true" + name: "--print_skip_snps" + description: "If use, the SNPs skipped when loading VCF will be printed." + info: null + direction: "input" + - type: "string" + name: "--chrom" + description: "The chromosomes to use in integer format 1-22, comma separated" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_tag" + description: "Tag for cell barcodes, turn off with None." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--umi_tag" + description: "Tag for UMI: UR, Auto, None. For Auto mode, use UR if barcodes is\ + \ inputted, otherwise use None. None mode means no UMI but read counts." + info: null + default: + - "Auto" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_count" + description: "Minimum aggragated count." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_maf" + description: "Minimum minor allele frequency." + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--doublet_gl" + description: "If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5." + info: null + direction: "input" + - type: "string" + name: "--incl_flag" + description: "Required flags: skip reads with all mask bits unset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--excl_flag" + description: "Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL\ + \ (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--count_orphan" + description: "If use, do not skip anomalous read pairs." + info: null + direction: "input" + - type: "integer" + name: "--min_mapq" + description: "Minimum MAPQ for read filtering." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_len" + description: "Minimum mapped length for read filtering." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "--outDir" + description: "Output directory for VCF and sparse matrices." + info: null + example: + - "cellsnp_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "cellSNP aims to pileup the expressed alleles in single-cell or bulk\ + \ RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell\ + \ RNA-seq data, particularly with vireo." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:latest" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "gcc" + - "zlib1g" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "liblzma-dev" + - "autoconf" + - "automake" + - "perl" + - "libcurl4-gnutls-dev" + - "libssl-dev" + - "git" + - "bzip2" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2\ + \ -O htslib.tar.bz2 && tar -xjvf htslib.tar.bz2 && cd htslib-1.16 && make &&\ + \ make install" + - type: "docker" + run: + - "git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite\ + \ && autoreconf -iv && ./configure --with-htslib=/htslib-1.16 && make && make\ + \ install" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/cellsnp/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/cellsnp" + executable: "target/nextflow/genetic_demux/cellsnp/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/cellsnp/main.nf b/target/nextflow/genetic_demux/cellsnp/main.nf new file mode 100644 index 00000000..8a860ffb --- /dev/null +++ b/target/nextflow/genetic_demux/cellsnp/main.nf @@ -0,0 +1,4120 @@ +// cellsnp main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellsnp", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--sam_file", + "description" : "Indexed sam/bam file(s), comma separated multiple samples. Mode 1a & 2a: one sam/bam file with single cell. Mode 1b & 2b: one or multiple bulk sam/bam files, no barcodes needed, but sample ids and regionsVCF.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sam_index_file", + "description" : "Input SAM/BAM Index file, problem with samFileList.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sam_fileList", + "description" : "A list file containing bam files, each per line, for Mode 1b & 2b.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--regions_vcf", + "description" : "A vcf file listing all candidate SNPs, for fetch each variants. If None, pileup the genome. Needed for bulk samples.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--targets_vcf", + "description" : "Similar as --regions_vcf, but the next position is accessed by streaming rather than indexing/jumping (like -T in samtools/bcftools mpileup).", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcode_file", + "description" : "A plain file listing all effective cell barcode.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_list", + "description" : "A list file containing sample IDs, each per line.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_ids", + "description" : "Comma separated sample ids.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--genotype", + "description" : "If use, do genotyping in addition to counting.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--gzip", + "description" : "If use, the output files will be zipped into BGZF format.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--print_skip_snps", + "description" : "If use, the SNPs skipped when loading VCF will be printed.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--chrom", + "description" : "The chromosomes to use in integer format 1-22, comma separated", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_tag", + "description" : "Tag for cell barcodes, turn off with None.", + "default" : [ + "CB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--umi_tag", + "description" : "Tag for UMI: UR, Auto, None. For Auto mode, use UR if barcodes is inputted, otherwise use None. None mode means no UMI but read counts.", + "default" : [ + "Auto" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_count", + "description" : "Minimum aggragated count.", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_maf", + "description" : "Minimum minor allele frequency.", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--doublet_gl", + "description" : "If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--incl_flag", + "description" : "Required flags: skip reads with all mask bits unset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--excl_flag", + "description" : "Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--count_orphan", + "description" : "If use, do not skip anomalous read pairs.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--min_mapq", + "description" : "Minimum MAPQ for read filtering.", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_len", + "description" : "Minimum mapped length for read filtering.", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "--outDir" + ], + "description" : "Output directory for VCF and sparse matrices.", + "example" : [ + "cellsnp_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "cellSNP aims to pileup the expressed alleles in single-cell or bulk RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell RNA-seq data, particularly with vireo.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "wget", + "gcc", + "zlib1g", + "make", + "libbz2-dev", + "zlib1g-dev", + "libncurses5-dev", + "liblzma-dev", + "autoconf", + "automake", + "perl", + "libcurl4-gnutls-dev", + "libssl-dev", + "git", + "bzip2" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "wget https://github.com/samtools/htslib/releases/download/1.16/htslib-1.16.tar.bz2 -O htslib.tar.bz2 && tar -xjvf htslib.tar.bz2 && cd htslib-1.16 && make && make install" + ] + }, + { + "type" : "docker", + "run" : [ + "git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite && autoreconf -iv && ./configure --with-htslib=/htslib-1.16 && make && make install" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/cellsnp/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/cellsnp", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_SAM_FILE+x} ]; then echo "${VIASH_PAR_SAM_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sam_file='&'#" ; else echo "# par_sam_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_INDEX_FILE+x} ]; then echo "${VIASH_PAR_SAM_INDEX_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sam_index_file='&'#" ; else echo "# par_sam_index_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_FILELIST+x} ]; then echo "${VIASH_PAR_SAM_FILELIST}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sam_fileList='&'#" ; else echo "# par_sam_fileList="; fi ) +$( if [ ! -z ${VIASH_PAR_REGIONS_VCF+x} ]; then echo "${VIASH_PAR_REGIONS_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_regions_vcf='&'#" ; else echo "# par_regions_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_TARGETS_VCF+x} ]; then echo "${VIASH_PAR_TARGETS_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_targets_vcf='&'#" ; else echo "# par_targets_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE_FILE+x} ]; then echo "${VIASH_PAR_BARCODE_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_barcode_file='&'#" ; else echo "# par_barcode_file="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_LIST+x} ]; then echo "${VIASH_PAR_SAMPLE_LIST}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sample_list='&'#" ; else echo "# par_sample_list="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLE_IDS+x} ]; then echo "${VIASH_PAR_SAMPLE_IDS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sample_ids='&'#" ; else echo "# par_sample_ids="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE+x} ]; then echo "${VIASH_PAR_GENOTYPE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genotype='&'#" ; else echo "# par_genotype="; fi ) +$( if [ ! -z ${VIASH_PAR_GZIP+x} ]; then echo "${VIASH_PAR_GZIP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_gzip='&'#" ; else echo "# par_gzip="; fi ) +$( if [ ! -z ${VIASH_PAR_PRINT_SKIP_SNPS+x} ]; then echo "${VIASH_PAR_PRINT_SKIP_SNPS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_print_skip_snps='&'#" ; else echo "# par_print_skip_snps="; fi ) +$( if [ ! -z ${VIASH_PAR_CHROM+x} ]; then echo "${VIASH_PAR_CHROM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_chrom='&'#" ; else echo "# par_chrom="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_TAG+x} ]; then echo "${VIASH_PAR_CELL_TAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cell_tag='&'#" ; else echo "# par_cell_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_UMI_TAG+x} ]; then echo "${VIASH_PAR_UMI_TAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_umi_tag='&'#" ; else echo "# par_umi_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_COUNT+x} ]; then echo "${VIASH_PAR_MIN_COUNT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_count='&'#" ; else echo "# par_min_count="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAF+x} ]; then echo "${VIASH_PAR_MIN_MAF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_maf='&'#" ; else echo "# par_min_maf="; fi ) +$( if [ ! -z ${VIASH_PAR_DOUBLET_GL+x} ]; then echo "${VIASH_PAR_DOUBLET_GL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_doublet_gl='&'#" ; else echo "# par_doublet_gl="; fi ) +$( if [ ! -z ${VIASH_PAR_INCL_FLAG+x} ]; then echo "${VIASH_PAR_INCL_FLAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_incl_flag='&'#" ; else echo "# par_incl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo "${VIASH_PAR_EXCL_FLAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_excl_flag='&'#" ; else echo "# par_excl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_COUNT_ORPHAN+x} ]; then echo "${VIASH_PAR_COUNT_ORPHAN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_count_orphan='&'#" ; else echo "# par_count_orphan="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAPQ+x} ]; then echo "${VIASH_PAR_MIN_MAPQ}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_mapq='&'#" ; else echo "# par_min_mapq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_LEN+x} ]; then echo "${VIASH_PAR_MIN_LEN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_len='&'#" ; else echo "# par_min_len="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\\$par_genotype" == "false" ]] && unset par_genotype +[[ "\\$par_gzip" == "false" ]] && unset par_gzip +[[ "\\$par_print_skip_snps" == "false" ]] && unset par_print_skip_snps +[[ "\\$par_doublet_gl" == "false" ]] && unset par_doublet_gl +[[ "\\$par_count_orphan" == "false" ]] && unset par_count_orphan + +cellsnp-lite \\\\ + \\${meta_cpus:+--nproc \\$meta_cpus} \\\\ + --cellTAG \\$par_cell_tag \\\\ + --UMItag \\$par_umi_tag \\\\ + --minCOUNT \\$par_min_count \\\\ + --minMAF \\$par_min_maf \\\\ + --minLEN \\$par_min_len \\\\ + --minMAPQ \\$par_min_mapq \\\\ + --outDir \\$par_output \\\\ + \\${par_sam_file:+--samFile \\$par_sam_file} \\\\ + \\${par_sam_fileList:+--samFileList \\$par_sam_fileList} \\\\ + \\${par_regions_vcf:+--regionsVCF \\$par_regions_vcf} \\\\ + \\${par_targets_vcf:+--targetsVCF \\$par_targets_vcf} \\\\ + \\${par_barcode_file:+--barcodeFile \\$par_barcode_file} \\\\ + \\${par_sample_list:+--sampleList \\$par_sample_list} \\\\ + \\${par_sample_ids:+--sampleIDs \\$par_sample_ids} \\\\ + \\${par_genotype:+--genotype} \\\\ + \\${par_gzip:+--gzip} \\\\ + \\${par_print_skip_snps:+--printSkipSNPs} \\\\ + \\${par_chrom:+--chrom \\$par_chrom} \\\\ + \\${par_doublet_gl:+--doubletGL} \\\\ + \\${par_incl_flag:+--inclFLAG \\$par_incl_flag} \\\\ + \\${par_excl_flag:+--exclFLAG \\$par_excl_flag} \\\\ + \\${par_count_orphan:+--countORPHAN} +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/cellsnp", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/cellsnp/nextflow.config b/target/nextflow/genetic_demux/cellsnp/nextflow.config new file mode 100644 index 00000000..85a51804 --- /dev/null +++ b/target/nextflow/genetic_demux/cellsnp/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/cellsnp' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'cellSNP aims to pileup the expressed alleles in single-cell or bulk RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell RNA-seq data, particularly with vireo.' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/cellsnp/nextflow_labels.config b/target/nextflow/genetic_demux/cellsnp/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/cellsnp/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/cellsnp/nextflow_schema.json b/target/nextflow/genetic_demux/cellsnp/nextflow_schema.json new file mode 100644 index 00000000..eb4a3257 --- /dev/null +++ b/target/nextflow/genetic_demux/cellsnp/nextflow_schema.json @@ -0,0 +1,180 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellsnp", + "description": "cellSNP aims to pileup the expressed alleles in single-cell or bulk RNA-seq data. It can be directly used for donor deconvolution in multiplexed single-cell RNA-seq data, particularly with vireo.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "sam_file": { + "type": "string", + "format": "path", + "description": "Indexed sam/bam file(s), comma separated multiple samples", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sam_index_file": { + "type": "string", + "format": "path", + "description": "Input SAM/BAM Index file, problem with samFileList.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sam_fileList": { + "type": "string", + "format": "path", + "description": "A list file containing bam files, each per line, for Mode 1b & 2b.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "regions_vcf": { + "type": "string", + "format": "path", + "description": "A vcf file listing all candidate SNPs, for fetch each variants", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "targets_vcf": { + "type": "string", + "format": "path", + "description": "Similar as --regions_vcf, but the next position is accessed by streaming rather than indexing/jumping (like -T in samtools/bcftools mpileup).", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "barcode_file": { + "type": "string", + "format": "path", + "description": "A plain file listing all effective cell barcode.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sample_list": { + "type": "string", + "format": "path", + "description": "A list file containing sample IDs, each per line.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sample_ids": { + "type": "string", + "description": "Comma separated sample ids.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "genotype": { + "type": "boolean", + "description": "If use, do genotyping in addition to counting.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "gzip": { + "type": "boolean", + "description": "If use, the output files will be zipped into BGZF format.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "print_skip_snps": { + "type": "boolean", + "description": "If use, the SNPs skipped when loading VCF will be printed.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "chrom": { + "type": "string", + "description": "The chromosomes to use in integer format 1-22, comma separated", + "help_text": "Type: `string`, multiple: `False`. " + }, + "cell_tag": { + "type": "string", + "description": "Tag for cell barcodes, turn off with None.", + "help_text": "Type: `string`, multiple: `False`, default: `\"CB\"`. ", + "default": "CB" + }, + "umi_tag": { + "type": "string", + "description": "Tag for UMI: UR, Auto, None", + "help_text": "Type: `string`, multiple: `False`, default: `\"Auto\"`. ", + "default": "Auto" + }, + "min_count": { + "type": "integer", + "description": "Minimum aggragated count.", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_maf": { + "type": "number", + "description": "Minimum minor allele frequency.", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "doublet_gl": { + "type": "boolean", + "description": "If use, keep doublet GT likelihood, i.e., GT=0.5 and GT=1.5.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "incl_flag": { + "type": "string", + "description": "Required flags: skip reads with all mask bits unset.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "excl_flag": { + "type": "string", + "description": "Filter flags: skip reads with any mask bits set [UNMAP,SECONDARY,QCFAIL (when use UMI) or UNMAP,SECONDARY,QCFAIL,DUP (otherwise)]", + "help_text": "Type: `string`, multiple: `False`. " + }, + "count_orphan": { + "type": "boolean", + "description": "If use, do not skip anomalous read pairs.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "min_mapq": { + "type": "integer", + "description": "Minimum MAPQ for read filtering.", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_len": { + "type": "integer", + "description": "Minimum mapped length for read filtering.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory for VCF and sparse matrices.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"cellsnp_out\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/demuxlet/.config.vsh.yaml b/target/nextflow/genetic_demux/demuxlet/.config.vsh.yaml new file mode 100644 index 00000000..2578d1a2 --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/.config.vsh.yaml @@ -0,0 +1,504 @@ +name: "demuxlet" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam" + description: "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_group" + description: "Tag representing readgroup or cell barcodes, in the case to partition\ + \ the BAM file into multiple groups. For 10x genomics, use CB." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_umi" + description: "Tag representing UMIs. For 10x genomiucs, use UB." + info: null + default: + - "UB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--plp" + description: "Input pileup format. If the value is a string, it will be considered\ + \ as the path of the plp file. If the value is boolean true, it will perform\ + \ dscpileup." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf" + description: "Input VCF/BCF file, containing the individual genotypes (GT), posterior\ + \ probability (GP), or genotype likelihood (PL)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--field" + description: "FORMAT field to extract the genotype, likelihood, or posterior from" + info: null + default: + - "GT" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error_offset" + description: "Offset of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error_coeff" + description: "Slope of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--r2_info" + description: "INFO field name representing R2 value. Used for representing imputation\ + \ quality." + info: null + default: + - "R2" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mac" + description: "Minimum minor allele frequency." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_call_rate" + description: "Minimum call rate." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alpha" + description: "Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5)" + info: null + default: + - "0.5" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--doublet_prior" + description: "Prior of doublet" + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm" + description: "List of sample IDs to compare to (default: use all)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm_list" + description: "File containing the list of sample IDs to compare." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sam_verbose" + description: "Verbose message frequency for SAM/BAM/CRAM." + info: null + default: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vcf_verbose" + description: "Verbose message frequency for VCF/BCF." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mq" + description: "Minimum mapping quality to consider (lower MQ will be ignored)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_td" + description: "Minimum distance to the tail (lower will be ignored)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--excl_flag" + description: "SAM/BAM FLAGs to be excluded." + info: null + default: + - 3844 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_umi" + description: "Minimum number of UMIs for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "demux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "demuxlet output file prefix" + info: null + example: + - "demuxlet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "demuxlet.patch" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Demuxlet is a software tool to deconvolute sample identity and identify\ + \ multiplets when\nmultiple samples are pooled by barcoded single cell sequencing.\ + \ If external genotyping data\nfor each sample is available (e.g. from SNP arrays),\ + \ demuxlet would be recommended. Be careful\nthat the parameters on the github is\ + \ not in line with the newest help version.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + copy: + - "demuxlet.patch /opt/demuxlet.patch" + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build\ + \ && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle\ + \ /usr/local/bin" + - type: "r" + cran: + - "readr" + - "processx" + - "dplyr" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/demuxlet/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/demuxlet" + executable: "target/nextflow/genetic_demux/demuxlet/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/demuxlet/demuxlet.patch b/target/nextflow/genetic_demux/demuxlet/demuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/demuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/target/nextflow/genetic_demux/demuxlet/main.nf b/target/nextflow/genetic_demux/demuxlet/main.nf new file mode 100644 index 00000000..109eaf73 --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/main.nf @@ -0,0 +1,4301 @@ +// demuxlet main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "demuxlet", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--sam", + "description" : "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_group", + "description" : "Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups. For 10x genomics, use CB.", + "default" : [ + "CB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_umi", + "description" : "Tag representing UMIs. For 10x genomiucs, use UB.", + "default" : [ + "UB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--plp", + "description" : "Input pileup format. If the value is a string, it will be considered as the path of the plp file. If the value is boolean true, it will perform dscpileup.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vcf", + "description" : "Input VCF/BCF file, containing the individual genotypes (GT), posterior probability (GP), or genotype likelihood (PL).", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--field", + "description" : "FORMAT field to extract the genotype, likelihood, or posterior from", + "default" : [ + "GT" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--geno_error_offset", + "description" : "Offset of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--geno_error_coeff", + "description" : "Slope of genotype error rate. [error] = [offset] + [1-offset]*[coeff]*[1-r2]", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--r2_info", + "description" : "INFO field name representing R2 value. Used for representing imputation quality.", + "default" : [ + "R2" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_mac", + "description" : "Minimum minor allele frequency.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_call_rate", + "description" : "Minimum call rate.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alpha", + "description" : "Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5)", + "default" : [ + "0.5" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--doublet_prior", + "description" : "Prior of doublet", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sm", + "description" : "List of sample IDs to compare to (default: use all).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sm_list", + "description" : "File containing the list of sample IDs to compare.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sam_verbose", + "description" : "Verbose message frequency for SAM/BAM/CRAM.", + "default" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vcf_verbose", + "description" : "Verbose message frequency for VCF/BCF.", + "default" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--cap_bq", + "description" : "Maximum base quality (higher BQ will be capped).", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_bq", + "description" : "Minimum base quality to consider (lower BQ will be skipped).", + "default" : [ + 13 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_mq", + "description" : "Minimum mapping quality to consider (lower MQ will be ignored).", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_td", + "description" : "Minimum distance to the tail (lower will be ignored).", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--excl_flag", + "description" : "SAM/BAM FLAGs to be excluded.", + "default" : [ + 3844 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--group_list", + "description" : "List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_total", + "description" : "Minimum number of total reads for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_snp", + "description" : "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_umi", + "description" : "Minimum number of UMIs for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "demux" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--out", + "description" : "demuxlet output file prefix", + "example" : [ + "demuxlet" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "demuxlet.patch" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Demuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping data\nfor each sample is available (e.g. from SNP arrays), demuxlet would be recommended. Be careful\nthat the parameters on the github is not in line with the newest help version.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "copy" : [ + "demuxlet.patch /opt/demuxlet.patch" + ] + }, + { + "type" : "apt", + "packages" : [ + "autoconf", + "wget", + "git", + "build-essential", + "libcurl4-openssl-dev", + "cmake", + "libbz2-dev", + "libssl-dev", + "liblzma-dev", + "zlib1g-dev", + "r-base" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install" + ] + }, + { + "type" : "docker", + "run" : [ + "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin" + ] + }, + { + "type" : "r", + "cran" : [ + "readr", + "processx", + "dplyr" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/demuxlet/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/demuxlet", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "sam" = $( if [ ! -z ${VIASH_PAR_SAM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SAM" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "tag_group" = $( if [ ! -z ${VIASH_PAR_TAG_GROUP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_TAG_GROUP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "tag_umi" = $( if [ ! -z ${VIASH_PAR_TAG_UMI+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_TAG_UMI" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "plp" = $( if [ ! -z ${VIASH_PAR_PLP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PLP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "vcf" = $( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_VCF" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "field" = $( if [ ! -z ${VIASH_PAR_FIELD+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_FIELD" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "geno_error_offset" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR_OFFSET+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR_OFFSET" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "geno_error_coeff" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR_COEFF+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR_COEFF" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "r2_info" = $( if [ ! -z ${VIASH_PAR_R2_INFO+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_R2_INFO" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_mac" = $( if [ ! -z ${VIASH_PAR_MIN_MAC+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_MAC" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_call_rate" = $( if [ ! -z ${VIASH_PAR_MIN_CALL_RATE+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_MIN_CALL_RATE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "alpha" = $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ALPHA" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "doublet_prior" = $( if [ ! -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_DOUBLET_PRIOR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "sm" = $( if [ ! -z ${VIASH_PAR_SM+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SM" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "sm_list" = $( if [ ! -z ${VIASH_PAR_SM_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_SM_LIST" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "sam_verbose" = $( if [ ! -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_SAM_VERBOSE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "vcf_verbose" = $( if [ ! -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_VCF_VERBOSE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "cap_bq" = $( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_CAP_BQ" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_bq" = $( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_BQ" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_mq" = $( if [ ! -z ${VIASH_PAR_MIN_MQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_MQ" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_td" = $( if [ ! -z ${VIASH_PAR_MIN_TD+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TD" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "excl_flag" = $( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_EXCL_FLAG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "group_list" = $( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_LIST" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_total" = $( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TOTAL" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_snp" = $( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_SNP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_umi" = $( if [ ! -z ${VIASH_PAR_MIN_UMI+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_UMI" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "out" = $( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (!dir.exists(par\\$output)) { + dir.create(par\\$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "demuxlet", + "--out", paste0(par\\$output, "/", par\\$out) +) + +argmap <- c( + "tag_group" = "--tag-group", + "tag_umi" = "--tag-UMI", + "field" = "--field", + "geno_error_offset" = "--geno-error-offset", + "geno_error_coeff" = "--geno-error-coeff", + "r2_info" = "--r2-info", + "min_mac" = "--min-mac", + "min_call_rate" = "--min-callrate", + "alpha" = "--alpha", + "doublet_prior" = "--doublet-prior", + "sm" = "--sm", + "sm_list" = "--sm-list", + "sam_verbose" = "--sam-verbose", + "vcf_verbose" = "--vcf-verbose", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_mq" = "--min-MQ", + "min_td" = "--min-TD", + "excl_flag" = "--excl-flag", + "group_list" = "--group-list", + "min_total" = "--min-total", + "min_snp" = "--min-snp", + "min_umi" = "--min-umi", + "plp" = "--plp", + "vcf" = "--vcf", + "sam" = "--sam", + "sm" = "--sm", + "sm_list" = "--sm-list", + "group_list" = "--group-list" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz\\$status != 0) { + stop("Command failed with status ", zzz\\$status) +} + +out_file <- paste0(par\\$output, "/", par\\$out, ".best") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*),.*", "\\\\\\\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*),.*", "\\\\\\\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +demuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + demuxlet_assign, + paste0(par\\$output, "/cell_annotation.csv") +) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/demuxlet", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/demuxlet/nextflow.config b/target/nextflow/genetic_demux/demuxlet/nextflow.config new file mode 100644 index 00000000..76dfcc60 --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/demuxlet' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Demuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping data\nfor each sample is available (e.g. from SNP arrays), demuxlet would be recommended. Be careful\nthat the parameters on the github is not in line with the newest help version.\n' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/demuxlet/nextflow_labels.config b/target/nextflow/genetic_demux/demuxlet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/demuxlet/nextflow_schema.json b/target/nextflow/genetic_demux/demuxlet/nextflow_schema.json new file mode 100644 index 00000000..5bc4effe --- /dev/null +++ b/target/nextflow/genetic_demux/demuxlet/nextflow_schema.json @@ -0,0 +1,209 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "demuxlet", + "description": "Demuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping data\nfor each sample is available (e.g. from SNP arrays), demuxlet would be recommended. Be careful\nthat the parameters on the github is not in line with the newest help version.\n", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "sam": { + "type": "string", + "format": "path", + "description": "Input SAM/BAM/CRAM file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "tag_group": { + "type": "string", + "description": "Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups", + "help_text": "Type: `string`, multiple: `False`, default: `\"CB\"`. ", + "default": "CB" + }, + "tag_umi": { + "type": "string", + "description": "Tag representing UMIs", + "help_text": "Type: `string`, multiple: `False`, default: `\"UB\"`. ", + "default": "UB" + }, + "plp": { + "type": "string", + "description": "Input pileup format", + "help_text": "Type: `string`, multiple: `False`. " + }, + "vcf": { + "type": "string", + "format": "path", + "description": "Input VCF/BCF file, containing the individual genotypes (GT), posterior probability (GP), or genotype likelihood (PL).", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "field": { + "type": "string", + "description": "FORMAT field to extract the genotype, likelihood, or posterior from", + "help_text": "Type: `string`, multiple: `False`, default: `\"GT\"`. ", + "default": "GT" + }, + "geno_error_offset": { + "type": "number", + "description": "Offset of genotype error rate", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "geno_error_coeff": { + "type": "number", + "description": "Slope of genotype error rate", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "r2_info": { + "type": "string", + "description": "INFO field name representing R2 value", + "help_text": "Type: `string`, multiple: `False`, default: `\"R2\"`. ", + "default": "R2" + }, + "min_mac": { + "type": "integer", + "description": "Minimum minor allele frequency.", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "min_call_rate": { + "type": "number", + "description": "Minimum call rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "alpha": { + "type": "string", + "description": "Grid of alpha to search for (default is 0.1, 0.2, 0.3, 0.4, 0.5)", + "help_text": "Type: `string`, multiple: `False`, default: `\"0.5\"`. ", + "default": "0.5" + }, + "doublet_prior": { + "type": "number", + "description": "Prior of doublet", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "sm": { + "type": "string", + "description": "List of sample IDs to compare to (default: use all).", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sm_list": { + "type": "string", + "description": "File containing the list of sample IDs to compare.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sam_verbose": { + "type": "integer", + "description": "Verbose message frequency for SAM/BAM/CRAM.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000000`. ", + "default": 1000000 + }, + "vcf_verbose": { + "type": "integer", + "description": "Verbose message frequency for VCF/BCF.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000`. ", + "default": 1000 + }, + "cap_bq": { + "type": "integer", + "description": "Maximum base quality (higher BQ will be capped).", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_bq": { + "type": "integer", + "description": "Minimum base quality to consider (lower BQ will be skipped).", + "help_text": "Type: `integer`, multiple: `False`, default: `13`. ", + "default": 13 + }, + "min_mq": { + "type": "integer", + "description": "Minimum mapping quality to consider (lower MQ will be ignored).", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_td": { + "type": "integer", + "description": "Minimum distance to the tail (lower will be ignored).", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "excl_flag": { + "type": "integer", + "description": "SAM/BAM FLAGs to be excluded.", + "help_text": "Type: `integer`, multiple: `False`, default: `3844`. ", + "default": 3844 + }, + "group_list": { + "type": "string", + "description": "List of tag readgroup/cell barcode to consider in this run", + "help_text": "Type: `string`, multiple: `False`. " + }, + "min_total": { + "type": "integer", + "description": "Minimum number of total reads for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_snp": { + "type": "integer", + "description": "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_umi": { + "type": "integer", + "description": "Minimum number of UMIs for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"demux\"`. ", + "default": "$id.$key.output" + }, + "out": { + "type": "string", + "description": "demuxlet output file prefix", + "help_text": "Type: `string`, multiple: `False`, example: `\"demuxlet\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/dsc_pileup/.config.vsh.yaml b/target/nextflow/genetic_demux/dsc_pileup/.config.vsh.yaml new file mode 100644 index 00000000..9cb698f2 --- /dev/null +++ b/target/nextflow/genetic_demux/dsc_pileup/.config.vsh.yaml @@ -0,0 +1,416 @@ +name: "dsc_pileup" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--sam" + description: "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_group" + description: "Tag representing readgroup or cell barcodes, in the case to partition\ + \ the BAM file into multiple groups. For 10x genomics, use CB." + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_umi" + description: "Tag representing UMIs. For 10x genomiucs, use UB." + info: null + default: + - "UB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--exclude_flag" + description: "SAM/BAM FLAGs to be excluded." + info: null + default: + - 1796 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf" + description: "Input VCF/BCF file for dsc-pileup, containing the AC and AN field." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm" + description: "List of sample IDs to compare to (default: use all)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sm_list" + description: "File containing the list of sample IDs to compare." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sam_verbose" + description: "Verbose message frequency for SAM/BAM/CRAM." + info: null + default: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vcf_verbose" + description: "Verbose message frequency for VCF/BCF." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_umi" + description: "Do not generate [prefix].umi.gz file, which stores the regions covered\ + \ by each barcode/UMI pair." + info: null + direction: "input" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 40 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_mq" + description: "Minimum mapping quality to consider (lower MQ will be ignored)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_td" + description: "Minimum distance to the tail (lower will be ignored)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--excl_flag" + description: "SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering\ + \ Options." + info: null + default: + - 3844 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_uniq" + description: "Minimum number of unique reads (determined by UMI/SNP pair) for\ + \ a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "demux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "dsc-pileup output file prefix" + info: null + example: + - "demuxlet_dsc" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Dsc-pileup is a software tool to pileup reads and corresponding base\ + \ quality \nfor each overlapping SNPs and each barcode. By using pileup files,\n\ + it would allow us to run demuxlet/freemuxlet pretty fast multiple times\nwithout\ + \ going over the BAM file again.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:20.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make &&\ + \ cp /tmp/popscle/bin/popscle /usr/local/bin" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/dsc_pileup/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/dsc_pileup" + executable: "target/nextflow/genetic_demux/dsc_pileup/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/dsc_pileup/main.nf b/target/nextflow/genetic_demux/dsc_pileup/main.nf new file mode 100644 index 00000000..41d46170 --- /dev/null +++ b/target/nextflow/genetic_demux/dsc_pileup/main.nf @@ -0,0 +1,4119 @@ +// dsc_pileup main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "dsc_pileup", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--sam", + "description" : "Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_group", + "description" : "Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups. For 10x genomics, use CB.", + "default" : [ + "CB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_umi", + "description" : "Tag representing UMIs. For 10x genomiucs, use UB.", + "default" : [ + "UB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--exclude_flag", + "description" : "SAM/BAM FLAGs to be excluded.", + "default" : [ + 1796 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vcf", + "description" : "Input VCF/BCF file for dsc-pileup, containing the AC and AN field.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sm", + "description" : "List of sample IDs to compare to (default: use all).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sm_list", + "description" : "File containing the list of sample IDs to compare.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sam_verbose", + "description" : "Verbose message frequency for SAM/BAM/CRAM.", + "default" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vcf_verbose", + "description" : "Verbose message frequency for VCF/BCF.", + "default" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--skip_umi", + "description" : "Do not generate [prefix].umi.gz file, which stores the regions covered by each barcode/UMI pair.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--cap_bq", + "description" : "Maximum base quality (higher BQ will be capped).", + "default" : [ + 40 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_bq", + "description" : "Minimum base quality to consider (lower BQ will be skipped).", + "default" : [ + 13 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_mq", + "description" : "Minimum mapping quality to consider (lower MQ will be ignored).", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_td", + "description" : "Minimum distance to the tail (lower will be ignored).", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--excl_flag", + "description" : "SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering Options.", + "default" : [ + 3844 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--group_list", + "description" : "List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_total", + "description" : "Minimum number of total reads for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_uniq", + "description" : "Minimum number of unique reads (determined by UMI/SNP pair) for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_snp", + "description" : "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "demux" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--out", + "description" : "dsc-pileup output file prefix", + "example" : [ + "demuxlet_dsc" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Dsc-pileup is a software tool to pileup reads and corresponding base quality \nfor each overlapping SNPs and each barcode. By using pileup files,\nit would allow us to run demuxlet/freemuxlet pretty fast multiple times\nwithout going over the BAM file again.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:20.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "autoconf", + "wget", + "git", + "build-essential", + "libcurl4-openssl-dev", + "cmake", + "libbz2-dev", + "libssl-dev", + "liblzma-dev", + "zlib1g-dev", + "r-base" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install" + ] + }, + { + "type" : "docker", + "run" : [ + "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/dsc_pileup/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/dsc_pileup", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_SAM+x} ]; then echo "${VIASH_PAR_SAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sam='&'#" ; else echo "# par_sam="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG_GROUP+x} ]; then echo "${VIASH_PAR_TAG_GROUP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_tag_group='&'#" ; else echo "# par_tag_group="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG_UMI+x} ]; then echo "${VIASH_PAR_TAG_UMI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_tag_umi='&'#" ; else echo "# par_tag_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_FLAG+x} ]; then echo "${VIASH_PAR_EXCLUDE_FLAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_exclude_flag='&'#" ; else echo "# par_exclude_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_SM+x} ]; then echo "${VIASH_PAR_SM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sm='&'#" ; else echo "# par_sm="; fi ) +$( if [ ! -z ${VIASH_PAR_SM_LIST+x} ]; then echo "${VIASH_PAR_SM_LIST}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sm_list='&'#" ; else echo "# par_sm_list="; fi ) +$( if [ ! -z ${VIASH_PAR_SAM_VERBOSE+x} ]; then echo "${VIASH_PAR_SAM_VERBOSE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sam_verbose='&'#" ; else echo "# par_sam_verbose="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF_VERBOSE+x} ]; then echo "${VIASH_PAR_VCF_VERBOSE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf_verbose='&'#" ; else echo "# par_vcf_verbose="; fi ) +$( if [ ! -z ${VIASH_PAR_SKIP_UMI+x} ]; then echo "${VIASH_PAR_SKIP_UMI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_skip_umi='&'#" ; else echo "# par_skip_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo "${VIASH_PAR_CAP_BQ}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cap_bq='&'#" ; else echo "# par_cap_bq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo "${VIASH_PAR_MIN_BQ}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_bq='&'#" ; else echo "# par_min_bq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MQ+x} ]; then echo "${VIASH_PAR_MIN_MQ}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_mq='&'#" ; else echo "# par_min_mq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_TD+x} ]; then echo "${VIASH_PAR_MIN_TD}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_td='&'#" ; else echo "# par_min_td="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCL_FLAG+x} ]; then echo "${VIASH_PAR_EXCL_FLAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_excl_flag='&'#" ; else echo "# par_excl_flag="; fi ) +$( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo "${VIASH_PAR_GROUP_LIST}" | sed "s#'#'\\"'\\"'#g;s#.*#par_group_list='&'#" ; else echo "# par_group_list="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo "${VIASH_PAR_MIN_TOTAL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_total='&'#" ; else echo "# par_min_total="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_UNIQ+x} ]; then echo "${VIASH_PAR_MIN_UNIQ}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_uniq='&'#" ; else echo "# par_min_uniq="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo "${VIASH_PAR_MIN_SNP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_snp='&'#" ; else echo "# par_min_snp="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo "${VIASH_PAR_OUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_out='&'#" ; else echo "# par_out="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\\$par_skip_umi" == "false" ]] && unset par_skip_umi + +# Create output directory if it doesn't exist +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi + +popscle dsc-pileup \\\\ + --sam \\$par_sam \\\\ + --tag-group \\$par_tag_group \\\\ + --tag-UMI \\$par_tag_umi \\\\ + --exclude-flag \\$par_exclude_flag \\\\ + --sam-verbose \\$par_sam_verbose \\\\ + --vcf \\$par_vcf \\\\ + --vcf-verbose \\$par_vcf_verbose \\\\ + --out "\\$par_output/\\$par_out" \\\\ + --cap-BQ \\$par_cap_bq \\\\ + --min-BQ \\$par_min_bq \\\\ + --min-MQ \\$par_min_mq \\\\ + --min-TD \\$par_min_td \\\\ + --excl-flag \\$par_excl_flag \\\\ + --min-total \\$par_min_total \\\\ + --min-uniq \\$par_min_uniq \\\\ + --min-snp \\$par_min_snp \\\\ + \\${par_sm:+--sm \\$par_sm} \\\\ + \\${par_sm_list:+--sm-list \\$par_sm_list} \\\\ + \\${par_skip_umi:+--skip-umi} \\\\ + \\${par_group_list:+--group-list \\$par_group_list} +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/dsc_pileup", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/dsc_pileup/nextflow.config b/target/nextflow/genetic_demux/dsc_pileup/nextflow.config new file mode 100644 index 00000000..5974a83d --- /dev/null +++ b/target/nextflow/genetic_demux/dsc_pileup/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/dsc_pileup' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Dsc-pileup is a software tool to pileup reads and corresponding base quality \nfor each overlapping SNPs and each barcode. By using pileup files,\nit would allow us to run demuxlet/freemuxlet pretty fast multiple times\nwithout going over the BAM file again.\n' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/dsc_pileup/nextflow_labels.config b/target/nextflow/genetic_demux/dsc_pileup/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/dsc_pileup/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/dsc_pileup/nextflow_schema.json b/target/nextflow/genetic_demux/dsc_pileup/nextflow_schema.json new file mode 100644 index 00000000..55ff08e8 --- /dev/null +++ b/target/nextflow/genetic_demux/dsc_pileup/nextflow_schema.json @@ -0,0 +1,168 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "dsc_pileup", + "description": "Dsc-pileup is a software tool to pileup reads and corresponding base quality \nfor each overlapping SNPs and each barcode. By using pileup files,\nit would allow us to run demuxlet/freemuxlet pretty fast multiple times\nwithout going over the BAM file again.\n", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "sam": { + "type": "string", + "format": "path", + "description": "Input SAM/BAM/CRAM file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "tag_group": { + "type": "string", + "description": "Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups", + "help_text": "Type: `string`, multiple: `False`, default: `\"CB\"`. ", + "default": "CB" + }, + "tag_umi": { + "type": "string", + "description": "Tag representing UMIs", + "help_text": "Type: `string`, multiple: `False`, default: `\"UB\"`. ", + "default": "UB" + }, + "exclude_flag": { + "type": "integer", + "description": "SAM/BAM FLAGs to be excluded.", + "help_text": "Type: `integer`, multiple: `False`, default: `1796`. ", + "default": 1796 + }, + "vcf": { + "type": "string", + "format": "path", + "description": "Input VCF/BCF file for dsc-pileup, containing the AC and AN field.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sm": { + "type": "string", + "description": "List of sample IDs to compare to (default: use all).", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sm_list": { + "type": "string", + "description": "File containing the list of sample IDs to compare.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sam_verbose": { + "type": "integer", + "description": "Verbose message frequency for SAM/BAM/CRAM.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000000`. ", + "default": 1000000 + }, + "vcf_verbose": { + "type": "integer", + "description": "Verbose message frequency for VCF/BCF.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000`. ", + "default": 1000 + }, + "skip_umi": { + "type": "boolean", + "description": "Do not generate [prefix].umi.gz file, which stores the regions covered by each barcode/UMI pair.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "cap_bq": { + "type": "integer", + "description": "Maximum base quality (higher BQ will be capped).", + "help_text": "Type: `integer`, multiple: `False`, default: `40`. ", + "default": 40 + }, + "min_bq": { + "type": "integer", + "description": "Minimum base quality to consider (lower BQ will be skipped).", + "help_text": "Type: `integer`, multiple: `False`, default: `13`. ", + "default": 13 + }, + "min_mq": { + "type": "integer", + "description": "Minimum mapping quality to consider (lower MQ will be ignored).", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_td": { + "type": "integer", + "description": "Minimum distance to the tail (lower will be ignored).", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "excl_flag": { + "type": "integer", + "description": "SAM/BAM FLAGs to be excluded for SNP overlapping Read filtering Options.", + "help_text": "Type: `integer`, multiple: `False`, default: `3844`. ", + "default": 3844 + }, + "group_list": { + "type": "string", + "description": "List of tag readgroup/cell barcode to consider in this run", + "help_text": "Type: `string`, multiple: `False`. " + }, + "min_total": { + "type": "integer", + "description": "Minimum number of total reads for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_uniq": { + "type": "integer", + "description": "Minimum number of unique reads (determined by UMI/SNP pair) for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_snp": { + "type": "integer", + "description": "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"demux\"`. ", + "default": "$id.$key.output" + }, + "out": { + "type": "string", + "description": "dsc-pileup output file prefix", + "help_text": "Type: `string`, multiple: `False`, example: `\"demuxlet_dsc\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/freebayes/.config.vsh.yaml b/target/nextflow/genetic_demux/freebayes/.config.vsh.yaml new file mode 100644 index 00000000..0b272a2c --- /dev/null +++ b/target/nextflow/genetic_demux/freebayes/.config.vsh.yaml @@ -0,0 +1,841 @@ +name: "freebayes" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--bam" + description: "Add FILE to the set of BAM files to be analyzed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_list" + description: "A file containing a list of BAM files to be analyzed." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--stdin" + description: "Read BAM input on stdin." + info: null + direction: "input" + - type: "file" + name: "--fasta_reference" + description: "Use FILE as the reference sequence for analysis. An index file (FILE.fai)\ + \ will be created if none exists. If neither --targets nor --region are specified,\ + \ FreeBayes will analyze every position in this reference." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fasta_reference_index" + description: "Use FILE.fai as the index of reference sequence for analysis." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targets" + description: "Limit analysis to targets listed in the BED-format FILE." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--region" + description: "Limit analysis to the specified region, 0-base coordinates, end_position\ + \ not included (same as BED format)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--samples" + description: "Limit analysis to samples listed (one per line) in the FILE. By\ + \ default FreeBayes will analyze all samples in its input BAM files." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--populations" + description: "Each line of FILE should list a sample and a population which it\ + \ is part of. The population-based bayesian inference model will then be partitioned\ + \ on the basis of the populations." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cnv_map" + description: "Read a copy number map from the BED file FILE, which has either\ + \ a sample-level ploidy or a region-specific format." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--gvcf" + description: "Write gVCF output, which indicates coverage in uncalled regions." + info: null + direction: "input" + - type: "integer" + name: "--gvcf_chunk" + description: "When writing gVCF output emit a record for every NUM bases." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--variant_input" + description: "Use variants reported in VCF file as input to the algorithm. Variants\ + \ in this file will included in the output even if there is not enough support\ + \ in the data to pass input filters." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--only_use_input_alleles" + description: "Only provide variant calls and genotype likelihoods for sites and\ + \ alleles which are provided in the VCF input, and provide output in the VCF\ + \ for all input alleles, not just those which have support in the data." + info: null + direction: "input" + - type: "file" + name: "--haplotype_basis_alleles" + description: "When specified, only variant alleles provided in this input VCF\ + \ will be used for the construction of complex or haplotype alleles." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--report_all_haplotype_alleles" + description: "At sites where genotypes are made over haplotype alleles, provide\ + \ information about all alleles in output, not only those which are called." + info: null + direction: "input" + - type: "boolean_true" + name: "--report_monomorphic" + description: "Report even loci which appear to be monomorphic, and report all\ + \ considered alleles, even those which are not in called genotypes." + info: null + direction: "input" + - type: "double" + name: "--pvar" + description: "Report sites if the probability that there is a polymorphism at\ + \ the site is greater than N. Note that post-filtering is generally recommended\ + \ over the use of this parameter." + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--strict_vcf" + description: "Generate strict VCF format (FORMAT/GQ will be an int)." + info: null + direction: "input" + - type: "double" + name: "--theta" + description: "The expected mutation rate or pairwise nucleotide diversity among\ + \ the population under analysis. This serves as the single parameter to the\ + \ Ewens Sampling Formula prior model." + info: null + default: + - 0.001 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ploidy" + description: "Sets the default ploidy for the analysis to N." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--pooled_discrete" + description: "Assume that samples result from pooled sequencing. Model pooled\ + \ samples using discrete genotypes across pools." + info: null + direction: "input" + - type: "boolean_true" + name: "--pooled_continuous" + description: "Output all alleles which pass input filters, regardles of genotyping\ + \ outcome or model." + info: null + direction: "input" + - type: "boolean_true" + name: "--use_reference_allele" + description: "This flag includes the reference allele in the analysis as if it\ + \ is another sample from the same population." + info: null + direction: "input" + - type: "string" + name: "--reference_quality" + description: "Assign mapping quality of MQ to the reference allele at each site\ + \ and base quality of BQ." + info: null + default: + - "100,60" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--throw_away_snp_obs" + description: "Ignore SNP alleles." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_mnps_obs" + description: "Ignore multi-nuceotide polymorphisms, MNPs. MNPs are excluded as\ + \ default." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_indel_obs" + description: "Ignore insertion and deletion alleles. Indels are excluded as default." + info: null + direction: "input" + - type: "boolean_false" + name: "--throw_away_complex_obs" + description: "Ignore complex events (composites of other classes). Complex are\ + \ excluded as default" + info: null + direction: "input" + - type: "integer" + name: "--use_best_n_alleles" + description: "Evaluate only the best N SNP alleles, ranked by sum of supporting\ + \ quality scores." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_complex_gap" + description: "Allow haplotype calls with contiguous embedded matches of up to\ + \ this length." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_repeat_size" + description: "When assembling observations across repeats, require the total repeat\ + \ length at least this many bp." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_repeat_entropy" + description: "To detect interrupted repeats, build across sequence until it has\ + \ entropy > N bits per bp. Set to 0 to turn off." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_partial_observations" + description: "Exclude observations which do not fully span the dynamically-determined\ + \ detection window. (default, use all observations, dividing partial support\ + \ across matching haplotypes when generating haplotypes.)" + info: null + direction: "input" + - type: "boolean_true" + name: "--dont_left_align_indels" + description: "Turn off left-alignment of indels, which is enabled by default." + info: null + direction: "input" + - type: "boolean_true" + name: "--use_duplicate_reads" + description: "Include duplicate-marked alignments in the analysis. default: exclude\ + \ duplicates marked as such in alignments" + info: null + direction: "input" + - type: "integer" + name: "--min_mapping_quality" + description: "Exclude alignments from analysis if they have a mapping quality\ + \ less than Q." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_base_quality" + description: "Exclude alleles from analysis if their supporting base quality is\ + \ less than Q. Default value is changed according to the instruction of scSplit." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_supporting_allele_qsum" + description: "Consider any allele in which the sum of qualities of supporting\ + \ observations is at least Q." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_supporting_mapping_qsum" + description: "Consider any allele in which and the sum of mapping qualities of\ + \ supporting reads is at least." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--mismatch_base_quality_threshold" + description: "Count mismatches toward --read-mismatch-limit if the base quality\ + \ of the mismatch is >= Q." + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--read_max_mismatch_fraction" + description: "Exclude reads with more than N mismatches where each mismatch has\ + \ base quality >= mismatch-base-quality-threshold." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_mismatch_limit" + description: "Exclude reads with more than N [0,1] fraction of mismatches where\ + \ each mismatch has base quality >= mismatch-base-quality-threshold." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_snp_limit" + description: "Exclude reads with more than N base mismatches, ignoring gaps with\ + \ quality >= mismatch-base-quality-threshold." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--read_indel_limit" + description: "Exclude reads with more than N separate gaps." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--standard_filters" + description: "Use stringent input base and mapping quality filters, equivalent\ + \ to -m 30 -q 20 -R 0 -S 0" + info: null + direction: "input" + - type: "double" + name: "--min_alternate_fraction" + description: "Require at least this fraction of observations supporting an alternate\ + \ allele within a single individual in order to evaluate the position." + info: null + default: + - 0.05 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_count" + description: "Require at least this count of observations supporting an alternate\ + \ allele within a single individual in order to evaluate the position." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_qsum" + description: "Require at least this sum of quality of observations supporting\ + \ an alternate allele within a single individual in order to evaluate the position." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alternate_total" + description: "Require at least this count of observations supporting an alternate\ + \ allele within the total population in order to use the allele in analysis." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_coverage" + description: "Require at least this coverage to process a site." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_coverage" + description: "Do not process sites with greater than this coverage." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_population_priors" + description: "Equivalent to --pooled-discrete --hwe-priors-off and removal of\ + \ Ewens Sampling Formula component of priors." + info: null + direction: "input" + - type: "boolean_true" + name: "--hwe_priors_off" + description: "Disable estimation of the probability of the combination arising\ + \ under HWE given the allele frequency as estimated by observation frequency." + info: null + direction: "input" + - type: "boolean_true" + name: "--binomial_obs_priors_off" + description: "Disable incorporation of prior expectations about observations.\ + \ Uses read placement probability, strand balance probability, and read position\ + \ probability." + info: null + direction: "input" + - type: "boolean_true" + name: "--allele_balance_priors_off" + description: "Disable use of aggregate probability of observation balance between\ + \ alleles as a component of the priors." + info: null + direction: "input" + - type: "file" + name: "--observation_bias" + description: "Read length-dependent allele observation biases from FILE. The format\ + \ is [length] [alignment efficiency relative to reference] where the efficiency\ + \ is 1 if there is no relative observation bias." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--base_quality_cap" + description: "Limit estimated observation quality by capping base quality at Q." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--prob_contamination" + description: "An estimate of contamination to use for all samples." + info: null + default: + - 1.0E-8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--legacy_gls" + description: "Use legacy (polybayes equivalent) genotype likelihood calculations" + info: null + direction: "input" + - type: "file" + name: "--contamination_estimates" + description: "A file containing per-sample estimates of contamination, such as\ + \ those generated by VerifyBamID." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--report_genotype_likelihood_max" + description: "Report genotypes using the maximum-likelihood estimate provided\ + \ from genotype likelihoods." + info: null + direction: "input" + - type: "integer" + name: "--genotyping_max_iterations" + description: "Iterate no more than N times during genotyping step." + info: null + default: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--genotyping_max_banddepth" + description: "Integrate no deeper than the Nth best genotype by likelihood when\ + \ genotyping." + info: null + default: + - 6 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--posterior_integration_limits" + description: "Integrate all genotype combinations in our posterior space which\ + \ include no more than N samples with their Mth best data likelihood." + info: null + default: + - "1,3" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_unobserved_genotypes" + description: "Skip sample genotypings for which the sample has no supporting reads." + info: null + direction: "input" + - type: "integer" + name: "--genotype_variant_threshold" + description: "Limit posterior integration to samples where the second-best genotype\ + \ likelihood is no more than log(N) from the highest genotype likelihood for\ + \ the sample." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--use_mapping_quality" + description: "Use mapping quality of alleles when calculating data likelihoods." + info: null + direction: "input" + - type: "boolean_true" + name: "--harmonic_indel_quality" + description: "Use a weighted sum of base qualities around an indel, scaled by\ + \ the distance from the indel. By default use a minimum BQ in flanking sequence." + info: null + direction: "input" + - type: "double" + name: "--read_dependence_factor" + description: "Incorporate non-independence of reads by scaling successive observations\ + \ by this factor during data likelihood calculations." + info: null + default: + - 0.9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--genotype_qualities" + description: "Calculate the marginal probability of genotypes and report as GQ\ + \ in each sample field in the VCF output." + info: null + direction: "input" + - type: "boolean_true" + name: "--debug" + description: "Print debugging output." + info: null + direction: "input" + - type: "boolean_true" + name: "--dd" + description: "Print more verbose debugging output" + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "freebayes_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--vcf" + description: "Output VCF-format results to FILE." + info: null + example: + - "snp.vcf" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Freebayes is a Bayesian genetic variant detector designed to\nfind small\ + \ polymorphisms, specifically SNPs.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "freebayes" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/freebayes/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/freebayes" + executable: "target/nextflow/genetic_demux/freebayes/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/freebayes/main.nf b/target/nextflow/genetic_demux/freebayes/main.nf new file mode 100644 index 00000000..4bb73e27 --- /dev/null +++ b/target/nextflow/genetic_demux/freebayes/main.nf @@ -0,0 +1,4669 @@ +// freebayes main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "freebayes", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--bam", + "description" : "Add FILE to the set of BAM files to be analyzed.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam_list", + "description" : "A file containing a list of BAM files to be analyzed.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--stdin", + "description" : "Read BAM input on stdin.", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--fasta_reference", + "description" : "Use FILE as the reference sequence for analysis. An index file (FILE.fai) will be created if none exists. If neither --targets nor --region are specified, FreeBayes will analyze every position in this reference.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--fasta_reference_index", + "description" : "Use FILE.fai as the index of reference sequence for analysis.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--targets", + "description" : "Limit analysis to targets listed in the BED-format FILE.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--region", + "description" : "Limit analysis to the specified region, 0-base coordinates, end_position not included (same as BED format).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--samples", + "description" : "Limit analysis to samples listed (one per line) in the FILE. By default FreeBayes will analyze all samples in its input BAM files.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--populations", + "description" : "Each line of FILE should list a sample and a population which it is part of. The population-based bayesian inference model will then be partitioned on the basis of the populations.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cnv_map", + "description" : "Read a copy number map from the BED file FILE, which has either a sample-level ploidy or a region-specific format.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--gvcf", + "description" : "Write gVCF output, which indicates coverage in uncalled regions.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--gvcf_chunk", + "description" : "When writing gVCF output emit a record for every NUM bases.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--variant_input", + "description" : "Use variants reported in VCF file as input to the algorithm. Variants in this file will included in the output even if there is not enough support in the data to pass input filters.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--only_use_input_alleles", + "description" : "Only provide variant calls and genotype likelihoods for sites and alleles which are provided in the VCF input, and provide output in the VCF for all input alleles, not just those which have support in the data.", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--haplotype_basis_alleles", + "description" : "When specified, only variant alleles provided in this input VCF will be used for the construction of complex or haplotype alleles.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--report_all_haplotype_alleles", + "description" : "At sites where genotypes are made over haplotype alleles, provide information about all alleles in output, not only those which are called.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--report_monomorphic", + "description" : "Report even loci which appear to be monomorphic, and report all considered alleles, even those which are not in called genotypes.", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--pvar", + "description" : "Report sites if the probability that there is a polymorphism at the site is greater than N. Note that post-filtering is generally recommended over the use of this parameter.", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--strict_vcf", + "description" : "Generate strict VCF format (FORMAT/GQ will be an int).", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--theta", + "description" : "The expected mutation rate or pairwise nucleotide diversity among the population under analysis. This serves as the single parameter to the Ewens Sampling Formula prior model.", + "default" : [ + 0.001 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--ploidy", + "description" : "Sets the default ploidy for the analysis to N.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--pooled_discrete", + "description" : "Assume that samples result from pooled sequencing. Model pooled samples using discrete genotypes across pools.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--pooled_continuous", + "description" : "Output all alleles which pass input filters, regardles of genotyping outcome or model.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--use_reference_allele", + "description" : "This flag includes the reference allele in the analysis as if it is another sample from the same population.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--reference_quality", + "description" : "Assign mapping quality of MQ to the reference allele at each site and base quality of BQ.", + "default" : [ + "100,60" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--throw_away_snp_obs", + "description" : "Ignore SNP alleles.", + "direction" : "input" + }, + { + "type" : "boolean_false", + "name" : "--throw_away_mnps_obs", + "description" : "Ignore multi-nuceotide polymorphisms, MNPs. MNPs are excluded as default.", + "direction" : "input" + }, + { + "type" : "boolean_false", + "name" : "--throw_away_indel_obs", + "description" : "Ignore insertion and deletion alleles. Indels are excluded as default.", + "direction" : "input" + }, + { + "type" : "boolean_false", + "name" : "--throw_away_complex_obs", + "description" : "Ignore complex events (composites of other classes). Complex are excluded as default", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--use_best_n_alleles", + "description" : "Evaluate only the best N SNP alleles, ranked by sum of supporting quality scores.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_complex_gap", + "description" : "Allow haplotype calls with contiguous embedded matches of up to this length.", + "default" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_repeat_size", + "description" : "When assembling observations across repeats, require the total repeat length at least this many bp.", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_repeat_entropy", + "description" : "To detect interrupted repeats, build across sequence until it has entropy > N bits per bp. Set to 0 to turn off.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--no_partial_observations", + "description" : "Exclude observations which do not fully span the dynamically-determined detection window. (default, use all observations, dividing partial support across matching haplotypes when generating haplotypes.)", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--dont_left_align_indels", + "description" : "Turn off left-alignment of indels, which is enabled by default.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--use_duplicate_reads", + "description" : "Include duplicate-marked alignments in the analysis. default: exclude duplicates marked as such in alignments", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--min_mapping_quality", + "description" : "Exclude alignments from analysis if they have a mapping quality less than Q.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_base_quality", + "description" : "Exclude alleles from analysis if their supporting base quality is less than Q. Default value is changed according to the instruction of scSplit.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_supporting_allele_qsum", + "description" : "Consider any allele in which the sum of qualities of supporting observations is at least Q.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_supporting_mapping_qsum", + "description" : "Consider any allele in which and the sum of mapping qualities of supporting reads is at least.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--mismatch_base_quality_threshold", + "description" : "Count mismatches toward --read-mismatch-limit if the base quality of the mismatch is >= Q.", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--read_max_mismatch_fraction", + "description" : "Exclude reads with more than N mismatches where each mismatch has base quality >= mismatch-base-quality-threshold.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--read_mismatch_limit", + "description" : "Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality >= mismatch-base-quality-threshold.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--read_snp_limit", + "description" : "Exclude reads with more than N base mismatches, ignoring gaps with quality >= mismatch-base-quality-threshold.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--read_indel_limit", + "description" : "Exclude reads with more than N separate gaps.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--standard_filters", + "description" : "Use stringent input base and mapping quality filters, equivalent to -m 30 -q 20 -R 0 -S 0", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--min_alternate_fraction", + "description" : "Require at least this fraction of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "default" : [ + 0.05 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_alternate_count", + "description" : "Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_alternate_qsum", + "description" : "Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_alternate_total", + "description" : "Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_coverage", + "description" : "Require at least this coverage to process a site.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_coverage", + "description" : "Do not process sites with greater than this coverage.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--no_population_priors", + "description" : "Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens Sampling Formula component of priors.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--hwe_priors_off", + "description" : "Disable estimation of the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--binomial_obs_priors_off", + "description" : "Disable incorporation of prior expectations about observations. Uses read placement probability, strand balance probability, and read position probability.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--allele_balance_priors_off", + "description" : "Disable use of aggregate probability of observation balance between alleles as a component of the priors.", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--observation_bias", + "description" : "Read length-dependent allele observation biases from FILE. The format is [length] [alignment efficiency relative to reference] where the efficiency is 1 if there is no relative observation bias.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--base_quality_cap", + "description" : "Limit estimated observation quality by capping base quality at Q.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--prob_contamination", + "description" : "An estimate of contamination to use for all samples.", + "default" : [ + 1.0E-8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--legacy_gls", + "description" : "Use legacy (polybayes equivalent) genotype likelihood calculations", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--contamination_estimates", + "description" : "A file containing per-sample estimates of contamination, such as those generated by VerifyBamID.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--report_genotype_likelihood_max", + "description" : "Report genotypes using the maximum-likelihood estimate provided from genotype likelihoods.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--genotyping_max_iterations", + "description" : "Iterate no more than N times during genotyping step.", + "default" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--genotyping_max_banddepth", + "description" : "Integrate no deeper than the Nth best genotype by likelihood when genotyping.", + "default" : [ + 6 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--posterior_integration_limits", + "description" : "Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood.", + "default" : [ + "1,3" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--exclude_unobserved_genotypes", + "description" : "Skip sample genotypings for which the sample has no supporting reads.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--genotype_variant_threshold", + "description" : "Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--use_mapping_quality", + "description" : "Use mapping quality of alleles when calculating data likelihoods.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--harmonic_indel_quality", + "description" : "Use a weighted sum of base qualities around an indel, scaled by the distance from the indel. By default use a minimum BQ in flanking sequence.", + "direction" : "input" + }, + { + "type" : "double", + "name" : "--read_dependence_factor", + "description" : "Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations.", + "default" : [ + 0.9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--genotype_qualities", + "description" : "Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--debug", + "description" : "Print debugging output.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--dd", + "description" : "Print more verbose debugging output", + "direction" : "input" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "freebayes_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--vcf", + "description" : "Output VCF-format results to FILE.", + "example" : [ + "snp.vcf" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Freebayes is a Bayesian genetic variant detector designed to\nfind small polymorphisms, specifically SNPs.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + }, + { + "type" : "file", + "path" : "../../../resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "freebayes" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/freebayes/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/freebayes", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM_LIST+x} ]; then echo "${VIASH_PAR_BAM_LIST}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam_list='&'#" ; else echo "# par_bam_list="; fi ) +$( if [ ! -z ${VIASH_PAR_STDIN+x} ]; then echo "${VIASH_PAR_STDIN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_stdin='&'#" ; else echo "# par_stdin="; fi ) +$( if [ ! -z ${VIASH_PAR_FASTA_REFERENCE+x} ]; then echo "${VIASH_PAR_FASTA_REFERENCE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_fasta_reference='&'#" ; else echo "# par_fasta_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_FASTA_REFERENCE_INDEX+x} ]; then echo "${VIASH_PAR_FASTA_REFERENCE_INDEX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_fasta_reference_index='&'#" ; else echo "# par_fasta_reference_index="; fi ) +$( if [ ! -z ${VIASH_PAR_TARGETS+x} ]; then echo "${VIASH_PAR_TARGETS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_targets='&'#" ; else echo "# par_targets="; fi ) +$( if [ ! -z ${VIASH_PAR_REGION+x} ]; then echo "${VIASH_PAR_REGION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_region='&'#" ; else echo "# par_region="; fi ) +$( if [ ! -z ${VIASH_PAR_SAMPLES+x} ]; then echo "${VIASH_PAR_SAMPLES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_samples='&'#" ; else echo "# par_samples="; fi ) +$( if [ ! -z ${VIASH_PAR_POPULATIONS+x} ]; then echo "${VIASH_PAR_POPULATIONS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_populations='&'#" ; else echo "# par_populations="; fi ) +$( if [ ! -z ${VIASH_PAR_CNV_MAP+x} ]; then echo "${VIASH_PAR_CNV_MAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cnv_map='&'#" ; else echo "# par_cnv_map="; fi ) +$( if [ ! -z ${VIASH_PAR_GVCF+x} ]; then echo "${VIASH_PAR_GVCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_gvcf='&'#" ; else echo "# par_gvcf="; fi ) +$( if [ ! -z ${VIASH_PAR_GVCF_CHUNK+x} ]; then echo "${VIASH_PAR_GVCF_CHUNK}" | sed "s#'#'\\"'\\"'#g;s#.*#par_gvcf_chunk='&'#" ; else echo "# par_gvcf_chunk="; fi ) +$( if [ ! -z ${VIASH_PAR_VARIANT_INPUT+x} ]; then echo "${VIASH_PAR_VARIANT_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_variant_input='&'#" ; else echo "# par_variant_input="; fi ) +$( if [ ! -z ${VIASH_PAR_ONLY_USE_INPUT_ALLELES+x} ]; then echo "${VIASH_PAR_ONLY_USE_INPUT_ALLELES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_only_use_input_alleles='&'#" ; else echo "# par_only_use_input_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_HAPLOTYPE_BASIS_ALLELES+x} ]; then echo "${VIASH_PAR_HAPLOTYPE_BASIS_ALLELES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_haplotype_basis_alleles='&'#" ; else echo "# par_haplotype_basis_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES+x} ]; then echo "${VIASH_PAR_REPORT_ALL_HAPLOTYPE_ALLELES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_report_all_haplotype_alleles='&'#" ; else echo "# par_report_all_haplotype_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_MONOMORPHIC+x} ]; then echo "${VIASH_PAR_REPORT_MONOMORPHIC}" | sed "s#'#'\\"'\\"'#g;s#.*#par_report_monomorphic='&'#" ; else echo "# par_report_monomorphic="; fi ) +$( if [ ! -z ${VIASH_PAR_PVAR+x} ]; then echo "${VIASH_PAR_PVAR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_pvar='&'#" ; else echo "# par_pvar="; fi ) +$( if [ ! -z ${VIASH_PAR_STRICT_VCF+x} ]; then echo "${VIASH_PAR_STRICT_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_strict_vcf='&'#" ; else echo "# par_strict_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_THETA+x} ]; then echo "${VIASH_PAR_THETA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_theta='&'#" ; else echo "# par_theta="; fi ) +$( if [ ! -z ${VIASH_PAR_PLOIDY+x} ]; then echo "${VIASH_PAR_PLOIDY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ploidy='&'#" ; else echo "# par_ploidy="; fi ) +$( if [ ! -z ${VIASH_PAR_POOLED_DISCRETE+x} ]; then echo "${VIASH_PAR_POOLED_DISCRETE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_pooled_discrete='&'#" ; else echo "# par_pooled_discrete="; fi ) +$( if [ ! -z ${VIASH_PAR_POOLED_CONTINUOUS+x} ]; then echo "${VIASH_PAR_POOLED_CONTINUOUS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_pooled_continuous='&'#" ; else echo "# par_pooled_continuous="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_REFERENCE_ALLELE+x} ]; then echo "${VIASH_PAR_USE_REFERENCE_ALLELE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_use_reference_allele='&'#" ; else echo "# par_use_reference_allele="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE_QUALITY+x} ]; then echo "${VIASH_PAR_REFERENCE_QUALITY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reference_quality='&'#" ; else echo "# par_reference_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_SNP_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_SNP_OBS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_throw_away_snp_obs='&'#" ; else echo "# par_throw_away_snp_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_MNPS_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_MNPS_OBS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_throw_away_mnps_obs='&'#" ; else echo "# par_throw_away_mnps_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_INDEL_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_INDEL_OBS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_throw_away_indel_obs='&'#" ; else echo "# par_throw_away_indel_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_THROW_AWAY_COMPLEX_OBS+x} ]; then echo "${VIASH_PAR_THROW_AWAY_COMPLEX_OBS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_throw_away_complex_obs='&'#" ; else echo "# par_throw_away_complex_obs="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_BEST_N_ALLELES+x} ]; then echo "${VIASH_PAR_USE_BEST_N_ALLELES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_use_best_n_alleles='&'#" ; else echo "# par_use_best_n_alleles="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_COMPLEX_GAP+x} ]; then echo "${VIASH_PAR_MAX_COMPLEX_GAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_complex_gap='&'#" ; else echo "# par_max_complex_gap="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REPEAT_SIZE+x} ]; then echo "${VIASH_PAR_MIN_REPEAT_SIZE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_repeat_size='&'#" ; else echo "# par_min_repeat_size="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REPEAT_ENTROPY+x} ]; then echo "${VIASH_PAR_MIN_REPEAT_ENTROPY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_repeat_entropy='&'#" ; else echo "# par_min_repeat_entropy="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_PARTIAL_OBSERVATIONS+x} ]; then echo "${VIASH_PAR_NO_PARTIAL_OBSERVATIONS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_partial_observations='&'#" ; else echo "# par_no_partial_observations="; fi ) +$( if [ ! -z ${VIASH_PAR_DONT_LEFT_ALIGN_INDELS+x} ]; then echo "${VIASH_PAR_DONT_LEFT_ALIGN_INDELS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_dont_left_align_indels='&'#" ; else echo "# par_dont_left_align_indels="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_DUPLICATE_READS+x} ]; then echo "${VIASH_PAR_USE_DUPLICATE_READS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_use_duplicate_reads='&'#" ; else echo "# par_use_duplicate_reads="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_MAPPING_QUALITY+x} ]; then echo "${VIASH_PAR_MIN_MAPPING_QUALITY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_mapping_quality='&'#" ; else echo "# par_min_mapping_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_BASE_QUALITY+x} ]; then echo "${VIASH_PAR_MIN_BASE_QUALITY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_base_quality='&'#" ; else echo "# par_min_base_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM+x} ]; then echo "${VIASH_PAR_MIN_SUPPORTING_ALLELE_QSUM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_supporting_allele_qsum='&'#" ; else echo "# par_min_supporting_allele_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM+x} ]; then echo "${VIASH_PAR_MIN_SUPPORTING_MAPPING_QSUM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_supporting_mapping_qsum='&'#" ; else echo "# par_min_supporting_mapping_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD+x} ]; then echo "${VIASH_PAR_MISMATCH_BASE_QUALITY_THRESHOLD}" | sed "s#'#'\\"'\\"'#g;s#.*#par_mismatch_base_quality_threshold='&'#" ; else echo "# par_mismatch_base_quality_threshold="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_MAX_MISMATCH_FRACTION+x} ]; then echo "${VIASH_PAR_READ_MAX_MISMATCH_FRACTION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_read_max_mismatch_fraction='&'#" ; else echo "# par_read_max_mismatch_fraction="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_MISMATCH_LIMIT+x} ]; then echo "${VIASH_PAR_READ_MISMATCH_LIMIT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_read_mismatch_limit='&'#" ; else echo "# par_read_mismatch_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_SNP_LIMIT+x} ]; then echo "${VIASH_PAR_READ_SNP_LIMIT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_read_snp_limit='&'#" ; else echo "# par_read_snp_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_INDEL_LIMIT+x} ]; then echo "${VIASH_PAR_READ_INDEL_LIMIT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_read_indel_limit='&'#" ; else echo "# par_read_indel_limit="; fi ) +$( if [ ! -z ${VIASH_PAR_STANDARD_FILTERS+x} ]; then echo "${VIASH_PAR_STANDARD_FILTERS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_standard_filters='&'#" ; else echo "# par_standard_filters="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_FRACTION+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_FRACTION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_alternate_fraction='&'#" ; else echo "# par_min_alternate_fraction="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_COUNT+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_COUNT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_alternate_count='&'#" ; else echo "# par_min_alternate_count="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_QSUM+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_QSUM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_alternate_qsum='&'#" ; else echo "# par_min_alternate_qsum="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALTERNATE_TOTAL+x} ]; then echo "${VIASH_PAR_MIN_ALTERNATE_TOTAL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_alternate_total='&'#" ; else echo "# par_min_alternate_total="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_COVERAGE+x} ]; then echo "${VIASH_PAR_MIN_COVERAGE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_coverage='&'#" ; else echo "# par_min_coverage="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_COVERAGE+x} ]; then echo "${VIASH_PAR_MAX_COVERAGE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_coverage='&'#" ; else echo "# par_max_coverage="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_POPULATION_PRIORS+x} ]; then echo "${VIASH_PAR_NO_POPULATION_PRIORS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_population_priors='&'#" ; else echo "# par_no_population_priors="; fi ) +$( if [ ! -z ${VIASH_PAR_HWE_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_HWE_PRIORS_OFF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_hwe_priors_off='&'#" ; else echo "# par_hwe_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_BINOMIAL_OBS_PRIORS_OFF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_binomial_obs_priors_off='&'#" ; else echo "# par_binomial_obs_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF+x} ]; then echo "${VIASH_PAR_ALLELE_BALANCE_PRIORS_OFF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_allele_balance_priors_off='&'#" ; else echo "# par_allele_balance_priors_off="; fi ) +$( if [ ! -z ${VIASH_PAR_OBSERVATION_BIAS+x} ]; then echo "${VIASH_PAR_OBSERVATION_BIAS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_observation_bias='&'#" ; else echo "# par_observation_bias="; fi ) +$( if [ ! -z ${VIASH_PAR_BASE_QUALITY_CAP+x} ]; then echo "${VIASH_PAR_BASE_QUALITY_CAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_base_quality_cap='&'#" ; else echo "# par_base_quality_cap="; fi ) +$( if [ ! -z ${VIASH_PAR_PROB_CONTAMINATION+x} ]; then echo "${VIASH_PAR_PROB_CONTAMINATION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_prob_contamination='&'#" ; else echo "# par_prob_contamination="; fi ) +$( if [ ! -z ${VIASH_PAR_LEGACY_GLS+x} ]; then echo "${VIASH_PAR_LEGACY_GLS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_legacy_gls='&'#" ; else echo "# par_legacy_gls="; fi ) +$( if [ ! -z ${VIASH_PAR_CONTAMINATION_ESTIMATES+x} ]; then echo "${VIASH_PAR_CONTAMINATION_ESTIMATES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_contamination_estimates='&'#" ; else echo "# par_contamination_estimates="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX+x} ]; then echo "${VIASH_PAR_REPORT_GENOTYPE_LIKELIHOOD_MAX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_report_genotype_likelihood_max='&'#" ; else echo "# par_report_genotype_likelihood_max="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPING_MAX_ITERATIONS+x} ]; then echo "${VIASH_PAR_GENOTYPING_MAX_ITERATIONS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genotyping_max_iterations='&'#" ; else echo "# par_genotyping_max_iterations="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPING_MAX_BANDDEPTH+x} ]; then echo "${VIASH_PAR_GENOTYPING_MAX_BANDDEPTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genotyping_max_banddepth='&'#" ; else echo "# par_genotyping_max_banddepth="; fi ) +$( if [ ! -z ${VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS+x} ]; then echo "${VIASH_PAR_POSTERIOR_INTEGRATION_LIMITS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_posterior_integration_limits='&'#" ; else echo "# par_posterior_integration_limits="; fi ) +$( if [ ! -z ${VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES+x} ]; then echo "${VIASH_PAR_EXCLUDE_UNOBSERVED_GENOTYPES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_exclude_unobserved_genotypes='&'#" ; else echo "# par_exclude_unobserved_genotypes="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD+x} ]; then echo "${VIASH_PAR_GENOTYPE_VARIANT_THRESHOLD}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genotype_variant_threshold='&'#" ; else echo "# par_genotype_variant_threshold="; fi ) +$( if [ ! -z ${VIASH_PAR_USE_MAPPING_QUALITY+x} ]; then echo "${VIASH_PAR_USE_MAPPING_QUALITY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_use_mapping_quality='&'#" ; else echo "# par_use_mapping_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_HARMONIC_INDEL_QUALITY+x} ]; then echo "${VIASH_PAR_HARMONIC_INDEL_QUALITY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_harmonic_indel_quality='&'#" ; else echo "# par_harmonic_indel_quality="; fi ) +$( if [ ! -z ${VIASH_PAR_READ_DEPENDENCE_FACTOR+x} ]; then echo "${VIASH_PAR_READ_DEPENDENCE_FACTOR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_read_dependence_factor='&'#" ; else echo "# par_read_dependence_factor="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOTYPE_QUALITIES+x} ]; then echo "${VIASH_PAR_GENOTYPE_QUALITIES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genotype_qualities='&'#" ; else echo "# par_genotype_qualities="; fi ) +$( if [ ! -z ${VIASH_PAR_DEBUG+x} ]; then echo "${VIASH_PAR_DEBUG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_debug='&'#" ; else echo "# par_debug="; fi ) +$( if [ ! -z ${VIASH_PAR_DD+x} ]; then echo "${VIASH_PAR_DD}" | sed "s#'#'\\"'\\"'#g;s#.*#par_dd='&'#" ; else echo "# par_dd="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +set -eo pipefail + +# Unset boolean flags if their values are not 'true' +for flag in par_stdin par_gvcf par_only_use_input_alleles par_report_all_haplotype_alleles par_report_monomorphic par_strict_vcf \\\\ + par_pooled_discrete par_pooled_continuous par_use_reference_allele par_throw_away_snp_obs par_throw_away_indel_obs par_throw_away_mnps_obs par_throw_away_complex_obs \\\\ + par_no_partial_observations par_dont_left_align_indels par_use_duplicate_reads par_standard_filters par_no_population_priors \\\\ + par_hwe_priors_off par_binomial_obs_priors_off par_allele_balance_priors_off par_legacy_gls par_report_genotype_likelihood_max \\\\ + par_exclude_unobserved_genotypes par_use_mapping_quality par_harmonic_indel_quality par_genotype_qualities par_debug par_dd; do + [[ "\\${!flag}" != "true" ]] && unset "\\$flag" +done + +# Create output directory if it doesn't exist +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi + +freebayes \\\\ + --fasta-reference \\$par_fasta_reference \\\\ + --pvar \\$par_pvar \\\\ + --theta \\$par_theta \\\\ + --ploidy \\$par_ploidy \\\\ + --min-repeat-entropy \\$par_min_repeat_entropy \\\\ + --reference-quality \\$par_reference_quality \\\\ + --use-best-n-alleles \\$par_use_best_n_alleles \\\\ + --max-complex-gap \\$par_max_complex_gap \\\\ + --min-repeat-size \\$par_min_repeat_size \\\\ + --min-mapping-quality \\$par_min_mapping_quality \\\\ + --min-base-quality \\$par_min_base_quality \\\\ + --min-supporting-allele-qsum \\$par_min_supporting_allele_qsum \\\\ + --min-supporting-mapping-qsum \\$par_min_supporting_mapping_qsum \\\\ + --mismatch-base-quality-threshold \\$par_mismatch_base_quality_threshold \\\\ + --read-max-mismatch-fraction \\$par_read_max_mismatch_fraction \\\\ + --min-alternate-fraction \\$par_min_alternate_fraction \\\\ + --min-alternate-count \\$par_min_alternate_count \\\\ + --min-alternate-qsum \\$par_min_alternate_qsum \\\\ + --min-alternate-total \\$par_min_alternate_total \\\\ + --min-coverage \\$par_min_coverage \\\\ + --genotyping-max-iterations \\$par_genotyping_max_iterations \\\\ + --genotyping-max-banddepth \\$par_genotyping_max_banddepth \\\\ + --posterior-integration-limits \\$par_posterior_integration_limits \\\\ + --read-dependence-factor \\$par_read_dependence_factor \\\\ + --vcf \\${par_output}/\\${par_vcf} \\\\ + \\${par_bam:+--bam \\$par_bam} \\\\ + \\${par_bam_list:+--bam-list \\$par_bam_list} \\\\ + \\${par_stdin:+--stdin} \\\\ + \\${par_targets:+--targets \\$par_targets} \\\\ + \\${par_region:+--region \\$par_region} \\\\ + \\${par_max_coverage:+--max-coverage \\$par_max_coverage} \\\\ + \\${par_samples:+--samples \\$par_samples} \\\\ + \\${par_populations:+--populations \\$par_populations} \\\\ + \\${par_cnv_map:+--cnv-map \\$par_cnv_map} \\\\ + \\${par_gvcf:+--gvcf} \\\\ + \\${par_gvcf_chunk:+--gvcf-chunk \\$par_gvcf_chunk} \\\\ + \\${par_variant_input:+--variant-input \\$par_variant_input} \\\\ + \\${par_only_use_input_alleles:+--only-use-input-alleles} \\\\ + \\${par_haplotype_basis_alleles:+--haplotype-basis-alleles \\$par_haplotype_basis_alleles} \\\\ + \\${par_report_all_haplotype_alleles:+--report-all-haplotype-alleles} \\\\ + \\${par_report_monomorphic:+--report-monomorphic} \\\\ + \\${par_strict_vcf:+--strict-vcf} \\\\ + \\${par_pooled_discrete:+--pooled-discrete} \\\\ + \\${par_pooled_continuous:+--pooled-continuous} \\\\ + \\${par_use_reference_allele:+--use-reference-allele} \\\\ + \\${par_throw_away_snp_obs:+--throw-away-snp-obs} \\\\ + \\${par_throw_away_indel_obs:+--throw-away-indel-obs} \\\\ + \\${par_throw_away_mnps_obs:+--throw-away-mnps-obs} \\\\ + \\${par_throw_away_complex_obs:+--throw-away-complex-obs} \\\\ + \\${par_no_partial_observations:+--no-partial-observations} \\\\ + \\${par_dont_left_align_indels:+--dont-left-align-indels} \\\\ + \\${par_use_duplicate_reads:+--use-duplicate-reads} \\\\ + \\${par_standard_filters:+--standard-filters} \\\\ + \\${par_no_population_priors:+--no-population-priors} \\\\ + \\${par_hwe_priors_off:+--hwe-priors-off} \\\\ + \\${par_binomial_obs_priors_off:+--binomial-obs-priors-off} \\\\ + \\${par_allele_balance_priors_off:+--allele-balance-priors-off} \\\\ + \\${par_legacy_gls:+--legacy-gls} \\\\ + \\${par_report_genotype_likelihood_max:+--report-genotype-likelihood-max} \\\\ + \\${par_exclude_unobserved_genotypes:+--exclude-unobserved-genotypes} \\\\ + \\${par_use_mapping_quality:+--use-mapping-quality} \\\\ + \\${par_harmonic_indel_quality:+--harmonic-indel-quality} \\\\ + \\${par_genotype_qualities:+--genotype-qualities} \\\\ + \\${par_debug:+--debug} \\\\ + \\${par_dd:+-dd} \\\\ + \\${par_observation_bias:+--observation-bias \\$par_observation_bias} \\\\ + \\${par_read_mismatch_limit:+--read-mismatch-limit \\$par_read_mismatch_limit} \\\\ + \\${par_read_snp_limit:+--read-snp-limit \\$par_read_snp_limit} \\\\ + \\${par_read_indel_limit:+--read-indel-limit \\$par_read_indel_limit} \\\\ + \\${par_base_quality_cap:+--base-quality-cap \\$par_base_quality_cap} \\\\ + \\${par_prob_contamination:+--prob-contamination \\$par_prob_contamination} \\\\ + \\${par_contamination_estimates:+--contamination-estimates \\$par_contamination_estimates} \\\\ + \\${par_genotype_variant_threshold:+--genotype-variant-threshold \\$par_genotype_variant_threshold} +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/freebayes", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/freebayes/nextflow.config b/target/nextflow/genetic_demux/freebayes/nextflow.config new file mode 100644 index 00000000..e554ed62 --- /dev/null +++ b/target/nextflow/genetic_demux/freebayes/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/freebayes' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Freebayes is a Bayesian genetic variant detector designed to\nfind small polymorphisms, specifically SNPs.\n' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/freebayes/nextflow_labels.config b/target/nextflow/genetic_demux/freebayes/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/freebayes/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/freebayes/nextflow_schema.json b/target/nextflow/genetic_demux/freebayes/nextflow_schema.json new file mode 100644 index 00000000..45b540ba --- /dev/null +++ b/target/nextflow/genetic_demux/freebayes/nextflow_schema.json @@ -0,0 +1,487 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "freebayes", + "description": "Freebayes is a Bayesian genetic variant detector designed to\nfind small polymorphisms, specifically SNPs.\n", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "bam": { + "type": "string", + "format": "path", + "description": "Add FILE to the set of BAM files to be analyzed.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "bam_list": { + "type": "string", + "format": "path", + "description": "A file containing a list of BAM files to be analyzed.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "stdin": { + "type": "boolean", + "description": "Read BAM input on stdin.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "fasta_reference": { + "type": "string", + "format": "path", + "description": "Use FILE as the reference sequence for analysis", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "fasta_reference_index": { + "type": "string", + "format": "path", + "description": "Use FILE.fai as the index of reference sequence for analysis.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "targets": { + "type": "string", + "format": "path", + "description": "Limit analysis to targets listed in the BED-format FILE.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "region": { + "type": "string", + "description": "Limit analysis to the specified region, 0-base coordinates, end_position not included (same as BED format).", + "help_text": "Type: `string`, multiple: `False`. " + }, + "samples": { + "type": "string", + "format": "path", + "description": "Limit analysis to samples listed (one per line) in the FILE", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "populations": { + "type": "string", + "format": "path", + "description": "Each line of FILE should list a sample and a population which it is part of", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "cnv_map": { + "type": "string", + "format": "path", + "description": "Read a copy number map from the BED file FILE, which has either a sample-level ploidy or a region-specific format.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "gvcf": { + "type": "boolean", + "description": "Write gVCF output, which indicates coverage in uncalled regions.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "gvcf_chunk": { + "type": "integer", + "description": "When writing gVCF output emit a record for every NUM bases.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "variant_input": { + "type": "string", + "format": "path", + "description": "Use variants reported in VCF file as input to the algorithm", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "only_use_input_alleles": { + "type": "boolean", + "description": "Only provide variant calls and genotype likelihoods for sites and alleles which are provided in the VCF input, and provide output in the VCF for all input alleles, not just those which have support in the data.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "haplotype_basis_alleles": { + "type": "string", + "format": "path", + "description": "When specified, only variant alleles provided in this input VCF will be used for the construction of complex or haplotype alleles.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "report_all_haplotype_alleles": { + "type": "boolean", + "description": "At sites where genotypes are made over haplotype alleles, provide information about all alleles in output, not only those which are called.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "report_monomorphic": { + "type": "boolean", + "description": "Report even loci which appear to be monomorphic, and report all considered alleles, even those which are not in called genotypes.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "pvar": { + "type": "number", + "description": "Report sites if the probability that there is a polymorphism at the site is greater than N", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "strict_vcf": { + "type": "boolean", + "description": "Generate strict VCF format (FORMAT/GQ will be an int).", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "theta": { + "type": "number", + "description": "The expected mutation rate or pairwise nucleotide diversity among the population under analysis", + "help_text": "Type: `double`, multiple: `False`, default: `0.001`. ", + "default": 0.001 + }, + "ploidy": { + "type": "integer", + "description": "Sets the default ploidy for the analysis to N.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "pooled_discrete": { + "type": "boolean", + "description": "Assume that samples result from pooled sequencing", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "pooled_continuous": { + "type": "boolean", + "description": "Output all alleles which pass input filters, regardles of genotyping outcome or model.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "use_reference_allele": { + "type": "boolean", + "description": "This flag includes the reference allele in the analysis as if it is another sample from the same population.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "reference_quality": { + "type": "string", + "description": "Assign mapping quality of MQ to the reference allele at each site and base quality of BQ.", + "help_text": "Type: `string`, multiple: `False`, default: `\"100,60\"`. ", + "default": "100,60" + }, + "throw_away_snp_obs": { + "type": "boolean", + "description": "Ignore SNP alleles.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "throw_away_mnps_obs": { + "type": "boolean", + "description": "Ignore multi-nuceotide polymorphisms, MNPs", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + }, + "throw_away_indel_obs": { + "type": "boolean", + "description": "Ignore insertion and deletion alleles", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + }, + "throw_away_complex_obs": { + "type": "boolean", + "description": "Ignore complex events (composites of other classes)", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + }, + "use_best_n_alleles": { + "type": "integer", + "description": "Evaluate only the best N SNP alleles, ranked by sum of supporting quality scores.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "max_complex_gap": { + "type": "integer", + "description": "Allow haplotype calls with contiguous embedded matches of up to this length.", + "help_text": "Type: `integer`, multiple: `False`, default: `3`. ", + "default": 3 + }, + "min_repeat_size": { + "type": "integer", + "description": "When assembling observations across repeats, require the total repeat length at least this many bp.", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "min_repeat_entropy": { + "type": "integer", + "description": "To detect interrupted repeats, build across sequence until it has entropy > N bits per bp", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "no_partial_observations": { + "type": "boolean", + "description": "Exclude observations which do not fully span the dynamically-determined detection window", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "dont_left_align_indels": { + "type": "boolean", + "description": "Turn off left-alignment of indels, which is enabled by default.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "use_duplicate_reads": { + "type": "boolean", + "description": "Include duplicate-marked alignments in the analysis", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "min_mapping_quality": { + "type": "integer", + "description": "Exclude alignments from analysis if they have a mapping quality less than Q.", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "min_base_quality": { + "type": "integer", + "description": "Exclude alleles from analysis if their supporting base quality is less than Q", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "min_supporting_allele_qsum": { + "type": "integer", + "description": "Consider any allele in which the sum of qualities of supporting observations is at least Q.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_supporting_mapping_qsum": { + "type": "integer", + "description": "Consider any allele in which and the sum of mapping qualities of supporting reads is at least.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "mismatch_base_quality_threshold": { + "type": "integer", + "description": "Count mismatches toward --read-mismatch-limit if the base quality of the mismatch is >= Q.", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "read_max_mismatch_fraction": { + "type": "number", + "description": "Exclude reads with more than N mismatches where each mismatch has base quality >= mismatch-base-quality-threshold.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "read_mismatch_limit": { + "type": "integer", + "description": "Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality >= mismatch-base-quality-threshold.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "read_snp_limit": { + "type": "integer", + "description": "Exclude reads with more than N base mismatches, ignoring gaps with quality >= mismatch-base-quality-threshold.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "read_indel_limit": { + "type": "integer", + "description": "Exclude reads with more than N separate gaps.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "standard_filters": { + "type": "boolean", + "description": "Use stringent input base and mapping quality filters, equivalent to -m 30 -q 20 -R 0 -S 0", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "min_alternate_fraction": { + "type": "number", + "description": "Require at least this fraction of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "help_text": "Type: `double`, multiple: `False`, default: `0.05`. ", + "default": 0.05 + }, + "min_alternate_count": { + "type": "integer", + "description": "Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "min_alternate_qsum": { + "type": "integer", + "description": "Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_alternate_total": { + "type": "integer", + "description": "Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis.", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + }, + "min_coverage": { + "type": "integer", + "description": "Require at least this coverage to process a site.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "max_coverage": { + "type": "integer", + "description": "Do not process sites with greater than this coverage.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "no_population_priors": { + "type": "boolean", + "description": "Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens Sampling Formula component of priors.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "hwe_priors_off": { + "type": "boolean", + "description": "Disable estimation of the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "binomial_obs_priors_off": { + "type": "boolean", + "description": "Disable incorporation of prior expectations about observations", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "allele_balance_priors_off": { + "type": "boolean", + "description": "Disable use of aggregate probability of observation balance between alleles as a component of the priors.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "observation_bias": { + "type": "string", + "format": "path", + "description": "Read length-dependent allele observation biases from FILE", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "base_quality_cap": { + "type": "integer", + "description": "Limit estimated observation quality by capping base quality at Q.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "prob_contamination": { + "type": "number", + "description": "An estimate of contamination to use for all samples.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0E-8`. ", + "default": 1.0E-8 + }, + "legacy_gls": { + "type": "boolean", + "description": "Use legacy (polybayes equivalent) genotype likelihood calculations", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "contamination_estimates": { + "type": "string", + "format": "path", + "description": "A file containing per-sample estimates of contamination, such as those generated by VerifyBamID.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "report_genotype_likelihood_max": { + "type": "boolean", + "description": "Report genotypes using the maximum-likelihood estimate provided from genotype likelihoods.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "genotyping_max_iterations": { + "type": "integer", + "description": "Iterate no more than N times during genotyping step.", + "help_text": "Type: `integer`, multiple: `False`, default: `1000`. ", + "default": 1000 + }, + "genotyping_max_banddepth": { + "type": "integer", + "description": "Integrate no deeper than the Nth best genotype by likelihood when genotyping.", + "help_text": "Type: `integer`, multiple: `False`, default: `6`. ", + "default": 6 + }, + "posterior_integration_limits": { + "type": "string", + "description": "Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood.", + "help_text": "Type: `string`, multiple: `False`, default: `\"1,3\"`. ", + "default": "1,3" + }, + "exclude_unobserved_genotypes": { + "type": "boolean", + "description": "Skip sample genotypings for which the sample has no supporting reads.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "genotype_variant_threshold": { + "type": "integer", + "description": "Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "use_mapping_quality": { + "type": "boolean", + "description": "Use mapping quality of alleles when calculating data likelihoods.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "harmonic_indel_quality": { + "type": "boolean", + "description": "Use a weighted sum of base qualities around an indel, scaled by the distance from the indel", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "read_dependence_factor": { + "type": "number", + "description": "Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations.", + "help_text": "Type: `double`, multiple: `False`, default: `0.9`. ", + "default": 0.9 + }, + "genotype_qualities": { + "type": "boolean", + "description": "Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "debug": { + "type": "boolean", + "description": "Print debugging output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "dd": { + "type": "boolean", + "description": "Print more verbose debugging output", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"freebayes_out\"`. ", + "default": "$id.$key.output" + }, + "vcf": { + "type": "string", + "description": "Output VCF-format results to FILE.", + "help_text": "Type: `string`, multiple: `False`, example: `\"snp.vcf\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/freemuxlet/.config.vsh.yaml b/target/nextflow/genetic_demux/freemuxlet/.config.vsh.yaml new file mode 100644 index 00000000..9a969c6d --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/.config.vsh.yaml @@ -0,0 +1,420 @@ +name: "freemuxlet" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--plp" + description: "Prefix of input files generated by dsc-pileup" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--init_cluster" + description: "Input file containing the initial cluster information." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--nsample" + description: "Number of samples multiplexed together" + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--aux_files" + description: "Turn on writing auxilary output files" + info: null + direction: "input" + - type: "integer" + name: "--verbose" + description: "Turn on verbose mode with specific verbosity threshold. 0: fully\ + \ verbose, 100 : no verbose messages." + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--doublet_prior" + description: "Prior of doublet." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--geno_error" + description: "Genotype error parameter per cluster." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--bf_thres" + description: "Bayes Factor Threshold used in the initial clustering." + info: null + default: + - 5.41 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--frac_init_clust" + description: "Fraction of droplets to be clustered in the very first round of\ + \ initial clustering procedure." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--iter_init" + description: "Iteration for initial cluster assignment (set to zero to skip the\ + \ iterations)." + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--keep_init_missing" + description: "Keep missing cluster assignment as missing in the initial iteration." + info: null + direction: "input" + - type: "boolean_true" + name: "--randomize_singlet_score" + description: "Randomize the singlet scores to test its effect." + info: null + direction: "input" + - type: "integer" + name: "--seed" + description: "Seed for random number (use clocks if not set)." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cap_bq" + description: "Maximum base quality (higher BQ will be capped)." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_bq" + description: "Minimum base quality to consider (lower BQ will be skipped)." + info: null + default: + - 13 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--group_list" + description: "List of tag readgroup/cell barcode to consider in this run. All\ + \ other barcodes will be ignored. This is useful for parallelized run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_total" + description: "Minimum number of total reads for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_umi" + description: "Minimum number of UMIs for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_snp" + description: "Minimum number of SNPs with coverage for a droplet/cell to be considered." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "freemux" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--out" + description: "freemuxlet Output file prefix" + info: null + example: + - "freemuxlet" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "freemuxlet.patch" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Freemuxlet is a software tool to deconvolute sample identity and identify\ + \ multiplets when\nmultiple samples are pooled by barcoded single cell sequencing.\ + \ If external genotyping\ndata is not available, the genotyping-free version demuxlet,\ + \ freemuxlet, would be recommended.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + copy: + - "freemuxlet.patch /opt/freemuxlet.patch" + - type: "apt" + packages: + - "autoconf" + - "wget" + - "git" + - "build-essential" + - "libcurl4-openssl-dev" + - "cmake" + - "libbz2-dev" + - "libssl-dev" + - "liblzma-dev" + - "zlib1g-dev" + - "r-base" + interactive: false + - type: "docker" + run: + - "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib\ + \ && git submodule update --init --recursive && autoreconf -i && ./configure\ + \ --prefix=/usr/local/ && make && make install" + - type: "docker" + run: + - "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle &&\ + \ cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build\ + \ && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle\ + \ /usr/local/bin" + - type: "r" + cran: + - "readr" + - "processx" + - "dplyr" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/freemuxlet/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/freemuxlet" + executable: "target/nextflow/genetic_demux/freemuxlet/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/freemuxlet/freemuxlet.patch b/target/nextflow/genetic_demux/freemuxlet/freemuxlet.patch new file mode 100644 index 00000000..29cc67c9 --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/freemuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/target/nextflow/genetic_demux/freemuxlet/main.nf b/target/nextflow/genetic_demux/freemuxlet/main.nf new file mode 100644 index 00000000..dcafddae --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/main.nf @@ -0,0 +1,4199 @@ +// freemuxlet main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "freemuxlet", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "string", + "name" : "--plp", + "description" : "Prefix of input files generated by dsc-pileup", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--init_cluster", + "description" : "Input file containing the initial cluster information.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--nsample", + "description" : "Number of samples multiplexed together", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--aux_files", + "description" : "Turn on writing auxilary output files", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--verbose", + "description" : "Turn on verbose mode with specific verbosity threshold. 0: fully verbose, 100 : no verbose messages.", + "default" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--doublet_prior", + "description" : "Prior of doublet.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--geno_error", + "description" : "Genotype error parameter per cluster.", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--bf_thres", + "description" : "Bayes Factor Threshold used in the initial clustering.", + "default" : [ + 5.41 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--frac_init_clust", + "description" : "Fraction of droplets to be clustered in the very first round of initial clustering procedure.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--iter_init", + "description" : "Iteration for initial cluster assignment (set to zero to skip the iterations).", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--keep_init_missing", + "description" : "Keep missing cluster assignment as missing in the initial iteration.", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--randomize_singlet_score", + "description" : "Randomize the singlet scores to test its effect.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "Seed for random number (use clocks if not set).", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--cap_bq", + "description" : "Maximum base quality (higher BQ will be capped).", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_bq", + "description" : "Minimum base quality to consider (lower BQ will be skipped).", + "default" : [ + 13 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--group_list", + "description" : "List of tag readgroup/cell barcode to consider in this run. All other barcodes will be ignored. This is useful for parallelized run.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_total", + "description" : "Minimum number of total reads for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_umi", + "description" : "Minimum number of UMIs for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_snp", + "description" : "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "freemux" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--out", + "description" : "freemuxlet Output file prefix", + "example" : [ + "freemuxlet" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "freemuxlet.patch" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Freemuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping\ndata is not available, the genotyping-free version demuxlet, freemuxlet, would be recommended.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "copy" : [ + "freemuxlet.patch /opt/freemuxlet.patch" + ] + }, + { + "type" : "apt", + "packages" : [ + "autoconf", + "wget", + "git", + "build-essential", + "libcurl4-openssl-dev", + "cmake", + "libbz2-dev", + "libssl-dev", + "liblzma-dev", + "zlib1g-dev", + "r-base" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install" + ] + }, + { + "type" : "docker", + "run" : [ + "git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin" + ] + }, + { + "type" : "r", + "cran" : [ + "readr", + "processx", + "dplyr" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/freemuxlet/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/freemuxlet", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +requireNamespace("processx", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "plp" = $( if [ ! -z ${VIASH_PAR_PLP+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PLP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "init_cluster" = $( if [ ! -z ${VIASH_PAR_INIT_CLUSTER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INIT_CLUSTER" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "nsample" = $( if [ ! -z ${VIASH_PAR_NSAMPLE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_NSAMPLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "aux_files" = $( if [ ! -z ${VIASH_PAR_AUX_FILES+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_AUX_FILES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ), + "verbose" = $( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_VERBOSE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "doublet_prior" = $( if [ ! -z ${VIASH_PAR_DOUBLET_PRIOR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_DOUBLET_PRIOR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "geno_error" = $( if [ ! -z ${VIASH_PAR_GENO_ERROR+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_GENO_ERROR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "bf_thres" = $( if [ ! -z ${VIASH_PAR_BF_THRES+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_BF_THRES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "frac_init_clust" = $( if [ ! -z ${VIASH_PAR_FRAC_INIT_CLUST+x} ]; then echo -n "as.numeric('"; echo -n "$VIASH_PAR_FRAC_INIT_CLUST" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "iter_init" = $( if [ ! -z ${VIASH_PAR_ITER_INIT+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_ITER_INIT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "keep_init_missing" = $( if [ ! -z ${VIASH_PAR_KEEP_INIT_MISSING+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_KEEP_INIT_MISSING" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ), + "randomize_singlet_score" = $( if [ ! -z ${VIASH_PAR_RANDOMIZE_SINGLET_SCORE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_RANDOMIZE_SINGLET_SCORE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ), + "seed" = $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_SEED" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "cap_bq" = $( if [ ! -z ${VIASH_PAR_CAP_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_CAP_BQ" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_bq" = $( if [ ! -z ${VIASH_PAR_MIN_BQ+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_BQ" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "group_list" = $( if [ ! -z ${VIASH_PAR_GROUP_LIST+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_GROUP_LIST" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_total" = $( if [ ! -z ${VIASH_PAR_MIN_TOTAL+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_TOTAL" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_umi" = $( if [ ! -z ${VIASH_PAR_MIN_UMI+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_UMI" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_snp" = $( if [ ! -z ${VIASH_PAR_MIN_SNP+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_SNP" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "out" = $( if [ ! -z ${VIASH_PAR_OUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (!dir.exists(par\\$output)) { + dir.create(par\\$output, recursive = TRUE, showWarnings = FALSE) +} + +cmd <- c( + "popscle", "freemuxlet", + "--out", paste0(par\\$output, "/", par\\$out) +) + +argmap <- c( + "plp" = "--plp", + "init_cluster" = "--init-cluster", + "nsample" = "--nsample", + "verbose" = "--verbose", + "doublet_prior" = "--doublet-prior", + "geno_error" = "--geno-error", + "bf_thres" = "--bf-thres", + "frac_init_clust" = "--frac-init-clust", + "iter_init" = "--iter-init", + "seed" = "--seed", + "cap_bq" = "--cap-BQ", + "min_bq" = "--min-BQ", + "min_total" = "--min-total", + "min_umi" = "--min-umi", + "min_snp" = "--min-snp", + "group_list" = "--group-list", + "aux_files" = "--aux-files", + "keep_init_missing" = "--keep-init-missing", + "randomize_singlet_score" = "randomize-singlet-score" +) + +for (arg in names(argmap)) { + if (!is.null(par[[arg]])) { + if (arg %in% c( + "aux_files", "keep_init_missing", + "randomize_singlet_score" + )) { + if (toupper(par[[arg]]) == TRUE) { + cmd <- c(cmd, argmap[[arg]]) + } + } else { + cmd <- c(cmd, argmap[[arg]], par[[arg]]) + } + } +} + +zzz <- processx::run( + cmd[[1]], + args = cmd[-1], + echo = TRUE, + echo_cmd = TRUE +) + +if (zzz\\$status != 0) { + stop("Command failed with status ", zzz\\$status) +} + +out_file <- paste0(par\\$output, "/", par\\$out, ".clust1.samples.gz") +if (!file.exists(out_file)) { + stop("Output file '", out_file, "' not found") +} + +res <- readr::read_tsv(out_file) + +res2 <- res %>% + mutate( + donor_part1 = gsub("([^,]*),([^,]*)*", "\\\\\\\\1", BEST.GUESS), + donor_part2 = gsub("([^,]*),([^,]*)*", "\\\\\\\\2", BEST.GUESS), + donor_id = case_when( + donor_part1 == donor_part2 ~ donor_part1, + TRUE ~ DROPLET.TYPE + ) + ) + +freemuxlet_assign <- res2 %>% select(cell = BARCODE, donor_id) + +readr::write_csv( + freemuxlet_assign, + paste0(par\\$output, "/cell_annotation.csv") +) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/freemuxlet", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/freemuxlet/nextflow.config b/target/nextflow/genetic_demux/freemuxlet/nextflow.config new file mode 100644 index 00000000..ec8c50da --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/freemuxlet' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Freemuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping\ndata is not available, the genotyping-free version demuxlet, freemuxlet, would be recommended.\n' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/freemuxlet/nextflow_labels.config b/target/nextflow/genetic_demux/freemuxlet/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/freemuxlet/nextflow_schema.json b/target/nextflow/genetic_demux/freemuxlet/nextflow_schema.json new file mode 100644 index 00000000..7a9c3706 --- /dev/null +++ b/target/nextflow/genetic_demux/freemuxlet/nextflow_schema.json @@ -0,0 +1,169 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "freemuxlet", + "description": "Freemuxlet is a software tool to deconvolute sample identity and identify multiplets when\nmultiple samples are pooled by barcoded single cell sequencing. If external genotyping\ndata is not available, the genotyping-free version demuxlet, freemuxlet, would be recommended.\n", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "plp": { + "type": "string", + "description": "Prefix of input files generated by dsc-pileup", + "help_text": "Type: `string`, multiple: `False`. " + }, + "init_cluster": { + "type": "string", + "format": "path", + "description": "Input file containing the initial cluster information.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "nsample": { + "type": "integer", + "description": "Number of samples multiplexed together", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "aux_files": { + "type": "boolean", + "description": "Turn on writing auxilary output files", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "verbose": { + "type": "integer", + "description": "Turn on verbose mode with specific verbosity threshold", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "doublet_prior": { + "type": "number", + "description": "Prior of doublet.", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + }, + "geno_error": { + "type": "number", + "description": "Genotype error parameter per cluster.", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "bf_thres": { + "type": "number", + "description": "Bayes Factor Threshold used in the initial clustering.", + "help_text": "Type: `double`, multiple: `False`, default: `5.41`. ", + "default": 5.41 + }, + "frac_init_clust": { + "type": "number", + "description": "Fraction of droplets to be clustered in the very first round of initial clustering procedure.", + "help_text": "Type: `double`, multiple: `False`, default: `1.0`. ", + "default": 1.0 + }, + "iter_init": { + "type": "integer", + "description": "Iteration for initial cluster assignment (set to zero to skip the iterations).", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "keep_init_missing": { + "type": "boolean", + "description": "Keep missing cluster assignment as missing in the initial iteration.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "randomize_singlet_score": { + "type": "boolean", + "description": "Randomize the singlet scores to test its effect.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "seed": { + "type": "integer", + "description": "Seed for random number (use clocks if not set).", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "cap_bq": { + "type": "integer", + "description": "Maximum base quality (higher BQ will be capped).", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "min_bq": { + "type": "integer", + "description": "Minimum base quality to consider (lower BQ will be skipped).", + "help_text": "Type: `integer`, multiple: `False`, default: `13`. ", + "default": 13 + }, + "group_list": { + "type": "string", + "description": "List of tag readgroup/cell barcode to consider in this run", + "help_text": "Type: `string`, multiple: `False`. " + }, + "min_total": { + "type": "integer", + "description": "Minimum number of total reads for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_umi": { + "type": "integer", + "description": "Minimum number of UMIs for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_snp": { + "type": "integer", + "description": "Minimum number of SNPs with coverage for a droplet/cell to be considered.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"freemux\"`. ", + "default": "$id.$key.output" + }, + "out": { + "type": "string", + "description": "freemuxlet Output file prefix", + "help_text": "Type: `string`, multiple: `False`, example: `\"freemuxlet\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/samtools/.config.vsh.yaml b/target/nextflow/genetic_demux/samtools/.config.vsh.yaml new file mode 100644 index 00000000..0ec4a13e --- /dev/null +++ b/target/nextflow/genetic_demux/samtools/.config.vsh.yaml @@ -0,0 +1,222 @@ +name: "samtools" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--bam" + description: "Input bam file for filtering." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Samtools output directory." + info: null + example: + - "samtools_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter the BAM according to the instruction of scSplit via Samtools." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "wget" + - "gcc" + - "make" + - "libbz2-dev" + - "zlib1g-dev" + - "libncurses5-dev" + - "libncursesw5-dev" + - "liblzma-dev" + interactive: false + - type: "docker" + run: + - "wget https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2\ + \ && tar jxf samtools-1.16.1.tar.bz2 && rm samtools-1.16.1.tar.bz2 && cd samtools-1.16.1\ + \ && make prefix=/usr/local install" + - type: "python" + user: false + pip: + - "umi_tools" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/samtools/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/samtools" + executable: "target/nextflow/genetic_demux/samtools/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/samtools/main.nf b/target/nextflow/genetic_demux/samtools/main.nf new file mode 100644 index 00000000..0e266f03 --- /dev/null +++ b/target/nextflow/genetic_demux/samtools/main.nf @@ -0,0 +1,3857 @@ +// samtools main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "samtools", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--bam", + "description" : "Input bam file for filtering.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Samtools output directory.", + "example" : [ + "samtools_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Filter the BAM according to the instruction of scSplit via Samtools.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "wget", + "gcc", + "make", + "libbz2-dev", + "zlib1g-dev", + "libncurses5-dev", + "libncursesw5-dev", + "liblzma-dev" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "wget https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2 && tar jxf samtools-1.16.1.tar.bz2 && rm samtools-1.16.1.tar.bz2 && cd samtools-1.16.1 && make prefix=/usr/local install" + ] + }, + { + "type" : "python", + "user" : false, + "pip" : [ + "umi_tools" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/samtools/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/samtools", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi +samtools view -S -b -q 10 -F 3844 "\\$par_bam" > "\\${par_output}/filtered.bam" +cd \\$par_output +samtools index filtered.bam filtered.bam.bai +umi_tools dedup --stdin=filtered.bam --extract-umi-method=tag --umi-tag=UR --cell-tag=CB --log=logfile > no_dup.bam +samtools sort no_dup.bam -o sorted.bam +samtools index sorted.bam sorted.bam.bai +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/samtools", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/samtools/nextflow.config b/target/nextflow/genetic_demux/samtools/nextflow.config new file mode 100644 index 00000000..31d6c90a --- /dev/null +++ b/target/nextflow/genetic_demux/samtools/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/samtools' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Filter the BAM according to the instruction of scSplit via Samtools.' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/samtools/nextflow_labels.config b/target/nextflow/genetic_demux/samtools/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/samtools/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/samtools/nextflow_schema.json b/target/nextflow/genetic_demux/samtools/nextflow_schema.json new file mode 100644 index 00000000..1eeba7c0 --- /dev/null +++ b/target/nextflow/genetic_demux/samtools/nextflow_schema.json @@ -0,0 +1,49 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "samtools", + "description": "Filter the BAM according to the instruction of scSplit via Samtools.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "bam": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input bam file for filtering.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Samtools output directory.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"samtools_out\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/scsplit/.config.vsh.yaml b/target/nextflow/genetic_demux/scsplit/.config.vsh.yaml new file mode 100644 index 00000000..d1258892 --- /dev/null +++ b/target/nextflow/genetic_demux/scsplit/.config.vsh.yaml @@ -0,0 +1,346 @@ +name: "scsplit" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--vcf" + description: "VCF from mixed BAM" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "mixed sample BAM" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bar" + description: "barcodes whitelist" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag" + description: "tag for barcode" + info: null + default: + - "CB" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--com" + description: "common SNVs" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num" + description: "expected number of mixed samples" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sub" + description: "maximum number of subpopulations in autodetect mode" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ems" + description: "number of EM repeats to avoid local maximum" + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--dbl" + description: "correction for doublets. There will be no refinement on the results\ + \ if this optional parameter is not specified or specified percentage is less\ + \ than doublet rates detected during the run." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vcf_known" + description: "known individual genotypes to limit distinguishing variants to available\ + \ variants, so that users do not need to redo genotyping on selected variants,\ + \ otherwise any variants could be selected as distinguishing variants." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--geno" + description: "generate sample genotypes based on the split result." + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "scSplit_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--ref" + description: "output Ref count matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alt" + description: "output Alt count matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--psc" + description: "generated P(S|C)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "scsplit is a genotype-free demultiplexing methode of pooled single-cell\ + \ RNA-seq, using a hidden state model for identifying genetically distinct samples\ + \ within a mixed population." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "numpy<2" + - "pandas<2.0" + - "pysam" + - "setuptools<58" + - "scikit-learn==1.1.3" + - "scipy" + - "statistics" + upgrade: true + - type: "python" + user: false + pip: + - "PyVCF" + upgrade: true + - type: "docker" + run: + - "git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin\ + \ && rm -rf scSplit" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/scsplit/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/scsplit" + executable: "target/nextflow/genetic_demux/scsplit/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/scsplit/main.nf b/target/nextflow/genetic_demux/scsplit/main.nf new file mode 100644 index 00000000..648425db --- /dev/null +++ b/target/nextflow/genetic_demux/scsplit/main.nf @@ -0,0 +1,4039 @@ +// scsplit main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scsplit", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--vcf", + "description" : "VCF from mixed BAM", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam", + "description" : "mixed sample BAM", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bar", + "description" : "barcodes whitelist", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag", + "description" : "tag for barcode", + "default" : [ + "CB" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--com", + "description" : "common SNVs", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num", + "description" : "expected number of mixed samples", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sub", + "description" : "maximum number of subpopulations in autodetect mode", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--ems", + "description" : "number of EM repeats to avoid local maximum", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--dbl", + "description" : "correction for doublets. There will be no refinement on the results if this optional parameter is not specified or specified percentage is less than doublet rates detected during the run.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vcf_known", + "description" : "known individual genotypes to limit distinguishing variants to available variants, so that users do not need to redo genotyping on selected variants, otherwise any variants could be selected as distinguishing variants.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--geno", + "description" : "generate sample genotypes based on the split result.", + "direction" : "input" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "scSplit_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--ref", + "description" : "output Ref count matrix", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alt", + "description" : "output Alt count matrix", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--psc", + "description" : "generated P(S|C)", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "scsplit is a genotype-free demultiplexing methode of pooled single-cell RNA-seq, using a hidden state model for identifying genetically distinct samples within a mixed population.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pip" : [ + "numpy<2", + "pandas<2.0", + "pysam", + "setuptools<58", + "scikit-learn==1.1.3", + "scipy", + "statistics" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "pip" : [ + "PyVCF" + ], + "upgrade" : true + }, + { + "type" : "docker", + "run" : [ + "git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin && rm -rf scSplit" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/scsplit/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/scsplit", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_VCF+x} ]; then echo "${VIASH_PAR_VCF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf='&'#" ; else echo "# par_vcf="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAR+x} ]; then echo "${VIASH_PAR_BAR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bar='&'#" ; else echo "# par_bar="; fi ) +$( if [ ! -z ${VIASH_PAR_TAG+x} ]; then echo "${VIASH_PAR_TAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_tag='&'#" ; else echo "# par_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_COM+x} ]; then echo "${VIASH_PAR_COM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_com='&'#" ; else echo "# par_com="; fi ) +$( if [ ! -z ${VIASH_PAR_NUM+x} ]; then echo "${VIASH_PAR_NUM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_num='&'#" ; else echo "# par_num="; fi ) +$( if [ ! -z ${VIASH_PAR_SUB+x} ]; then echo "${VIASH_PAR_SUB}" | sed "s#'#'\\"'\\"'#g;s#.*#par_sub='&'#" ; else echo "# par_sub="; fi ) +$( if [ ! -z ${VIASH_PAR_EMS+x} ]; then echo "${VIASH_PAR_EMS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ems='&'#" ; else echo "# par_ems="; fi ) +$( if [ ! -z ${VIASH_PAR_DBL+x} ]; then echo "${VIASH_PAR_DBL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_dbl='&'#" ; else echo "# par_dbl="; fi ) +$( if [ ! -z ${VIASH_PAR_VCF_KNOWN+x} ]; then echo "${VIASH_PAR_VCF_KNOWN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vcf_known='&'#" ; else echo "# par_vcf_known="; fi ) +$( if [ ! -z ${VIASH_PAR_GENO+x} ]; then echo "${VIASH_PAR_GENO}" | sed "s#'#'\\"'\\"'#g;s#.*#par_geno='&'#" ; else echo "# par_geno="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_REF+x} ]; then echo "${VIASH_PAR_REF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ref='&'#" ; else echo "# par_ref="; fi ) +$( if [ ! -z ${VIASH_PAR_ALT+x} ]; then echo "${VIASH_PAR_ALT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_alt='&'#" ; else echo "# par_alt="; fi ) +$( if [ ! -z ${VIASH_PAR_PSC+x} ]; then echo "${VIASH_PAR_PSC}" | sed "s#'#'\\"'\\"'#g;s#.*#par_psc='&'#" ; else echo "# par_psc="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi + +scSplit count \\\\ + --vcf \\$par_vcf \\\\ + --bam \\$par_bam \\\\ + --bar \\$par_bar \\\\ + --tag \\$par_tag \\\\ + --ref \\$par_ref \\\\ + --alt \\$par_alt \\\\ + --out \\$par_output \\\\ + \\${par_com:+--com \\$par_com} + +scSplit run \\\\ + --ref "\\$par_output/\\$par_ref" \\\\ + --alt "\\$par_output/\\$par_alt" \\\\ + --out \\$par_output \\\\ + --num \\$par_num \\\\ + \\${par_sub:+--sub \\$par_sub} \\\\ + \\${par_ems:+--ems \\$par_ems} \\\\ + \\${par_dbl:+--dbl \\$par_dbl} \\\\ + \\${par_vcf_known:+--vcf \\$par_vcf_known} + +if [ "\\$par_geno" = true ]; then + scSplit genotype \\\\ + --ref "\\$par_output/\\$par_ref" \\\\ + --alt "\\$par_output/\\$par_alt" \\\\ + --psc "\\$par_output/\\$par_psc" \\\\ + "\\$par_output" +fi + +echo "cell,donor_id" > "\\$par_output/cell_annotation.csv" +sed -e '1d' -e 's/SNG-//g' "\\$par_output/scSplit_result.csv" | +sed 's/\\\\t/,/g' | awk 'BEGIN{FS=OFS=","} { if (\\$2 ~ /^DBL-/) \\$2 = "doublet"; print }' \\\\ +>> "\\$par_output/cell_annotation.csv" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/scsplit", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/scsplit/nextflow.config b/target/nextflow/genetic_demux/scsplit/nextflow.config new file mode 100644 index 00000000..19c361f4 --- /dev/null +++ b/target/nextflow/genetic_demux/scsplit/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/scsplit' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'scsplit is a genotype-free demultiplexing methode of pooled single-cell RNA-seq, using a hidden state model for identifying genetically distinct samples within a mixed population.' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/scsplit/nextflow_labels.config b/target/nextflow/genetic_demux/scsplit/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/scsplit/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/scsplit/nextflow_schema.json b/target/nextflow/genetic_demux/scsplit/nextflow_schema.json new file mode 100644 index 00000000..474c7850 --- /dev/null +++ b/target/nextflow/genetic_demux/scsplit/nextflow_schema.json @@ -0,0 +1,131 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scsplit", + "description": "scsplit is a genotype-free demultiplexing methode of pooled single-cell RNA-seq, using a hidden state model for identifying genetically distinct samples within a mixed population.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "vcf": { + "type": "string", + "format": "path", + "description": "VCF from mixed BAM", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "bam": { + "type": "string", + "format": "path", + "description": "mixed sample BAM", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "bar": { + "type": "string", + "format": "path", + "description": "barcodes whitelist", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "tag": { + "type": "string", + "description": "tag for barcode", + "help_text": "Type: `string`, multiple: `False`, default: `\"CB\"`. ", + "default": "CB" + }, + "com": { + "type": "string", + "format": "path", + "description": "common SNVs", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "num": { + "type": "integer", + "description": "expected number of mixed samples", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "sub": { + "type": "integer", + "description": "maximum number of subpopulations in autodetect mode", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "ems": { + "type": "integer", + "description": "number of EM repeats to avoid local maximum", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + }, + "dbl": { + "type": "number", + "description": "correction for doublets", + "help_text": "Type: `double`, multiple: `False`. " + }, + "vcf_known": { + "type": "string", + "format": "path", + "description": "known individual genotypes to limit distinguishing variants to available variants, so that users do not need to redo genotyping on selected variants, otherwise any variants could be selected as distinguishing variants.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "geno": { + "type": "boolean", + "description": "generate sample genotypes based on the split result.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"scSplit_out\"`. ", + "default": "$id.$key.output" + }, + "ref": { + "type": "string", + "description": "output Ref count matrix", + "help_text": "Type: `string`, multiple: `False`. " + }, + "alt": { + "type": "string", + "description": "output Alt count matrix", + "help_text": "Type: `string`, multiple: `False`. " + }, + "psc": { + "type": "string", + "description": "generated P(S|C)", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/souporcell/.config.vsh.yaml b/target/nextflow/genetic_demux/souporcell/.config.vsh.yaml new file mode 100644 index 00000000..ba7d97b9 --- /dev/null +++ b/target/nextflow/genetic_demux/souporcell/.config.vsh.yaml @@ -0,0 +1,334 @@ +name: "souporcell" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--fasta" + description: "reference fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "cellranger bam" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_index" + description: "cellranger bam index" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcodes" + description: "barcodes.tsv from cellranger" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clusters" + description: "number cluster, tbd add easy way to run on a range of k" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--ploidy" + description: "ploidy, must be 1 or 2" + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_alt" + description: "min alt to use locus" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_ref" + description: "min ref to use locus" + info: null + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_loci" + description: "max loci per cell, affects speed" + info: null + default: + - 2048 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--restarts" + description: "number of restarts in clustering, when there are > 12 clusters we\ + \ recommend increasing this to avoid local minima" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--common_variants" + description: "common variant loci or known variant loci vcf, must be vs same reference\ + \ fasta" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--known_genotypes" + description: "known variants per clone in population vcf mode, must be .vcf right\ + \ now we dont accept gzip or bcf sorry" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--known_genotypes_sample_names" + description: "which samples in population vcf from known genotypes option represent\ + \ the donors in your sample" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_remap" + description: "dont remap with minimap2 (not recommended unless in conjunction\ + \ with --common_variants" + info: null + direction: "input" + - type: "boolean_true" + name: "--ignore" + description: "set to True to ignore data error assertions" + info: null + direction: "input" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "name of directory to place souporcell files" + info: null + example: + - "souporcell_out" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "souporcell is a method for clustering mixed-genotype scRNAseq experiments\ + \ by individual." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "demuxafy_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "cumulusprod/souporcell:2022.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/souporcell/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/souporcell" + executable: "target/nextflow/genetic_demux/souporcell/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/souporcell/main.nf b/target/nextflow/genetic_demux/souporcell/main.nf new file mode 100644 index 00000000..da01b9bf --- /dev/null +++ b/target/nextflow/genetic_demux/souporcell/main.nf @@ -0,0 +1,4015 @@ +// souporcell main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "souporcell", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--fasta", + "description" : "reference fasta file", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam", + "description" : "cellranger bam", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam_index", + "description" : "cellranger bam index", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodes", + "description" : "barcodes.tsv from cellranger", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clusters", + "description" : "number cluster, tbd add easy way to run on a range of k", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--ploidy", + "description" : "ploidy, must be 1 or 2", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_alt", + "description" : "min alt to use locus", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_ref", + "description" : "min ref to use locus", + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_loci", + "description" : "max loci per cell, affects speed", + "default" : [ + 2048 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--restarts", + "description" : "number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--common_variants", + "description" : "common variant loci or known variant loci vcf, must be vs same reference fasta", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--known_genotypes", + "description" : "known variants per clone in population vcf mode, must be .vcf right now we dont accept gzip or bcf sorry", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--known_genotypes_sample_names", + "description" : "which samples in population vcf from known genotypes option represent the donors in your sample", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--skip_remap", + "description" : "dont remap with minimap2 (not recommended unless in conjunction with --common_variants", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--ignore", + "description" : "set to True to ignore data error assertions", + "direction" : "input" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "name of directory to place souporcell files", + "example" : [ + "souporcell_out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "souporcell is a method for clustering mixed-genotype scRNAseq experiments by individual.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/demuxafy_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "cumulusprod/souporcell:2022.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/souporcell/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/souporcell", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_FASTA+x} ]; then echo "${VIASH_PAR_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_fasta='&'#" ; else echo "# par_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM_INDEX+x} ]; then echo "${VIASH_PAR_BAM_INDEX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam_index='&'#" ; else echo "# par_bam_index="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODES+x} ]; then echo "${VIASH_PAR_BARCODES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_barcodes='&'#" ; else echo "# par_barcodes="; fi ) +$( if [ ! -z ${VIASH_PAR_CLUSTERS+x} ]; then echo "${VIASH_PAR_CLUSTERS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_clusters='&'#" ; else echo "# par_clusters="; fi ) +$( if [ ! -z ${VIASH_PAR_PLOIDY+x} ]; then echo "${VIASH_PAR_PLOIDY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ploidy='&'#" ; else echo "# par_ploidy="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_ALT+x} ]; then echo "${VIASH_PAR_MIN_ALT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_alt='&'#" ; else echo "# par_min_alt="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_REF+x} ]; then echo "${VIASH_PAR_MIN_REF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_ref='&'#" ; else echo "# par_min_ref="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_LOCI+x} ]; then echo "${VIASH_PAR_MAX_LOCI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_loci='&'#" ; else echo "# par_max_loci="; fi ) +$( if [ ! -z ${VIASH_PAR_RESTARTS+x} ]; then echo "${VIASH_PAR_RESTARTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_restarts='&'#" ; else echo "# par_restarts="; fi ) +$( if [ ! -z ${VIASH_PAR_COMMON_VARIANTS+x} ]; then echo "${VIASH_PAR_COMMON_VARIANTS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_common_variants='&'#" ; else echo "# par_common_variants="; fi ) +$( if [ ! -z ${VIASH_PAR_KNOWN_GENOTYPES+x} ]; then echo "${VIASH_PAR_KNOWN_GENOTYPES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_known_genotypes='&'#" ; else echo "# par_known_genotypes="; fi ) +$( if [ ! -z ${VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES+x} ]; then echo "${VIASH_PAR_KNOWN_GENOTYPES_SAMPLE_NAMES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_known_genotypes_sample_names='&'#" ; else echo "# par_known_genotypes_sample_names="; fi ) +$( if [ ! -z ${VIASH_PAR_SKIP_REMAP+x} ]; then echo "${VIASH_PAR_SKIP_REMAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_skip_remap='&'#" ; else echo "# par_skip_remap="; fi ) +$( if [ ! -z ${VIASH_PAR_IGNORE+x} ]; then echo "${VIASH_PAR_IGNORE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ignore='&'#" ; else echo "# par_ignore="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\\$par_skip_remap" == "false" ]] && unset par_skip_remap +[[ "\\$par_ignore" == "false" ]] && unset par_ignore + +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi + +/opt/souporcell/souporcell_pipeline.py \\\\ + --bam \\$par_bam \\\\ + --fasta \\$par_fasta \\\\ + --barcodes \\$par_barcodes \\\\ + --clusters \\$par_clusters \\\\ + --ploidy \\$par_ploidy \\\\ + --min_alt \\$par_min_alt \\\\ + --min_ref \\$par_min_ref \\\\ + --max_loci \\$par_max_loci \\\\ + --out_dir \\$par_output \\\\ + --threads \\${par_threads:=1} \\\\ + \\${par_restarts:+--restarts \\$par_restarts} \\\\ + \\${par_common_variants:+--common_variants \\$par_common_variants} \\\\ + \\${par_known_genotypes:+--known_genotypes \\$par_known_genotypes} \\\\ + \\${par_known_genotypes_sample_names:+--known_genotypes_sample_names \\$par_known_genotypes_sample_names} \\\\ + \\${par_skip_remap:+--skip_remap True} \\\\ + \\${par_ignore:+--ignore True} + +cut -d\\$'\\\\t' -f 1-3 "\\$par_output/clusters.tsv" | +sed 's/\\\\t/,/g' | +awk 'BEGIN{FS=OFS=","} {\\$2=(\\$2=="singlet")?\\$3:\\$2; NF=NF-1; print}' | +sed '1s/barcode,status/cell,donor_id/' > "\\$par_output/cell_annotation.csv" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/souporcell", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/souporcell/nextflow.config b/target/nextflow/genetic_demux/souporcell/nextflow.config new file mode 100644 index 00000000..7da17de3 --- /dev/null +++ b/target/nextflow/genetic_demux/souporcell/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/souporcell' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'souporcell is a method for clustering mixed-genotype scRNAseq experiments by individual.' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/souporcell/nextflow_labels.config b/target/nextflow/genetic_demux/souporcell/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/souporcell/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/souporcell/nextflow_schema.json b/target/nextflow/genetic_demux/souporcell/nextflow_schema.json new file mode 100644 index 00000000..476ef9f6 --- /dev/null +++ b/target/nextflow/genetic_demux/souporcell/nextflow_schema.json @@ -0,0 +1,139 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "souporcell", + "description": "souporcell is a method for clustering mixed-genotype scRNAseq experiments by individual.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "fasta": { + "type": "string", + "format": "path", + "description": "reference fasta file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "bam": { + "type": "string", + "format": "path", + "description": "cellranger bam", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "bam_index": { + "type": "string", + "format": "path", + "description": "cellranger bam index", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "barcodes": { + "type": "string", + "format": "path", + "description": "barcodes.tsv from cellranger", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "clusters": { + "type": "integer", + "description": "number cluster, tbd add easy way to run on a range of k", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "ploidy": { + "type": "integer", + "description": "ploidy, must be 1 or 2", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "min_alt": { + "type": "integer", + "description": "min alt to use locus", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "min_ref": { + "type": "integer", + "description": "min ref to use locus", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "max_loci": { + "type": "integer", + "description": "max loci per cell, affects speed", + "help_text": "Type: `integer`, multiple: `False`, default: `2048`. ", + "default": 2048 + }, + "restarts": { + "type": "integer", + "description": "number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "common_variants": { + "type": "string", + "format": "path", + "description": "common variant loci or known variant loci vcf, must be vs same reference fasta", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "known_genotypes": { + "type": "string", + "format": "path", + "description": "known variants per clone in population vcf mode, must be .vcf right now we dont accept gzip or bcf sorry", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "known_genotypes_sample_names": { + "type": "string", + "description": "which samples in population vcf from known genotypes option represent the donors in your sample", + "help_text": "Type: `string`, multiple: `False`. " + }, + "skip_remap": { + "type": "boolean", + "description": "dont remap with minimap2 (not recommended unless in conjunction with --common_variants", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "ignore": { + "type": "boolean", + "description": "set to True to ignore data error assertions", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "name of directory to place souporcell files", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"souporcell_out\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/genetic_demux/vireo/.config.vsh.yaml b/target/nextflow/genetic_demux/vireo/.config.vsh.yaml new file mode 100644 index 00000000..b06c7979 --- /dev/null +++ b/target/nextflow/genetic_demux/vireo/.config.vsh.yaml @@ -0,0 +1,354 @@ +name: "vireo" +namespace: "genetic_demux" +version: "main" +authors: +- name: "Xichen Wu" + roles: + - "author" + info: + role: "Contributor" + links: + github: "wxicu" + linkedin: "xichen-wu" + orcid: "0009-0008-2168-4508" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Student Assistant" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--cell_data" + description: "The cell genotype file in VCF format or cellSNP folder with sparse\ + \ matrices." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_donor" + description: "Number of donors to demultiplex; can be larger than provided in\ + \ donor_file." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vartrix_data" + description: "The cell genotype files in vartrix outputs." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--donor_file" + description: "The donor genotype file in VCF format. Please filter the sample\ + \ and region with bcftools first!" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--geno_tag" + description: "The tag for donor genotype." + info: null + default: + - "PL" + required: false + choices: + - "GT" + - "GP" + - "PL" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_doublet" + description: "If use, not checking doublets." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_init" + description: "Number of random initializations, when GT needs to learn." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--extra_donor" + description: "Number of extra donor in pre-cluster, when GT needs to learn." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--extra_donorMode" + description: "Method for searching from extra donors. size: n_cell per donor;\ + \ distance: GT distance between donors" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--force_learn_gt" + description: "If use, treat donor GT as prior only." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--ase_mode" + description: "If use, turn on SNP specific allelic ratio." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--no_plot" + description: "If use, turn off plotting GT distance." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rand_seed" + description: "Seed for random initialization" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_range" + description: "Range of cells to process." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--call_ambient_rnas" + description: "If use, detect ambient RNAs in each cell." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory" + info: null + example: + - "vireo" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Vireo is primarily designed for demultiplexing cells into donors by\ + \ modelling of expressed alleles." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "vireo_test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "threadpoolctl" + - "vireoSNP" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/genetic_demux/vireo/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/genetic_demux/vireo" + executable: "target/nextflow/genetic_demux/vireo/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/genetic_demux/vireo/main.nf b/target/nextflow/genetic_demux/vireo/main.nf new file mode 100644 index 00000000..dfaabd1f --- /dev/null +++ b/target/nextflow/genetic_demux/vireo/main.nf @@ -0,0 +1,4048 @@ +// vireo main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Xichen Wu (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "vireo", + "namespace" : "genetic_demux", + "version" : "main", + "authors" : [ + { + "name" : "Xichen Wu", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "wxicu", + "linkedin" : "xichen-wu", + "orcid" : "0009-0008-2168-4508" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Student Assistant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--cell_data", + "description" : "The cell genotype file in VCF format or cellSNP folder with sparse matrices.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_donor", + "description" : "Number of donors to demultiplex; can be larger than provided in donor_file.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vartrix_data", + "description" : "The cell genotype files in vartrix outputs.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--donor_file", + "description" : "The donor genotype file in VCF format. Please filter the sample and region with bcftools first!", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--geno_tag", + "description" : "The tag for donor genotype.", + "default" : [ + "PL" + ], + "required" : false, + "choices" : [ + "GT", + "GP", + "PL" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--no_doublet", + "description" : "If use, not checking doublets.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_init", + "description" : "Number of random initializations, when GT needs to learn.", + "default" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--extra_donor", + "description" : "Number of extra donor in pre-cluster, when GT needs to learn.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--extra_donorMode", + "description" : "Method for searching from extra donors. size: n_cell per donor; distance: GT distance between donors", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--force_learn_gt", + "description" : "If use, treat donor GT as prior only.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--ase_mode", + "description" : "If use, turn on SNP specific allelic ratio.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--no_plot", + "description" : "If use, turn off plotting GT distance.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--rand_seed", + "description" : "Seed for random initialization", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_range", + "description" : "Range of cells to process.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--call_ambient_rnas", + "description" : "If use, detect ambient RNAs in each cell.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory", + "example" : [ + "vireo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Vireo is primarily designed for demultiplexing cells into donors by modelling of expressed alleles.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../../../resources_test/vireo_test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pip" : [ + "threadpoolctl", + "vireoSNP" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/genetic_demux/vireo/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/genetic_demux/vireo", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_CELL_DATA+x} ]; then echo "${VIASH_PAR_CELL_DATA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cell_data='&'#" ; else echo "# par_cell_data="; fi ) +$( if [ ! -z ${VIASH_PAR_N_DONOR+x} ]; then echo "${VIASH_PAR_N_DONOR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_n_donor='&'#" ; else echo "# par_n_donor="; fi ) +$( if [ ! -z ${VIASH_PAR_VARTRIX_DATA+x} ]; then echo "${VIASH_PAR_VARTRIX_DATA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_vartrix_data='&'#" ; else echo "# par_vartrix_data="; fi ) +$( if [ ! -z ${VIASH_PAR_DONOR_FILE+x} ]; then echo "${VIASH_PAR_DONOR_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_donor_file='&'#" ; else echo "# par_donor_file="; fi ) +$( if [ ! -z ${VIASH_PAR_GENO_TAG+x} ]; then echo "${VIASH_PAR_GENO_TAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_geno_tag='&'#" ; else echo "# par_geno_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_DOUBLET+x} ]; then echo "${VIASH_PAR_NO_DOUBLET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_doublet='&'#" ; else echo "# par_no_doublet="; fi ) +$( if [ ! -z ${VIASH_PAR_N_INIT+x} ]; then echo "${VIASH_PAR_N_INIT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_n_init='&'#" ; else echo "# par_n_init="; fi ) +$( if [ ! -z ${VIASH_PAR_EXTRA_DONOR+x} ]; then echo "${VIASH_PAR_EXTRA_DONOR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_extra_donor='&'#" ; else echo "# par_extra_donor="; fi ) +$( if [ ! -z ${VIASH_PAR_EXTRA_DONORMODE+x} ]; then echo "${VIASH_PAR_EXTRA_DONORMODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_extra_donorMode='&'#" ; else echo "# par_extra_donorMode="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_LEARN_GT+x} ]; then echo "${VIASH_PAR_FORCE_LEARN_GT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_force_learn_gt='&'#" ; else echo "# par_force_learn_gt="; fi ) +$( if [ ! -z ${VIASH_PAR_ASE_MODE+x} ]; then echo "${VIASH_PAR_ASE_MODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ase_mode='&'#" ; else echo "# par_ase_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_PLOT+x} ]; then echo "${VIASH_PAR_NO_PLOT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_plot='&'#" ; else echo "# par_no_plot="; fi ) +$( if [ ! -z ${VIASH_PAR_RAND_SEED+x} ]; then echo "${VIASH_PAR_RAND_SEED}" | sed "s#'#'\\"'\\"'#g;s#.*#par_rand_seed='&'#" ; else echo "# par_rand_seed="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_RANGE+x} ]; then echo "${VIASH_PAR_CELL_RANGE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cell_range='&'#" ; else echo "# par_cell_range="; fi ) +$( if [ ! -z ${VIASH_PAR_CALL_AMBIENT_RNAS+x} ]; then echo "${VIASH_PAR_CALL_AMBIENT_RNAS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_call_ambient_rnas='&'#" ; else echo "# par_call_ambient_rnas="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +# Unset flags if they equal 'false' +[[ "\\$par_no_doublet" == "false" ]] && unset par_no_doublet +[[ "\\$par_force_learn_gt" == "false" ]] && unset par_force_learn_gt +[[ "\\$par_ase_mode" == "false" ]] && unset par_ase_mode +[[ "\\$par_no_plot" == "false" ]] && unset par_no_plot +[[ "\\$par_call_ambient_rnas" == "false" ]] && unset par_call_ambient_rnas + +if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" +fi + +vireo \\\\ + --cellData \\$par_cell_data \\\\ + --nDonor \\$par_n_donor \\\\ + --genoTag \\$par_geno_tag \\\\ + --nInit \\$par_n_init \\\\ + --extraDonor \\$par_extra_donor \\\\ + --out "\\${par_output}" \\\\ + \\${par_vartrix_data:+--vatrixData \\$par_vartrix_data} \\\\ + \\${par_donor_file:+--donorFile \\$par_donor_file} \\\\ + \\${par_no_doublet:+--noDoublet} \\\\ + \\${par_extra_donorMode:+--extraDonorMode \\$par_extra_donorMode} \\\\ + \\${par_force_learn_gt:+--forceLearnGT} \\\\ + \\${par_ase_mode:+--ASEmode} \\\\ + \\${par_no_plot:+--noPlot} \\\\ + \\${par_rand_seed:+--randSeed \\$par_rand_seed} \\\\ + \\${par_cell_range:+--cellRange \\$par_cell_range} \\\\ + \\${par_call_ambient_rnas:+--callAmbientRNAs} \\\\ + \\${meta_cpus:+--nproc \\$meta_cpus} + +cut -d\\$'\\\\t' -f 1-2 "\\$par_output/donor_ids.tsv" | tr '\\\\t' ',' > "\\$par_output/cell_annotation.csv" +awk 'BEGIN{FS=OFS=","} NR>1{ gsub("donor", "", \\$2) } 1' "\\$par_output/cell_annotation.csv" > "\\$par_output/cell_annotation_temp.csv" && mv "\\$par_output/cell_annotation_temp.csv" "\\$par_output/cell_annotation.csv" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/genetic_demux/vireo", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/genetic_demux/vireo/nextflow.config b/target/nextflow/genetic_demux/vireo/nextflow.config new file mode 100644 index 00000000..6115daa0 --- /dev/null +++ b/target/nextflow/genetic_demux/vireo/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'genetic_demux/vireo' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Vireo is primarily designed for demultiplexing cells into donors by modelling of expressed alleles.' + author = 'Xichen Wu' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/genetic_demux/vireo/nextflow_labels.config b/target/nextflow/genetic_demux/vireo/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/genetic_demux/vireo/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/genetic_demux/vireo/nextflow_schema.json b/target/nextflow/genetic_demux/vireo/nextflow_schema.json new file mode 100644 index 00000000..77f0171a --- /dev/null +++ b/target/nextflow/genetic_demux/vireo/nextflow_schema.json @@ -0,0 +1,144 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "vireo", + "description": "Vireo is primarily designed for demultiplexing cells into donors by modelling of expressed alleles.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "cell_data": { + "type": "string", + "format": "path", + "description": "The cell genotype file in VCF format or cellSNP folder with sparse matrices.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "n_donor": { + "type": "integer", + "description": "Number of donors to demultiplex; can be larger than provided in donor_file.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "vartrix_data": { + "type": "string", + "format": "path", + "description": "The cell genotype files in vartrix outputs.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "donor_file": { + "type": "string", + "format": "path", + "description": "The donor genotype file in VCF format", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "geno_tag": { + "type": "string", + "description": "The tag for donor genotype.", + "help_text": "Type: `string`, multiple: `False`, default: `\"PL\"`, choices: ``GT`, `GP`, `PL``. ", + "enum": [ + "GT", + "GP", + "PL" + ], + "default": "PL" + }, + "no_doublet": { + "type": "boolean", + "description": "If use, not checking doublets.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "n_init": { + "type": "integer", + "description": "Number of random initializations, when GT needs to learn.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "extra_donor": { + "type": "integer", + "description": "Number of extra donor in pre-cluster, when GT needs to learn.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "extra_donorMode": { + "type": "string", + "description": "Method for searching from extra donors", + "help_text": "Type: `string`, multiple: `False`. " + }, + "force_learn_gt": { + "type": "boolean", + "description": "If use, treat donor GT as prior only.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "ase_mode": { + "type": "boolean", + "description": "If use, turn on SNP specific allelic ratio.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "no_plot": { + "type": "boolean", + "description": "If use, turn off plotting GT distance.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "rand_seed": { + "type": "integer", + "description": "Seed for random initialization", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "cell_range": { + "type": "string", + "description": "Range of cells to process.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "call_ambient_rnas": { + "type": "boolean", + "description": "If use, detect ambient RNAs in each cell.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`, example: `\"vireo\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/harmonypy/.config.vsh.yaml b/target/nextflow/integrate/harmonypy/.config.vsh.yaml new file mode 100644 index 00000000..7a15ecc1 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/.config.vsh.yaml @@ -0,0 +1,322 @@ +name: "harmonypy" +namespace: "integrate" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Which .obsm slot to use as a starting PCA embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_pca_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--theta" + description: "Diversity clustering penalty parameter. Can be set as a single value\ + \ for all batch observations or as multiple values, one for each observation\ + \ in the batches defined by --obs_covariates. theta=0 does not encourage any\ + \ diversity. Larger values of theta result in more diverse clusters." + info: null + default: + - 2.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--obs_covariates" + description: "The .obs field(s) that define the covariate(s) to regress out." + info: null + example: + - "batch" + - "sample" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony.\ + \ Based on an implementation in python from https://github.com/slowkow/harmonypy" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "harmonypy~=0.0.6" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/harmonypy/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integrate/harmonypy" + executable: "target/nextflow/integrate/harmonypy/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/integrate/harmonypy/compress_h5mu.py b/target/nextflow/integrate/harmonypy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/integrate/harmonypy/main.nf b/target/nextflow/integrate/harmonypy/main.nf new file mode 100644 index 00000000..00c6d5d4 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/main.nf @@ -0,0 +1,4031 @@ +// harmonypy main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Robrecht Cannoodt (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "harmonypy", + "namespace" : "integrate", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : "Which .obsm slot to use as a starting PCA embedding.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_pca_integrated" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--theta", + "description" : "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters.", + "default" : [ + 2.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_covariates", + "description" : "The .obs field(s) that define the covariate(s) to regress out.", + "example" : [ + "batch", + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony. Based on an implementation in python from https://github.com/slowkow/harmonypy", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "harmonypy~=0.0.6" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integrate/harmonypy/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/integrate/harmonypy", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata +import sys +from harmonypy import run_harmony + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'theta': $( if [ ! -z ${VIASH_PAR_THETA+x} ]; then echo "list(map(float, r'${VIASH_PAR_THETA//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'obs_covariates': $( if [ ! -z ${VIASH_PAR_OBS_COVARIATES+x} ]; then echo "r'${VIASH_PAR_OBS_COVARIATES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading %s from %s", par["modality"], par["input"]) + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + logger.info("Selecting embedding %s", par["obsm_input"]) + pca_embedding = mod.obsm[par["obsm_input"]] + metadata = mod.obs + logger.info( + "Available covariates to regress out: ", ", ".join(metadata.columns.to_list()) + ) + logger.info( + "Running harmonypy with theta %s to regress out the following variables: ", + par["theta"], + ", ".join(par["obs_covariates"]), + ) + ho = run_harmony(pca_embedding, metadata, par["obs_covariates"], theta=par["theta"]) + logger.info("Adding output to slot %s", par["obsm_output"]) + mod.obsm[par["obsm_output"]] = ho.Z_corr.T + logger.info("Writing to %s") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/integrate/harmonypy", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integrate/harmonypy/nextflow.config b/target/nextflow/integrate/harmonypy/nextflow.config new file mode 100644 index 00000000..b2cf9f84 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integrate/harmonypy' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs Harmony integration based as described in https://github.com/immunogenomics/harmony. Based on an implementation in python from https://github.com/slowkow/harmonypy' + author = 'Dries Schaumont, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integrate/harmonypy/nextflow_labels.config b/target/nextflow/integrate/harmonypy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/integrate/harmonypy/nextflow_schema.json b/target/nextflow/integrate/harmonypy/nextflow_schema.json new file mode 100644 index 00000000..20d00b72 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/nextflow_schema.json @@ -0,0 +1,95 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "harmonypy", + "description": "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony. Based on an implementation in python from https://github.com/slowkow/harmonypy", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obsm_input": { + "type": "string", + "description": "Which .obsm slot to use as a starting PCA embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca_integrated\"`. ", + "default": "X_pca_integrated" + }, + "theta": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Diversity clustering penalty parameter", + "help_text": "Type: `double`, multiple: `True`, default: `[2.0]`. ", + "default": [ + 2.0 + ] + }, + "obs_covariates": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The .obs field(s) that define the covariate(s) to regress out.", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"batch\";\"sample\"]`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/harmonypy/setup_logger.py b/target/nextflow/integrate/harmonypy/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/integrate/harmonypy/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/integrate/scanorama/.config.vsh.yaml b/target/nextflow/integrate/scanorama/.config.vsh.yaml new file mode 100644 index 00000000..a8bf808c --- /dev/null +++ b/target/nextflow/integrate/scanorama/.config.vsh.yaml @@ -0,0 +1,360 @@ +name: "scanorama" +namespace: "integrate" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output .h5mu file" + info: null + default: + - "output.h5ad" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "batch" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Basis obsm slot to run scanorama on." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The name of the field in adata.obsm where the integrated embeddings\ + \ will be stored after running this function. Defaults to X_scanorama." + info: null + default: + - "X_scanorama" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn" + description: "Number of nearest neighbors to use for matching." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size used in the alignment vector computation. Useful\ + \ when integrating very large (>100k samples) datasets. Set to large value that\ + \ runs within available memory." + info: null + default: + - 5000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--sigma" + description: "Correction smoothing parameter on Gaussian kernel." + info: null + default: + - 15.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--approx" + description: "Use approximate nearest neighbors with Python annoy; greatly speeds\ + \ up matching runtime." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "Alignment score minimum cutoff" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Use Scanorama to integrate different experiments.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "highmem" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scanorama" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scanorama/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integrate/scanorama" + executable: "target/nextflow/integrate/scanorama/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/integrate/scanorama/compress_h5mu.py b/target/nextflow/integrate/scanorama/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/integrate/scanorama/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/integrate/scanorama/main.nf b/target/nextflow/integrate/scanorama/main.nf new file mode 100644 index 00000000..76867b5b --- /dev/null +++ b/target/nextflow/integrate/scanorama/main.nf @@ -0,0 +1,4084 @@ +// scanorama main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scanorama", + "namespace" : "integrate", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output .h5mu file", + "default" : [ + "output.h5ad" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch", + "description" : "Column name discriminating between your batches.", + "default" : [ + "batch" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : "Basis obsm slot to run scanorama on.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "The name of the field in adata.obsm where the integrated embeddings will be stored after running this function. Defaults to X_scanorama.", + "default" : [ + "X_scanorama" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--knn", + "description" : "Number of nearest neighbors to use for matching.", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory.", + "default" : [ + 5000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--sigma", + "description" : "Correction smoothing parameter on Gaussian kernel.", + "default" : [ + 15.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--approx", + "description" : "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alpha", + "description" : "Alignment score minimum cutoff", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Use Scanorama to integrate different experiments.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "build-essential" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "scanorama" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integrate/scanorama/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/integrate/scanorama", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from scanpy.external.pp import scanorama_integrate +from mudata import read_h5ad + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'knn': $( if [ ! -z ${VIASH_PAR_KNN+x} ]; then echo "int(r'${VIASH_PAR_KNN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sigma': $( if [ ! -z ${VIASH_PAR_SIGMA+x} ]; then echo "float(r'${VIASH_PAR_SIGMA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'approx': $( if [ ! -z ${VIASH_PAR_APPROX+x} ]; then echo "r'${VIASH_PAR_APPROX//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'alpha': $( if [ ! -z ${VIASH_PAR_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_ALPHA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input %s", par["modality"], par["input"]) +mod = read_h5ad(par["input"], mod=par["modality"]) + +arguments = { + "key": par["obs_batch"], + "basis": par["obsm_input"], + "adjusted_basis": par["obsm_output"], + "knn": par["knn"], + "alpha": par["alpha"], + "sigma": par["sigma"], + "approx": par["approx"], + "batch_size": par["batch_size"], +} + +logger.info( + "Running scanorama with parameters: \\\\n", + "\\\\n\\\\t".join( + [ + f"{argument_name}: {argument_value}" + for argument_name, argument_value in arguments.items() + ] + ), +) +# Integration. +scanorama_integrate(mod, **arguments) +logger.info("Writing output to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] +) + +logger.info("Finished!") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/integrate/scanorama", + "tag" : "main" + }, + "label" : [ + "midcpu", + "highmem", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integrate/scanorama/nextflow.config b/target/nextflow/integrate/scanorama/nextflow.config new file mode 100644 index 00000000..f59f0a7c --- /dev/null +++ b/target/nextflow/integrate/scanorama/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integrate/scanorama' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Use Scanorama to integrate different experiments.\n' + author = 'Dries De Maeyer, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integrate/scanorama/nextflow_labels.config b/target/nextflow/integrate/scanorama/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/integrate/scanorama/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/integrate/scanorama/nextflow_schema.json b/target/nextflow/integrate/scanorama/nextflow_schema.json new file mode 100644 index 00000000..5ab6003e --- /dev/null +++ b/target/nextflow/integrate/scanorama/nextflow_schema.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scanorama", + "description": "Use Scanorama to integrate different experiments.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output .h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5ad\"`, direction: `output`. ", + "default": "output.h5ad" + }, + "obs_batch": { + "type": "string", + "description": "Column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"batch\"`. ", + "default": "batch" + }, + "obsm_input": { + "type": "string", + "description": "Basis obsm slot to run scanorama on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obsm_output": { + "type": "string", + "description": "The name of the field in adata.obsm where the integrated embeddings will be stored after running this function", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scanorama\"`. ", + "default": "X_scanorama" + }, + "knn": { + "type": "integer", + "description": "Number of nearest neighbors to use for matching.", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "batch_size": { + "type": "integer", + "description": "The batch size used in the alignment vector computation", + "help_text": "Type: `integer`, multiple: `False`, default: `5000`. ", + "default": 5000 + }, + "sigma": { + "type": "number", + "description": "Correction smoothing parameter on Gaussian kernel.", + "help_text": "Type: `double`, multiple: `False`, default: `15.0`. ", + "default": 15.0 + }, + "approx": { + "type": "boolean", + "description": "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "alpha": { + "type": "number", + "description": "Alignment score minimum cutoff", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/scanorama/setup_logger.py b/target/nextflow/integrate/scanorama/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/integrate/scanorama/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/integrate/scarches/.config.vsh.yaml b/target/nextflow/integrate/scarches/.config.vsh.yaml new file mode 100644 index 00000000..70e5837c --- /dev/null +++ b/target/nextflow/integrate/scarches/.config.vsh.yaml @@ -0,0 +1,482 @@ +name: "scarches" +namespace: "integrate" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input (query) dataset" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file to use as a query" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer to be used for scArches, if .X is not to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch" + description: "Name of the .obs column with batch information." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_label" + description: "Name of the .obs column with celltype information." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "Name of the .var column with gene names, if the var .index is not\ + \ to be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference" + arguments: + - type: "file" + name: "--reference" + alternatives: + - "-r" + description: "Path to the directory with reference model or a web link." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_class" + description: "For legacy models; the type of model (where the type of model was\ + \ not saved with it; e.g. when they were generated with scvi-tools versions\ + \ < 0.15)." + info: null + example: + - "SCANVI" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_output" + description: "Output directory for model" + info: null + default: + - "model" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_integrated_scanvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_predictions" + description: "In which .obs slot to store the resulting label predictions. Only\ + \ relevant if a scANVI model was provided." + info: null + default: + - "scanvi_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output_probabilities" + description: "In which .obs slot to store the probabilities of the label predictions.\ + \ Only relevant if a scANVI model was provided." + info: null + default: + - "scanvi_proba" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Early stopping arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs reference mapping with scArches" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scanvi_model" +- type: "file" + path: "scvi_model" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "HLCA_reference_model.zip" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "jax[cuda]" + - "scvi-tools~=1.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scarches/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integrate/scarches" + executable: "target/nextflow/integrate/scarches/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/integrate/scarches/compress_h5mu.py b/target/nextflow/integrate/scarches/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/integrate/scarches/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/integrate/scarches/main.nf b/target/nextflow/integrate/scarches/main.nf new file mode 100644 index 00000000..f5861741 --- /dev/null +++ b/target/nextflow/integrate/scarches/main.nf @@ -0,0 +1,4531 @@ +// scarches main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) +// * Dorien Roosen (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scarches", + "namespace" : "integrate", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Arguments related to the input (query) dataset", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file to use as a query", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Layer to be used for scArches, if .X is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch", + "description" : "Name of the .obs column with batch information.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_label", + "description" : "Name of the .obs column with celltype information.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "Name of the .var column with gene names, if the var .index is not to be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_size_factor", + "description" : "Key in adata.obs for size factor information. Instead of using library size as a size factor,\nthe provided size factor column will be used as offset in the mean of the likelihood.\nAssumed to be on linear scale.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "alternatives" : [ + "-r" + ], + "description" : "Path to the directory with reference model or a web link.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_class", + "description" : "For legacy models; the type of model (where the type of model was not saved with it; e.g. when they were generated with scvi-tools versions < 0.15).", + "example" : [ + "SCANVI" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_output", + "description" : "Output directory for model", + "default" : [ + "model" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_integrated_scanvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_output_predictions", + "description" : "In which .obs slot to store the resulting label predictions. Only relevant if a scANVI model was provided.", + "default" : [ + "scanvi_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_output_probabilities", + "description" : "In which .obs slot to store the probabilities of the label predictions. Only relevant if a scANVI model was provided.", + "default" : [ + "scanvi_proba" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Early stopping arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Learning parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs reference mapping with scArches", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/scanvi_model" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/scvi_model" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/HLCA_reference_model/HLCA_reference_model.zip" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "highdisk", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:25.05-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "jax[cuda]", + "scvi-tools~=1.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integrate/scarches/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/integrate/scarches", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata +from anndata import AnnData +import scvi +import pandas as pd +import numpy as np +import torch +from tempfile import TemporaryDirectory +from pathlib import Path +import pickle + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obs_size_factor': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_SIZE_FACTOR+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_SIZE_FACTOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_class': $( if [ ! -z ${VIASH_PAR_REFERENCE_CLASS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_CLASS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_output': $( if [ ! -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then echo "r'${VIASH_PAR_MODEL_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_output_predictions': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_output_probabilities': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT_PROBABILITIES+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT_PROBABILITIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def _read_model_name_from_registry(model_path) -> str: + """Read registry with information about the model, return the model name""" + registry = scvi.model.base.BaseModelClass.load_registry(model_path) + return registry["model_name"] + + +def _detect_base_model(model_path): + """Read from the model's file which scvi_tools model it contains""" + return getattr(scvi.model, _read_model_name_from_registry(model_path)) + + +def _validate_obs_metadata_params(model_registry, model_name): + """ + Validates .obs metadata par variables that are required by scvi-tools models. + + This function checks that necessary .obs metadata field names (batch, labels, size factors) + specified in the model registry have the required corresponding parameters provided in the input. + + Parameters + ---------- + model_registry : dict + Dictionary containing model configuration and requirements. + model_name : str + Name of the model being validated. + """ + + # Maps the keys of the model registry to their corresponding par keys containing the .obs metadata + registry_par_mapper = { + "batch_key": "input_obs_batch", + "labels_key": "input_obs_label", + "size_factor_key": "input_obs_size_factor", + } + + for registry_key, key in registry_par_mapper.items(): + model_registry_key = model_registry.get(registry_key) + par_key = par[key] + + if model_registry_key and not par_key: + if key != "input_obs_label" or not model_registry.get("unlabeled_category"): + raise ValueError( + f"The provided {model_name} model requires \\`--{key}\\` to be provided." + ) + + elif par_key and not model_registry_key: + logger.warning( + f"\\`--{key}\\` was provided but is not used in the provided {model_name} model." + ) + + +def _align_query_with_registry(adata_query, model_path): + """ + Creates a qeury AnnData object with the expected structure and metadata fields that are aligned with the pre-trained reference model. + + Parameters + ---------- + adata_query : AnnData + The query AnnData object to be aligned with the model structure. + model_path : str + Path to the directory containing the pre-trained model. + + Returns + ------- + AnnData + A new AnnData object with structure and metadata aligned to match the + requirements of the pre-trained model. + """ + + model = _detect_base_model(model_path) + model_name = _read_model_name_from_registry(model_path) + model_registry = model.load_registry(model_path)["setup_args"] + _validate_obs_metadata_params(model_registry, model_name) + + # Sanitize gene names and set as index of the AnnData object + # all scArches VAE models expect gene names to be in the .var index + adata_query = set_var_index(adata_query, par["input_var_gene_names"]) + + # align layer + query_layer = ( + adata_query.X if not par["layer"] else adata_query.layers[par["layer"]] + ) + var_index = adata_query.var_names.tolist() + obs_index = adata_query.obs_names.tolist() + + # align observations + query_obs = {} + + ## batch_key, size_factor_key + simple_mappings = { + "batch_key": "input_obs_batch", # relevant for AUTOZI, LinearSCVI, PEAKVI, SCANVI, SCVI, TOTALVI, MULTIVI, JaxSCVI + "size_factor_key": "input_obs_size_factor", # relevant for SCANVI, SCVI, TOTALVI, MULTIVI + } + + for registry_key, par_key in simple_mappings.items(): + if model_registry.get(registry_key): + query_obs[model_registry[registry_key]] = adata_query.obs[ + par[par_key] + ].tolist() + + ## labels-key, relevant for AUTOZI, CondSCVI, LinearSCVI, PEAKVI, SCANVI, SCVI + if model_registry.get("labels_key"): + if par["input_obs_label"]: + query_obs[model_registry["labels_key"]] = adata_query.obs[ + par["input_obs_label"] + ].tolist() + else: + adata_query.obs[model_registry["labels_key"]] = model_registry[ + "unlabeled_category" + ] + query_obs[model_registry["labels_key"]] = adata_query.obs[ + model_registry["labels_key"] + ].tolist() + + obs = pd.DataFrame(query_obs, index=obs_index) + var = pd.DataFrame(index=var_index) + + aligned_query_anndata = AnnData(X=query_layer, obs=obs, var=var) + if model_registry["layer"]: + aligned_query_anndata.layers[model_registry["layer"]] = query_layer + + return aligned_query_anndata + + +def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1): + """ + A function to map the query data to the reference atlas. + + Input: + * adata_query: An AnnData object with the query + * model_path: The reference model directory + + Output: + * vae_query: the trained scvi_tools model + * adata_query: The AnnData object with the query preprocessed for the mapping to the reference + """ + model = _detect_base_model(model_path) + + # Keys of the AnnData query object need to match the exact keys in the reference model registry + + aligned_adata_query = _align_query_with_registry(adata_query, model_path) + + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + logger.warning( + "ValueError thrown when preparing adata for mapping. Clearing .varm field to prevent it" + ) + aligned_adata_query.varm.clear() + try: + model.prepare_query_anndata(aligned_adata_query, model_path) + except ValueError: + raise ValueError( + f"Could not perform model.prepare_model_anndata, likely because the model was trained with different var names then were found in the index. \\\\n\\\\nmodel var_names: {model.prepare_query_anndata(aligned_adata_query, model_path, return_reference_var_names=True).tolist()} \\\\n\\\\nquery data var_names: {aligned_adata_query.var_names.tolist()} " + ) + + try: + # Load query data into the model + vae_query = model.load_query_data( + aligned_adata_query, model_path, freeze_dropout=True + ) + except KeyError: + # Older models do not have the 'setup_method_name' key saved with the model. + # Assume these were generated from an anndata object. + model_torch = torch.load( + f"{model_path}/model.pt", weights_only=False, map_location="cpu" + ) + model_torch["attr_dict"]["registry_"]["setup_method_name"] = "setup_anndata" + + with TemporaryDirectory(dir=meta["temp_dir"]) as tempdir: + temp_file_name = Path(tempdir) / "model.pt" + torch.save(model_torch, temp_file_name) + del model_torch + vae_query = model.load_query_data( + aligned_adata_query, tempdir, freeze_dropout=True + ) + + # Train scArches model for query mapping + vae_query.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + check_val_every_n_epoch=check_val_every_n_epoch, + accelerator="auto", + ) + + return vae_query + + +def _convert_object_dtypes_to_strings(adata): + """Convert object dtypes in .var and .obs to string to prevent error when saving file""" + + def convert_cols(df): + object_cols = df.columns[df.dtypes == "object"] + for col in object_cols: + df[col] = df[col].astype(str) + return df + + adata.var = convert_cols(adata.var) + adata.obs = convert_cols(adata.obs) + + return adata + + +def _get_model_path(model_path: str): + """Obtain path to the directory with reference model. If the proposed \\`model_path\\` is a .zip archive, unzip it. If nesessary, convert model to the new format + + Parameters + ---------- + model_path : str + Path to a directory, where to search for the model or to a zip file containing the model + + Returns + ------- + Path to a directory with reference model in format of scvi-tools>=0.15 + """ + import os + import zipfile + import tempfile + from pathlib import Path + + if os.path.isdir(model_path) and "model.pt" in os.listdir(model_path): + # Probably, the \\`model_path\\` already contains model in the output format of scvi-tools>=0.15 + return model_path + + # The model either has old format or is a zip file downloaded from Zenodo + new_directory = Path(tempfile.TemporaryDirectory().name) + + if zipfile.is_zipfile(model_path): + with zipfile.ZipFile(model_path) as archive: + archive.extractall(new_directory) + model_dir = next(new_directory.glob("**/*.pt")).parent + + else: + model_dir = next(Path(model_path).glob("**/*.pt")).parent + + if "model_params.pt" in os.listdir(model_dir): + try: + # The current approach is to store the class with the model in a registry + model_class = _detect_base_model(model_dir) + except ValueError: + # Failed to load the registry, try + with (model_dir / "attr.pkl").open("rb") as open_file: + attrs = pickle.load(open_file) + try: + model_name = attrs["registry_"]["model_name"] + except KeyError: + # With older scvi-tools models, the model name was not stored with the model... + model_name = par["reference_class"] + if not model_name: + raise ValueError( + "Could not load reference model. This might be because it is a legacy model for which the model type was not saved with the model. Please provide it manually using the 'reference_class' argument." + ) + if not hasattr(scvi.model, model_name): + raise ValueError( + f"Requested to load legacy model using type {model_name}, but such a class does not exist in \\`scvi.models\\`." + ) + model_class = getattr(scvi.model, model_name) + + # The model is in the \\`directory\\`, but it was generated with scvi-tools<0.15 + # TODO: for new references (that could not be SCANVI based), we need to check the base class somehow. Reading registry does not work with models generated by scvi-tools<0.15 + # Here I assume that the reference model is for HLCA and thus is SCANVI based + converted_model_path = os.path.join(model_dir, "converted") + model_class.convert_legacy_save(model_dir, converted_model_path) + return converted_model_path + + elif "model.pt" in os.listdir(model_dir): + # Archive contained model in the new format, so just return the directory + return model_dir + + else: + raise ValueError( + "Cannot find model in the provided reference path. Please, provide a path or a link to the directory with reference model. For HLCA use https://zenodo.org/record/6337966/files/HLCA_reference_model.zip" + ) + + +def main(): + logger.info("Reading %s, modality %s", par["input"], par["modality"]) + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_query = adata.copy() + + model_path = _get_model_path(par["reference"]) + vae_query = map_to_existing_reference(adata_query, model_path=model_path) + model_name = _read_model_name_from_registry(model_path) + + # Save info about the used model + adata.uns["integration_method"] = model_name + + logger.info("Trying to write latent representation") + output_key = par["obsm_output"].format(model_name=model_name) + adata.obsm[output_key] = vae_query.get_latent_representation() + + # In addition to integrating the data, scANVI also performs label annotations + if model_name == "SCANVI": + adata.obs[par["obs_output_predictions"]] = vae_query.predict() + adata.obs[par["obs_output_probabilities"]] = np.max( + vae_query.predict(soft=True), axis=1 + ) + + logger.info("Converting dtypes") + adata = _convert_object_dtypes_to_strings(adata) + + logger.info( + "Saving h5mu file to %s with compression %s", + par["output"], + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + + logger.info("Saving model") + vae_query.save(par["model_output"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/integrate/scarches", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "highdisk", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integrate/scarches/nextflow.config b/target/nextflow/integrate/scarches/nextflow.config new file mode 100644 index 00000000..579e3618 --- /dev/null +++ b/target/nextflow/integrate/scarches/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integrate/scarches' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs reference mapping with scArches' + author = 'Vladimir Shitov, Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integrate/scarches/nextflow_labels.config b/target/nextflow/integrate/scarches/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/integrate/scarches/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/integrate/scarches/nextflow_schema.json b/target/nextflow/integrate/scarches/nextflow_schema.json new file mode 100644 index 00000000..2cb0daae --- /dev/null +++ b/target/nextflow/integrate/scarches/nextflow_schema.json @@ -0,0 +1,217 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scarches", + "description": "Performs reference mapping with scArches", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Arguments related to the input (query) dataset", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file to use as a query", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "layer": { + "type": "string", + "description": "Layer to be used for scArches, if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_obs_batch": { + "type": "string", + "description": "Name of the .obs column with batch information.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_label": { + "type": "string", + "description": "Name of the .obs column with celltype information.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "Name of the .var column with gene names, if the var .index is not to be used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_size_factor": { + "type": "string", + "description": "Key in adata.obs for size factor information", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "model_output": { + "type": "string", + "format": "path", + "description": "Output directory for model", + "help_text": "Type: `file`, multiple: `False`, default: `\"model\"`, direction: `output`. ", + "default": "model" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_integrated_scanvi\"`. ", + "default": "X_integrated_scanvi" + }, + "obs_output_predictions": { + "type": "string", + "description": "In which .obs slot to store the resulting label predictions", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_pred\"`. ", + "default": "scanvi_pred" + }, + "obs_output_probabilities": { + "type": "string", + "description": "In which .obs slot to store the probabilities of the label predictions", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_proba\"`. ", + "default": "scanvi_proba" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "reference": { + "title": "Reference", + "type": "object", + "description": "No description", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the directory with reference model or a web link.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "reference_class": { + "type": "string", + "description": "For legacy models; the type of model (where the type of model was not saved with it; e.g", + "help_text": "Type: `string`, multiple: `False`, example: `\"SCANVI\"`. " + } + } + }, + "early stopping arguments": { + "title": "Early stopping arguments", + "type": "object", + "description": "No description", + "properties": { + "early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + } + } + }, + "learning parameters": { + "title": "Learning parameters", + "type": "object", + "description": "No description", + "properties": { + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/reference" + }, + { + "$ref": "#/$defs/early stopping arguments" + }, + { + "$ref": "#/$defs/learning parameters" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/scarches/set_var_index.py b/target/nextflow/integrate/scarches/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/integrate/scarches/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/integrate/scarches/setup_logger.py b/target/nextflow/integrate/scarches/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/integrate/scarches/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/integrate/scvi/.config.vsh.yaml b/target/nextflow/integrate/scvi/.config.vsh.yaml new file mode 100644 index 00000000..2ad49e4e --- /dev/null +++ b/target/nextflow/integrate/scvi/.config.vsh.yaml @@ -0,0 +1,638 @@ +name: "scvi" +namespace: "integrate" +version: "main" +authors: +- name: "Malte D. Luecken" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "malte.luecken@helmholtz-muenchen.de" + github: "LuckyMD" + orcid: "0000-0001-7464-7921" + linkedin: "malte-l%C3%BCcken-b8b21049" + twitter: "MDLuecken" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "Group Leader" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Matthias Beyens" + roles: + - "contributor" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_labels" + description: "Key in adata.obs for label information. Categories will automatically\ + \ be \nconverted into integer categories and saved to adata.obs['_scvi_labels'].\n\ + If None, assigns the same label to all the data.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_categorical_covariate" + description: "Keys in adata.obs that correspond to categorical data. These covariates\ + \ can be added in\naddition to the batch covariate and are also treated as nuisance\ + \ factors\n(i.e., the model tries to minimize their effects on the latent space).\n\ + Thus, these should not be used for biologically-relevant factors that you do\ + \ _not_ want to correct for.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--obs_continuous_covariate" + description: "Keys in adata.obs that correspond to continuous data. These covariates\ + \ can be added in\naddition to the batch covariate and are also treated as nuisance\ + \ factors\n(i.e., the model tries to minimize their effects on the latent space).\ + \ Thus, these should not be\nused for biologically-relevant factors that you\ + \ do _not_ want to correct for.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Folder where the state of the trained model will be saved to." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scvi_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "SCVI options" + arguments: + - type: "integer" + name: "--n_hidden_nodes" + description: "Number of nodes per hidden layer." + info: null + default: + - 128 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_dimensions_latent_space" + description: "Dimensionality of the latent space." + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_hidden_layers" + description: "Number of hidden layers used for encoder and decoder neural-networks." + info: null + default: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--dropout_rate" + description: "Dropout rate for the neural networks." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--dispersion" + description: "Set the behavior for the dispersion for negative binomial distributions:\n\ + - gene: dispersion parameter of negative binomial is constant per gene across\ + \ cells\n- gene-batch: dispersion can differ between different batches\n- gene-label:\ + \ dispersion can differ between different labels\n- gene-cell: dispersion can\ + \ differ for every gene in every cell\n" + info: null + default: + - "gene" + required: false + choices: + - "gene" + - "gene-batch" + - "gene-label" + - "gene-cell" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_likelihood" + description: "Model used to generate the expression data from a count-based likelihood\ + \ distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated\ + \ negative binomial distribution\n- poisson: Poisson distribution\n" + info: null + default: + - "nb" + required: false + choices: + - "nb" + - "zinb" + - "poisson" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variational auto-encoder model options" + arguments: + - type: "string" + name: "--use_layer_normalization" + description: "Neural networks for which to enable layer normalization. \n" + info: null + default: + - "both" + required: false + choices: + - "encoder" + - "decoder" + - "none" + - "both" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--use_batch_normalization" + description: "Neural networks for which to enable batch normalization. \n" + info: null + default: + - "none" + required: false + choices: + - "encoder" + - "decoder" + - "none" + - "both" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--encode_covariates" + description: "Whether to concatenate covariates to expression in encoder" + info: null + direction: "input" + - type: "boolean_true" + name: "--deeply_inject_covariates" + description: "Whether to concatenate covariates into output of hidden layers in\ + \ encoder/decoder. \nThis option only applies when n_layers > 1. The covariates\ + \ are concatenated to\nthe input of subsequent hidden layers.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--use_observed_lib_size" + description: "Use observed library size for RNA as scaling factor in mean of conditional\ + \ distribution.\n" + info: null + direction: "input" +- name: "Early stopping arguments" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Data validition" + arguments: + - type: "integer" + name: "--n_obs_min_count" + description: "Minimum number of cells threshold ensuring that every obs_batch\ + \ category has sufficient observations (cells) for model training." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_var_min_count" + description: "Minimum number of genes threshold ensuring that every var_input\ + \ filter has sufficient observations (genes) for model training." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "set_var_index.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + - "gpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "jax[cuda]" + - "scvi-tools~=1.3.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/scvi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integrate/scvi" + executable: "target/nextflow/integrate/scvi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/integrate/scvi/compress_h5mu.py b/target/nextflow/integrate/scvi/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/integrate/scvi/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/integrate/scvi/main.nf b/target/nextflow/integrate/scvi/main.nf new file mode 100644 index 00000000..c1fcd76b --- /dev/null +++ b/target/nextflow/integrate/scvi/main.nf @@ -0,0 +1,4502 @@ +// scvi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Malte D. Luecken (author) +// * Dries Schaumont (maintainer) +// * Matthias Beyens (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scvi", + "namespace" : "integrate", + "version" : "main", + "authors" : [ + { + "name" : "Malte D. Luecken", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "malte.luecken@helmholtz-muenchen.de", + "github" : "LuckyMD", + "orcid" : "0000-0001-7464-7921", + "linkedin" : "malte-l%C3%BCcken-b8b21049", + "twitter" : "MDLuecken" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "Group Leader" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Matthias Beyens", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "MatthiasBeyens", + "orcid" : "0000-0003-3304-0706", + "email" : "matthias.beyens@gmail.com", + "linkedin" : "mbeyens" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. If None, X is used", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch", + "description" : "Column name discriminating between your batches.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : ".var column containing gene names. By default, use the index.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_labels", + "description" : "Key in adata.obs for label information. Categories will automatically be \nconverted into integer categories and saved to adata.obs['_scvi_labels'].\nIf None, assigns the same label to all the data.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_size_factor", + "description" : "Key in adata.obs for size factor information. Instead of using library size as a size factor,\nthe provided size factor column will be used as offset in the mean of the likelihood.\nAssumed to be on linear scale.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_categorical_covariate", + "description" : "Keys in adata.obs that correspond to categorical data. These covariates can be added in\naddition to the batch covariate and are also treated as nuisance factors\n(i.e., the model tries to minimize their effects on the latent space).\nThus, these should not be used for biologically-relevant factors that you do _not_ want to correct for.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_continuous_covariate", + "description" : "Keys in adata.obs that correspond to continuous data. These covariates can be added in\naddition to the batch covariate and are also treated as nuisance factors\n(i.e., the model tries to minimize their effects on the latent space). Thus, these should not be\nused for biologically-relevant factors that you do _not_ want to correct for.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_model", + "description" : "Folder where the state of the trained model will be saved to.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_scvi_integrated" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "SCVI options", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_hidden_nodes", + "description" : "Number of nodes per hidden layer.", + "default" : [ + 128 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_dimensions_latent_space", + "description" : "Dimensionality of the latent space.", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_hidden_layers", + "description" : "Number of hidden layers used for encoder and decoder neural-networks.", + "default" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--dropout_rate", + "description" : "Dropout rate for the neural networks.", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--dispersion", + "description" : "Set the behavior for the dispersion for negative binomial distributions:\n- gene: dispersion parameter of negative binomial is constant per gene across cells\n- gene-batch: dispersion can differ between different batches\n- gene-label: dispersion can differ between different labels\n- gene-cell: dispersion can differ for every gene in every cell\n", + "default" : [ + "gene" + ], + "required" : false, + "choices" : [ + "gene", + "gene-batch", + "gene-label", + "gene-cell" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gene_likelihood", + "description" : "Model used to generate the expression data from a count-based likelihood distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated negative binomial distribution\n- poisson: Poisson distribution\n", + "default" : [ + "nb" + ], + "required" : false, + "choices" : [ + "nb", + "zinb", + "poisson" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Variational auto-encoder model options", + "arguments" : [ + { + "type" : "string", + "name" : "--use_layer_normalization", + "description" : "Neural networks for which to enable layer normalization. \n", + "default" : [ + "both" + ], + "required" : false, + "choices" : [ + "encoder", + "decoder", + "none", + "both" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--use_batch_normalization", + "description" : "Neural networks for which to enable batch normalization. \n", + "default" : [ + "none" + ], + "required" : false, + "choices" : [ + "encoder", + "decoder", + "none", + "both" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_false", + "name" : "--encode_covariates", + "description" : "Whether to concatenate covariates to expression in encoder", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--deeply_inject_covariates", + "description" : "Whether to concatenate covariates into output of hidden layers in encoder/decoder. \nThis option only applies when n_layers > 1. The covariates are concatenated to\nthe input of subsequent hidden layers.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--use_observed_lib_size", + "description" : "Use observed library size for RNA as scaling factor in mean of conditional distribution.\n", + "direction" : "input" + } + ] + }, + { + "name" : "Early stopping arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Learning parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Data validition", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_obs_min_count", + "description" : "Minimum number of cells threshold ensuring that every obs_batch category has sufficient observations (cells) for model training.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_var_min_count", + "description" : "Minimum number of genes threshold ensuring that every var_input filter has sufficient observations (genes) for model training.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/set_var_index.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "midmem", + "gpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:25.05-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "jax[cuda]", + "scvi-tools~=1.3.1" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integrate/scvi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/integrate/scvi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from scanpy._utils import check_nonnegative_integers +import mudata +import scvi + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_labels': $( if [ ! -z ${VIASH_PAR_OBS_LABELS+x} ]; then echo "r'${VIASH_PAR_OBS_LABELS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_size_factor': $( if [ ! -z ${VIASH_PAR_OBS_SIZE_FACTOR+x} ]; then echo "r'${VIASH_PAR_OBS_SIZE_FACTOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_categorical_covariate': $( if [ ! -z ${VIASH_PAR_OBS_CATEGORICAL_COVARIATE+x} ]; then echo "r'${VIASH_PAR_OBS_CATEGORICAL_COVARIATE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'obs_continuous_covariate': $( if [ ! -z ${VIASH_PAR_OBS_CONTINUOUS_COVARIATE+x} ]; then echo "r'${VIASH_PAR_OBS_CONTINUOUS_COVARIATE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_model': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODEL+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_hidden_nodes': $( if [ ! -z ${VIASH_PAR_N_HIDDEN_NODES+x} ]; then echo "int(r'${VIASH_PAR_N_HIDDEN_NODES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_dimensions_latent_space': $( if [ ! -z ${VIASH_PAR_N_DIMENSIONS_LATENT_SPACE+x} ]; then echo "int(r'${VIASH_PAR_N_DIMENSIONS_LATENT_SPACE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_hidden_layers': $( if [ ! -z ${VIASH_PAR_N_HIDDEN_LAYERS+x} ]; then echo "int(r'${VIASH_PAR_N_HIDDEN_LAYERS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'dropout_rate': $( if [ ! -z ${VIASH_PAR_DROPOUT_RATE+x} ]; then echo "float(r'${VIASH_PAR_DROPOUT_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'dispersion': $( if [ ! -z ${VIASH_PAR_DISPERSION+x} ]; then echo "r'${VIASH_PAR_DISPERSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'gene_likelihood': $( if [ ! -z ${VIASH_PAR_GENE_LIKELIHOOD+x} ]; then echo "r'${VIASH_PAR_GENE_LIKELIHOOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'use_layer_normalization': $( if [ ! -z ${VIASH_PAR_USE_LAYER_NORMALIZATION+x} ]; then echo "r'${VIASH_PAR_USE_LAYER_NORMALIZATION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'use_batch_normalization': $( if [ ! -z ${VIASH_PAR_USE_BATCH_NORMALIZATION+x} ]; then echo "r'${VIASH_PAR_USE_BATCH_NORMALIZATION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'encode_covariates': $( if [ ! -z ${VIASH_PAR_ENCODE_COVARIATES+x} ]; then echo "r'${VIASH_PAR_ENCODE_COVARIATES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'deeply_inject_covariates': $( if [ ! -z ${VIASH_PAR_DEEPLY_INJECT_COVARIATES+x} ]; then echo "r'${VIASH_PAR_DEEPLY_INJECT_COVARIATES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'use_observed_lib_size': $( if [ ! -z ${VIASH_PAR_USE_OBSERVED_LIB_SIZE+x} ]; then echo "r'${VIASH_PAR_USE_OBSERVED_LIB_SIZE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'early_stopping_monitor': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MONITOR+x} ]; then echo "r'${VIASH_PAR_EARLY_STOPPING_MONITOR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'early_stopping_patience': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_PATIENCE+x} ]; then echo "int(r'${VIASH_PAR_EARLY_STOPPING_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'early_stopping_min_delta': $( if [ ! -z ${VIASH_PAR_EARLY_STOPPING_MIN_DELTA+x} ]; then echo "float(r'${VIASH_PAR_EARLY_STOPPING_MIN_DELTA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reduce_lr_on_plateau': $( if [ ! -z ${VIASH_PAR_REDUCE_LR_ON_PLATEAU+x} ]; then echo "r'${VIASH_PAR_REDUCE_LR_ON_PLATEAU//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'lr_factor': $( if [ ! -z ${VIASH_PAR_LR_FACTOR+x} ]; then echo "float(r'${VIASH_PAR_LR_FACTOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'lr_patience': $( if [ ! -z ${VIASH_PAR_LR_PATIENCE+x} ]; then echo "float(r'${VIASH_PAR_LR_PATIENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_obs_min_count': $( if [ ! -z ${VIASH_PAR_N_OBS_MIN_COUNT+x} ]; then echo "int(r'${VIASH_PAR_N_OBS_MIN_COUNT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_var_min_count': $( if [ ! -z ${VIASH_PAR_N_VAR_MIN_COUNT+x} ]; then echo "int(r'${VIASH_PAR_N_VAR_MIN_COUNT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +import sys + +sys.path.append(meta["resources_dir"]) + +from subset_vars import subset_vars +from set_var_index import set_var_index +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +# TODO: optionally, move to qa +# https://github.com/openpipelines-bio/openpipeline/issues/435 +def check_validity_anndata(adata, layer, obs_batch, n_obs_min_count, n_var_min_count): + assert check_nonnegative_integers(adata.layers[layer] if layer else adata.X), ( + "Make sure input adata contains raw_counts" + ) + + assert len(set(adata.var_names)) == len(adata.var_names), ( + "Dataset contains multiple genes with same gene name." + ) + + # Ensure every obs_batch category has sufficient observations + assert min(adata.obs[[obs_batch]].value_counts()) > n_obs_min_count, ( + f"Anndata has fewer than {n_obs_min_count} cells." + ) + + assert adata.n_vars > n_var_min_count, ( + f"Anndata has fewer than {n_var_min_count} genes." + ) + + +def main(): + adata = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + if par["var_input"]: + # Subset to HVG + adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() + else: + adata_subset = adata.copy() + + # Sanitize gene names and set as index of the AnnData object + adata_subset = set_var_index(adata_subset, par["var_gene_names"]) + + check_validity_anndata( + adata_subset, + par["input_layer"], + par["obs_batch"], + par["n_obs_min_count"], + par["n_var_min_count"], + ) + + # Set up the data + scvi.model.SCVI.setup_anndata( + adata_subset, + batch_key=par["obs_batch"], + layer=par["input_layer"], + labels_key=par["obs_labels"], + size_factor_key=par["obs_size_factor"], + categorical_covariate_keys=par["obs_categorical_covariate"], + continuous_covariate_keys=par["obs_continuous_covariate"], + ) + + # Set up the model + vae_uns = scvi.model.SCVI( + adata_subset, + n_hidden=par["n_hidden_nodes"], + n_latent=par["n_dimensions_latent_space"], + n_layers=par["n_hidden_layers"], + dropout_rate=par["dropout_rate"], + dispersion=par["dispersion"], + gene_likelihood=par["gene_likelihood"], + use_layer_norm=par["use_layer_normalization"], + use_batch_norm=par["use_batch_normalization"], + encode_covariates=par[ + "encode_covariates" + ], # Default (True) is for better scArches performance -> maybe don't use this always? + deeply_inject_covariates=par[ + "deeply_inject_covariates" + ], # Default (False) for better scArches performance -> maybe don't use this always? + use_observed_lib_size=par[ + "use_observed_lib_size" + ], # When size_factors are not passed + ) + + plan_kwargs = { + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], + } + + # Train the model + vae_uns.train( + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + plan_kwargs=plan_kwargs, + check_val_every_n_epoch=1, + accelerator="auto", + ) + # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set + + # Get the latent output + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] + ) + if par["output_model"]: + vae_uns.save(par["output_model"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/integrate/scvi", + "tag" : "main" + }, + "label" : [ + "midcpu", + "midmem", + "gpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integrate/scvi/nextflow.config b/target/nextflow/integrate/scvi/nextflow.config new file mode 100644 index 00000000..bbf62d6b --- /dev/null +++ b/target/nextflow/integrate/scvi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integrate/scvi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA' + author = 'Malte D. Luecken, Dries Schaumont, Matthias Beyens' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integrate/scvi/nextflow_labels.config b/target/nextflow/integrate/scvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/integrate/scvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/integrate/scvi/nextflow_schema.json b/target/nextflow/integrate/scvi/nextflow_schema.json new file mode 100644 index 00000000..8e6c056d --- /dev/null +++ b/target/nextflow/integrate/scvi/nextflow_schema.json @@ -0,0 +1,336 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scvi", + "description": "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_batch": { + "type": "string", + "description": "Column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "var_gene_names": { + "type": "string", + "description": ".var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_labels": { + "type": "string", + "description": "Key in adata.obs for label information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_size_factor": { + "type": "string", + "description": "Key in adata.obs for size factor information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_categorical_covariate": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys in adata.obs that correspond to categorical data", + "help_text": "Type: `string`, multiple: `True`. " + }, + "obs_continuous_covariate": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys in adata.obs that correspond to continuous data", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_model": { + "type": "string", + "format": "path", + "description": "Folder where the state of the trained model will be saved to.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_model\"`, direction: `output`. ", + "default": "$id.$key.output_model" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scvi_integrated\"`. ", + "default": "X_scvi_integrated" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "scvi options": { + "title": "SCVI options", + "type": "object", + "description": "No description", + "properties": { + "n_hidden_nodes": { + "type": "integer", + "description": "Number of nodes per hidden layer.", + "help_text": "Type: `integer`, multiple: `False`, default: `128`. ", + "default": 128 + }, + "n_dimensions_latent_space": { + "type": "integer", + "description": "Dimensionality of the latent space.", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + }, + "n_hidden_layers": { + "type": "integer", + "description": "Number of hidden layers used for encoder and decoder neural-networks.", + "help_text": "Type: `integer`, multiple: `False`, default: `2`. ", + "default": 2 + }, + "dropout_rate": { + "type": "number", + "description": "Dropout rate for the neural networks.", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "dispersion": { + "type": "string", + "description": "Set the behavior for the dispersion for negative binomial distributions:\n- gene: dispersion parameter of negative binomial is constant per gene across cells\n- gene-batch: dispersion can differ between different batches\n- gene-label: dispersion can differ between different labels\n- gene-cell: dispersion can differ for every gene in every cell\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene\"`, choices: ``gene`, `gene-batch`, `gene-label`, `gene-cell``. ", + "enum": [ + "gene", + "gene-batch", + "gene-label", + "gene-cell" + ], + "default": "gene" + }, + "gene_likelihood": { + "type": "string", + "description": "Model used to generate the expression data from a count-based likelihood distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated negative binomial distribution\n- poisson: Poisson distribution\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"nb\"`, choices: ``nb`, `zinb`, `poisson``. ", + "enum": [ + "nb", + "zinb", + "poisson" + ], + "default": "nb" + } + } + }, + "variational auto-encoder model options": { + "title": "Variational auto-encoder model options", + "type": "object", + "description": "No description", + "properties": { + "use_layer_normalization": { + "type": "string", + "description": "Neural networks for which to enable layer normalization", + "help_text": "Type: `string`, multiple: `False`, default: `\"both\"`, choices: ``encoder`, `decoder`, `none`, `both``. ", + "enum": [ + "encoder", + "decoder", + "none", + "both" + ], + "default": "both" + }, + "use_batch_normalization": { + "type": "string", + "description": "Neural networks for which to enable batch normalization", + "help_text": "Type: `string`, multiple: `False`, default: `\"none\"`, choices: ``encoder`, `decoder`, `none`, `both``. ", + "enum": [ + "encoder", + "decoder", + "none", + "both" + ], + "default": "none" + }, + "encode_covariates": { + "type": "boolean", + "description": "Whether to concatenate covariates to expression in encoder", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + }, + "deeply_inject_covariates": { + "type": "boolean", + "description": "Whether to concatenate covariates into output of hidden layers in encoder/decoder", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "use_observed_lib_size": { + "type": "boolean", + "description": "Use observed library size for RNA as scaling factor in mean of conditional distribution.\n", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "early stopping arguments": { + "title": "Early stopping arguments", + "type": "object", + "description": "No description", + "properties": { + "early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + } + } + }, + "learning parameters": { + "title": "Learning parameters", + "type": "object", + "description": "No description", + "properties": { + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "data validition": { + "title": "Data validition", + "type": "object", + "description": "No description", + "properties": { + "n_obs_min_count": { + "type": "integer", + "description": "Minimum number of cells threshold ensuring that every obs_batch category has sufficient observations (cells) for model training.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "n_var_min_count": { + "type": "integer", + "description": "Minimum number of genes threshold ensuring that every var_input filter has sufficient observations (genes) for model training.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/scvi options" + }, + { + "$ref": "#/$defs/variational auto-encoder model options" + }, + { + "$ref": "#/$defs/early stopping arguments" + }, + { + "$ref": "#/$defs/learning parameters" + }, + { + "$ref": "#/$defs/data validition" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/scvi/set_var_index.py b/target/nextflow/integrate/scvi/set_var_index.py new file mode 100644 index 00000000..3a1803eb --- /dev/null +++ b/target/nextflow/integrate/scvi/set_var_index.py @@ -0,0 +1,24 @@ +import anndata as ad +import re + + +def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData: + """Sanitize gene names and set the index of the .var DataFrame. + + Parameters + ---------- + adata : AnnData + Annotated data object + var_name : str | None + Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced. + + Returns + ------- + AnnData + Copy of `adata` with sanitized and replaced index + """ + if var_name: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]] + else: + adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index] + return adata diff --git a/target/nextflow/integrate/scvi/subset_vars.py b/target/nextflow/integrate/scvi/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/integrate/scvi/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/integrate/totalvi/.config.vsh.yaml b/target/nextflow/integrate/totalvi/.config.vsh.yaml new file mode 100644 index 00000000..c6b5a8b0 --- /dev/null +++ b/target/nextflow/integrate/totalvi/.config.vsh.yaml @@ -0,0 +1,395 @@ +name: "totalvi" +namespace: "integrate" +version: "main" +authors: +- name: "Vladimir Shitov" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file with query data to integrate with reference." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "-r" + description: "Input h5mu file with reference data to train the TOTALVI model." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--force_retrain" + alternatives: + - "-f" + description: "If true, retrain the model and save it to reference_model_path" + info: null + direction: "input" + - type: "string" + name: "--query_modality" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--query_proteins_modality" + description: "Name of the modality in the input (query) h5mu file containing protein\ + \ data" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_modality" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_proteins_modality" + description: "Name of the modality containing proteins in the reference" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is used" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_integrated_totalvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_normalized_rna_output" + description: "In which .obsm slot to store the normalized RNA from TOTALVI." + info: null + default: + - "X_totalvi_normalized_rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_normalized_protein_output" + description: "In which .obsm slot to store the normalized protein data from TOTALVI." + info: null + default: + - "X_totalvi_normalized_protein" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference_model_path" + description: "Directory with the reference model. If not exists, trained model\ + \ will be saved there" + info: null + default: + - "totalvi_model_reference" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--query_model_path" + description: "Directory, where the query model will be saved" + info: null + default: + - "totalvi_model_query" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset" + info: null + default: + - 400 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_query_epochs" + description: "Number of passes through the dataset, when fine-tuning model for\ + \ query" + info: null + default: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--weight_decay" + description: "Weight decay, when fine-tuning model for query" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:25.05-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "jax[cuda]" + - "scvi-tools~=1.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integrate/totalvi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integrate/totalvi" + executable: "target/nextflow/integrate/totalvi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/integrate/totalvi/main.nf b/target/nextflow/integrate/totalvi/main.nf new file mode 100644 index 00000000..08591337 --- /dev/null +++ b/target/nextflow/integrate/totalvi/main.nf @@ -0,0 +1,4268 @@ +// totalvi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "totalvi", + "namespace" : "integrate", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file with query data to integrate with reference.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "alternatives" : [ + "-r" + ], + "description" : "Input h5mu file with reference data to train the TOTALVI model.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--force_retrain", + "alternatives" : [ + "-f" + ], + "description" : "If true, retrain the model and save it to reference_model_path", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--query_modality", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--query_proteins_modality", + "description" : "Name of the modality in the input (query) h5mu file containing protein data", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_modality", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_proteins_modality", + "description" : "Name of the modality containing proteins in the reference", + "default" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. If None, X is used", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch", + "description" : "Column name discriminating between your batches.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_integrated_totalvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_normalized_rna_output", + "description" : "In which .obsm slot to store the normalized RNA from TOTALVI.", + "default" : [ + "X_totalvi_normalized_rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_normalized_protein_output", + "description" : "In which .obsm slot to store the normalized protein data from TOTALVI.", + "default" : [ + "X_totalvi_normalized_protein" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference_model_path", + "description" : "Directory with the reference model. If not exists, trained model will be saved there", + "default" : [ + "totalvi_model_reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--query_model_path", + "description" : "Directory, where the query model will be saved", + "default" : [ + "totalvi_model_query" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Learning parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset", + "default" : [ + 400 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_query_epochs", + "description" : "Number of passes through the dataset, when fine-tuning model for query", + "default" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--weight_decay", + "description" : "Weight decay, when fine-tuning model for query", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:25.05-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "jax[cuda]", + "scvi-tools~=1.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integrate/totalvi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/integrate/totalvi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from typing import Tuple + +import os +import sys +import mudata +from anndata import AnnData # For type hints +from mudata import MuData # For type hints +import numpy as np +import scvi +from scipy.sparse import issparse + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'force_retrain': $( if [ ! -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then echo "r'${VIASH_PAR_FORCE_RETRAIN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'query_modality': $( if [ ! -z ${VIASH_PAR_QUERY_MODALITY+x} ]; then echo "r'${VIASH_PAR_QUERY_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'query_proteins_modality': $( if [ ! -z ${VIASH_PAR_QUERY_PROTEINS_MODALITY+x} ]; then echo "r'${VIASH_PAR_QUERY_PROTEINS_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_modality': $( if [ ! -z ${VIASH_PAR_REFERENCE_MODALITY+x} ]; then echo "r'${VIASH_PAR_REFERENCE_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_proteins_modality': $( if [ ! -z ${VIASH_PAR_REFERENCE_PROTEINS_MODALITY+x} ]; then echo "r'${VIASH_PAR_REFERENCE_PROTEINS_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_output': $( if [ ! -z ${VIASH_PAR_OBSM_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_normalized_rna_output': $( if [ ! -z ${VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_NORMALIZED_RNA_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_normalized_protein_output': $( if [ ! -z ${VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_NORMALIZED_PROTEIN_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_model_path': $( if [ ! -z ${VIASH_PAR_REFERENCE_MODEL_PATH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_MODEL_PATH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'query_model_path': $( if [ ! -z ${VIASH_PAR_QUERY_MODEL_PATH+x} ]; then echo "r'${VIASH_PAR_QUERY_MODEL_PATH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'max_epochs': $( if [ ! -z ${VIASH_PAR_MAX_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_query_epochs': $( if [ ! -z ${VIASH_PAR_MAX_QUERY_EPOCHS+x} ]; then echo "int(r'${VIASH_PAR_MAX_QUERY_EPOCHS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'weight_decay': $( if [ ! -z ${VIASH_PAR_WEIGHT_DECAY+x} ]; then echo "float(r'${VIASH_PAR_WEIGHT_DECAY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def align_proteins_names( + adata_reference: AnnData, + mdata_query: MuData, + adata_query: AnnData, + reference_proteins_key: str, + query_proteins_key: str, +) -> AnnData: + """Make sure that proteins are located in the same .obsm slot in reference and query. Pad query proteins with zeros if they are absent""" + proteins_reference = adata_reference.obsm[reference_proteins_key] + + # If query has no protein data, put matrix of zeros + if not query_proteins_key or query_proteins_key not in mdata_query.mod: + adata_query.obsm[reference_proteins_key] = np.zeros( + (adata_query.n_obs, proteins_reference.shape[1]) + ) + else: + # Make sure that proteins expression has the same key in query and reference + adata_query.obsm[reference_proteins_key] = adata_query.obsm[query_proteins_key] + + return adata_query + + +def extract_proteins_to_anndata( + mdata: MuData, rna_modality_key, protein_modality_key, input_layer, hvg_var_key=None +) -> AnnData: + """TOTALVI requires data to be stored in AnnData format with protein counts in .obsm slot. This function performs the conversion""" + adata: AnnData = mdata.mod[rna_modality_key].copy() + + if hvg_var_key: + selected_genes = adata.var_names[adata.var[hvg_var_key]] + adata = adata[:, selected_genes].copy() + + if protein_modality_key in mdata.mod: + # Put the proteins modality into .obsm slot + proteins_reference_adata = mdata.mod[protein_modality_key].copy() + + if input_layer is None: + proteins = proteins_reference_adata.X + else: + proteins = proteins_reference_adata.obsm[input_layer] + + if issparse(proteins): + proteins = proteins.toarray() + + adata.obsm[protein_modality_key] = proteins + + return adata + + +def build_reference_model( + adata_reference: AnnData, max_train_epochs: int = 400 +) -> scvi.model.TOTALVI: + vae_reference = scvi.model.TOTALVI( + adata_reference, use_layer_norm="both", use_batch_norm="none" + ) + vae_reference.train(max_train_epochs) + + vae_reference.save(par["reference_model_path"]) + + return vae_reference + + +def is_retraining_model() -> bool: + """Decide, whether reference model should be trained. It happens when no model exists or force_retrain flag is on""" + + trained_model_exists = os.path.isdir(par["reference_model_path"]) and ( + "model.pt" in os.listdir(par["reference_model_path"]) + ) + return not trained_model_exists or par["force_retrain"] + + +def map_query_to_reference( + mdata_reference: MuData, mdata_query: MuData, adata_query: AnnData +) -> Tuple[scvi.model.TOTALVI, AnnData]: + """Build model on the provided reference if necessary, and map query to the reference""" + + adata_reference: AnnData = extract_proteins_to_anndata( + mdata_reference, + rna_modality_key=par["reference_modality"], + protein_modality_key=par["reference_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + scvi.model.TOTALVI.setup_anndata( + adata_reference, + batch_key=par["obs_batch"], + protein_expression_obsm_key=par["reference_proteins_modality"], + ) + + if is_retraining_model(): + vae_reference = build_reference_model( + adata_reference, max_train_epochs=par["max_epochs"] + ) + else: + vae_reference = scvi.model.TOTALVI.load( + dir_path=par["reference_model_path"], adata=adata_reference + ) + + adata_query: AnnData = align_proteins_names( + adata_reference, + mdata_query, + adata_query, + reference_proteins_key=par["reference_proteins_modality"], + query_proteins_key=par["query_proteins_modality"], + ) + + # Reorder genes and pad missing genes with 0s + scvi.model.TOTALVI.prepare_query_anndata(adata_query, vae_reference) + + # Train the model for query + vae_query = scvi.model.TOTALVI.load_query_data(adata_query, vae_reference) + vae_query.train( + par["max_query_epochs"], plan_kwargs=dict(weight_decay=par["weight_decay"]) + ) + + return vae_query, adata_query + + +def main(): + mdata_query = mudata.read(par["input"].strip()) + adata_query = extract_proteins_to_anndata( + mdata_query, + rna_modality_key=par["query_modality"], + protein_modality_key=par["query_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) + + if par["reference"].endswith(".h5mu"): + logger.info("Reading reference") + mdata_reference = mudata.read(par["reference"].strip()) + + logger.info("Mapping query to the reference") + vae_query, adata_query = map_query_to_reference( + mdata_reference, mdata_query, adata_query + ) + else: + raise ValueError("Incorrect format of reference, please provide a .h5mu file") + + adata_query.uns["integration_method"] = "totalvi" + + logger.info("Getting the latent representation of query") + mdata_query.mod[par["query_modality"]].obsm[par["obsm_output"]] = ( + vae_query.get_latent_representation() + ) + + norm_rna, norm_protein = vae_query.get_normalized_expression() + mdata_query.mod[par["query_modality"]].obsm[par["obsm_normalized_rna_output"]] = ( + norm_rna.to_numpy() + ) + + if par["query_proteins_modality"] in mdata_query.mod: + mdata_query.mod[par["query_proteins_modality"]].obsm[ + par["obsm_normalized_protein_output"] + ] = norm_protein.to_numpy() + + logger.info("Updating mdata") + mdata_query.update() + + logger.info("Saving updated query data") + mdata_query.write_h5mu(par["output"].strip()) + + logger.info("Saving query model") + vae_query.save(par["query_model_path"], overwrite=True) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/integrate/totalvi", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integrate/totalvi/nextflow.config b/target/nextflow/integrate/totalvi/nextflow.config new file mode 100644 index 00000000..250bd0cb --- /dev/null +++ b/target/nextflow/integrate/totalvi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integrate/totalvi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integrate/totalvi/nextflow_labels.config b/target/nextflow/integrate/totalvi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/integrate/totalvi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/integrate/totalvi/nextflow_schema.json b/target/nextflow/integrate/totalvi/nextflow_schema.json new file mode 100644 index 00000000..62bedd09 --- /dev/null +++ b/target/nextflow/integrate/totalvi/nextflow_schema.json @@ -0,0 +1,171 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "totalvi", + "description": "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file with query data to integrate with reference.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file with reference data to train the TOTALVI model.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "force_retrain": { + "type": "boolean", + "description": "If true, retrain the model and save it to reference_model_path", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "query_modality": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "query_proteins_modality": { + "type": "string", + "description": "Name of the modality in the input (query) h5mu file containing protein data", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_modality": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "reference_proteins_modality": { + "type": "string", + "description": "Name of the modality containing proteins in the reference", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot\"`. ", + "default": "prot" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_batch": { + "type": "string", + "description": "Column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_integrated_totalvi\"`. ", + "default": "X_integrated_totalvi" + }, + "obsm_normalized_rna_output": { + "type": "string", + "description": "In which .obsm slot to store the normalized RNA from TOTALVI.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_totalvi_normalized_rna\"`. ", + "default": "X_totalvi_normalized_rna" + }, + "obsm_normalized_protein_output": { + "type": "string", + "description": "In which .obsm slot to store the normalized protein data from TOTALVI.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_totalvi_normalized_protein\"`. ", + "default": "X_totalvi_normalized_protein" + }, + "reference_model_path": { + "type": "string", + "format": "path", + "description": "Directory with the reference model", + "help_text": "Type: `file`, multiple: `False`, default: `\"totalvi_model_reference\"`, direction: `output`. ", + "default": "totalvi_model_reference" + }, + "query_model_path": { + "type": "string", + "format": "path", + "description": "Directory, where the query model will be saved", + "help_text": "Type: `file`, multiple: `False`, default: `\"totalvi_model_query\"`, direction: `output`. ", + "default": "totalvi_model_query" + } + } + }, + "learning parameters": { + "title": "Learning parameters", + "type": "object", + "description": "No description", + "properties": { + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset", + "help_text": "Type: `integer`, multiple: `False`, default: `400`. ", + "default": 400 + }, + "max_query_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, when fine-tuning model for query", + "help_text": "Type: `integer`, multiple: `False`, default: `200`. ", + "default": 200 + }, + "weight_decay": { + "type": "number", + "description": "Weight decay, when fine-tuning model for query", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/learning parameters" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integrate/totalvi/setup_logger.py b/target/nextflow/integrate/totalvi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/integrate/totalvi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/interpret/lianapy/.config.vsh.yaml b/target/nextflow/interpret/lianapy/.config.vsh.yaml new file mode 100644 index 00000000..232836a8 --- /dev/null +++ b/target/nextflow/interpret/lianapy/.config.vsh.yaml @@ -0,0 +1,404 @@ +name: "lianapy" +namespace: "interpret" +version: "main" +authors: +- name: "Mauro Saporita" + roles: + - "author" + info: + role: "Contributor" + links: + email: "maurosaporita@gmail.com" + github: "mauro-saporita" + linkedin: "mauro-saporita-930b06a5" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Lead Nextflow Developer" +- name: "Povilas Gibas" + roles: + - "author" + info: + role: "Contributor" + links: + email: "povilasgibas@gmail.com" + github: "PoGibas" + linkedin: "povilas-gibas" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Bioinformatician" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer in anndata.AnnData.layers to use. If None, use mudata.mod[modality].X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--de_method" + description: "Differential expression method. `scanpy.tl.rank_genes_groups` is\ + \ used to rank genes according to 1vsRest." + info: null + default: + - "t-test" + required: false + choices: + - "logreg" + - "t-test" + - "wilcoxon" + - "t-test_overestim_var" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--groupby" + description: "The key of the observations grouping to consider." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--resource_name" + description: "Name of the resource to be loaded and use for ligand-receptor inference." + info: null + default: + - "consensus" + required: false + choices: + - "baccin2019" + - "cellcall" + - "cellchatdb" + - "cellinker" + - "cellphonedb" + - "celltalkdb" + - "connectomedb2020" + - "consensus" + - "embrace" + - "guide2pharma" + - "hpmr" + - "icellnet" + - "italk" + - "kirouac2010" + - "lrdb" + - "mouseconsensus" + - "ramilowski2015" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gene_symbol" + description: "Column name in var DataFrame in which gene symbol are stored." + info: null + default: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--expr_prop" + description: "Minimum expression proportion for the ligands/receptors (and their\ + \ subunits) in the corresponding cell identities. Set to '0', to return unfiltered\ + \ results." + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "Minimum cells per cell identity ('groupby') to be considered for\ + \ downstream analysis." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--aggregate_method" + description: "Method aggregation approach, one of ['mean', 'rra'], where 'mean'\ + \ represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde et\ + \ al., 2014) of the interactions." + info: null + default: + - "rra" + required: false + choices: + - "mean" + - "rra" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--consensus_opts" + description: "Strategies to aggregate interactions across methods. Default is\ + \ None - i.e. ['Specificity', 'Magnitude'] and both specificity and magnitude\ + \ are aggregated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--return_all_lrs" + description: "Bool whether to return all LRs, or only those that surpass the 'expr_prop'\ + \ threshold. Those interactions that do not pass the 'expr_prop' threshold will\ + \ be assigned to the *worst* score of the ones that do. 'False' by default." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_perms" + description: "Number of permutations for the permutation test. Note that this\ + \ is relevant only for permutation-based methods - e.g. 'CellPhoneDB" + info: null + default: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs LIANA integration based as described in https://github.com/saezlab/liana-py" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "liana~=1.5.0" + - "scipy<1.16" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/interpret/lianapy/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/interpret/lianapy" + executable: "target/nextflow/interpret/lianapy/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/interpret/lianapy/compress_h5mu.py b/target/nextflow/interpret/lianapy/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/interpret/lianapy/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/interpret/lianapy/main.nf b/target/nextflow/interpret/lianapy/main.nf new file mode 100644 index 00000000..f0dbc693 --- /dev/null +++ b/target/nextflow/interpret/lianapy/main.nf @@ -0,0 +1,4146 @@ +// lianapy main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Mauro Saporita (author) +// * Povilas Gibas (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "lianapy", + "namespace" : "interpret", + "version" : "main", + "authors" : [ + { + "name" : "Mauro Saporita", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "maurosaporita@gmail.com", + "github" : "mauro-saporita", + "linkedin" : "mauro-saporita-930b06a5" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Lead Nextflow Developer" + } + ] + } + }, + { + "name" : "Povilas Gibas", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "povilasgibas@gmail.com", + "github" : "PoGibas", + "linkedin" : "povilas-gibas" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Bioinformatician" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Layer in anndata.AnnData.layers to use. If None, use mudata.mod[modality].X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--de_method", + "description" : "Differential expression method. `scanpy.tl.rank_genes_groups` is used to rank genes according to 1vsRest.", + "default" : [ + "t-test" + ], + "required" : false, + "choices" : [ + "logreg", + "t-test", + "wilcoxon", + "t-test_overestim_var" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--groupby", + "description" : "The key of the observations grouping to consider.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--resource_name", + "description" : "Name of the resource to be loaded and use for ligand-receptor inference.", + "default" : [ + "consensus" + ], + "required" : false, + "choices" : [ + "baccin2019", + "cellcall", + "cellchatdb", + "cellinker", + "cellphonedb", + "celltalkdb", + "connectomedb2020", + "consensus", + "embrace", + "guide2pharma", + "hpmr", + "icellnet", + "italk", + "kirouac2010", + "lrdb", + "mouseconsensus", + "ramilowski2015" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gene_symbol", + "description" : "Column name in var DataFrame in which gene symbol are stored.", + "default" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--expr_prop", + "description" : "Minimum expression proportion for the ligands/receptors (and their subunits) in the corresponding cell identities. Set to '0', to return unfiltered results.", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells", + "description" : "Minimum cells per cell identity ('groupby') to be considered for downstream analysis.", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--aggregate_method", + "description" : "Method aggregation approach, one of ['mean', 'rra'], where 'mean' represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde et al., 2014) of the interactions.", + "default" : [ + "rra" + ], + "required" : false, + "choices" : [ + "mean", + "rra" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--consensus_opts", + "description" : "Strategies to aggregate interactions across methods. Default is None - i.e. ['Specificity', 'Magnitude'] and both specificity and magnitude are aggregated.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--return_all_lrs", + "description" : "Bool whether to return all LRs, or only those that surpass the 'expr_prop' threshold. Those interactions that do not pass the 'expr_prop' threshold will be assigned to the *worst* score of the ones that do. 'False' by default.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_perms", + "description" : "Number of permutations for the permutation test. Note that this is relevant only for permutation-based methods - e.g. 'CellPhoneDB", + "default" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs LIANA integration based as described in https://github.com/saezlab/liana-py", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "liana~=1.5.0", + "scipy<1.16" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/interpret/lianapy/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/interpret/lianapy", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import liana +import mudata + +# TODO: Remove when grouping labels exist +# For sign/PCA/ +import pandas as pd + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'de_method': $( if [ ! -z ${VIASH_PAR_DE_METHOD+x} ]; then echo "r'${VIASH_PAR_DE_METHOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'groupby': $( if [ ! -z ${VIASH_PAR_GROUPBY+x} ]; then echo "r'${VIASH_PAR_GROUPBY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resource_name': $( if [ ! -z ${VIASH_PAR_RESOURCE_NAME+x} ]; then echo "r'${VIASH_PAR_RESOURCE_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'gene_symbol': $( if [ ! -z ${VIASH_PAR_GENE_SYMBOL+x} ]; then echo "r'${VIASH_PAR_GENE_SYMBOL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'expr_prop': $( if [ ! -z ${VIASH_PAR_EXPR_PROP+x} ]; then echo "float(r'${VIASH_PAR_EXPR_PROP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'aggregate_method': $( if [ ! -z ${VIASH_PAR_AGGREGATE_METHOD+x} ]; then echo "r'${VIASH_PAR_AGGREGATE_METHOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'consensus_opts': $( if [ ! -z ${VIASH_PAR_CONSENSUS_OPTS+x} ]; then echo "r'${VIASH_PAR_CONSENSUS_OPTS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'return_all_lrs': $( if [ ! -z ${VIASH_PAR_RETURN_ALL_LRS+x} ]; then echo "r'${VIASH_PAR_RETURN_ALL_LRS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'n_perms': $( if [ ! -z ${VIASH_PAR_N_PERMS+x} ]; then echo "int(r'${VIASH_PAR_N_PERMS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + # Get input data + mod = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + + # Add dummy grouping labels when they do not exist + if par["groupby"] not in mod.obs: + raise ValueError( + f"Column {par['groupy']} does not exist in " + f".obs for modality {par['modality']}." + ) + mod_col = mod.obs[par["groupby"]] + original_groupby_col = mod_col.copy() + if not isinstance(mod_col, pd.CategoricalDtype): + mod.obs[par["groupby"]] = mod_col.astype(str).astype("category") + + # Solve gene labels + orig_gene_label = mod.var.index + mod.var_names = mod.var[par["gene_symbol"]].astype(str) + mod.var_names_make_unique() + + if not meta.get("cpus"): + meta["cpus"] = 1 + n_jobs = meta["cpus"] - 1 if meta["cpus"] > 2 else 1 + liana.mt.rank_aggregate( + adata=mod, + groupby=par["groupby"], + resource_name=par["resource_name"], + expr_prop=par["expr_prop"], + min_cells=par["min_cells"], + aggregate_method=par["aggregate_method"], + consensus_opts=par["consensus_opts"], + return_all_lrs=par["return_all_lrs"], + layer=par["layer"], + de_method=par["de_method"], + n_perms=par["n_perms"], + n_jobs=n_jobs, + verbose=True, + inplace=True, + use_raw=False, + ) + + # Return original gene labels + mod.var_names = orig_gene_label + + # Undo modifications to groupby column + mod.obs[par["groupby"]] = original_groupby_col + + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/interpret/lianapy", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/interpret/lianapy/nextflow.config b/target/nextflow/interpret/lianapy/nextflow.config new file mode 100644 index 00000000..8a38b42d --- /dev/null +++ b/target/nextflow/interpret/lianapy/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'interpret/lianapy' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs LIANA integration based as described in https://github.com/saezlab/liana-py' + author = 'Mauro Saporita, Povilas Gibas' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/interpret/lianapy/nextflow_labels.config b/target/nextflow/interpret/lianapy/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/interpret/lianapy/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/interpret/lianapy/nextflow_schema.json b/target/nextflow/interpret/lianapy/nextflow_schema.json new file mode 100644 index 00000000..4ac6fac7 --- /dev/null +++ b/target/nextflow/interpret/lianapy/nextflow_schema.json @@ -0,0 +1,159 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "lianapy", + "description": "Performs LIANA integration based as described in https://github.com/saezlab/liana-py", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Layer in anndata.AnnData.layers to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "de_method": { + "type": "string", + "description": "Differential expression method", + "help_text": "Type: `string`, multiple: `False`, default: `\"t-test\"`, choices: ``logreg`, `t-test`, `wilcoxon`, `t-test_overestim_var``. ", + "enum": [ + "logreg", + "t-test", + "wilcoxon", + "t-test_overestim_var" + ], + "default": "t-test" + }, + "groupby": { + "type": "string", + "description": "The key of the observations grouping to consider.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "resource_name": { + "type": "string", + "description": "Name of the resource to be loaded and use for ligand-receptor inference.", + "help_text": "Type: `string`, multiple: `False`, default: `\"consensus\"`, choices: ``baccin2019`, `cellcall`, `cellchatdb`, `cellinker`, `cellphonedb`, `celltalkdb`, `connectomedb2020`, `consensus`, `embrace`, `guide2pharma`, `hpmr`, `icellnet`, `italk`, `kirouac2010`, `lrdb`, `mouseconsensus`, `ramilowski2015``. ", + "enum": [ + "baccin2019", + "cellcall", + "cellchatdb", + "cellinker", + "cellphonedb", + "celltalkdb", + "connectomedb2020", + "consensus", + "embrace", + "guide2pharma", + "hpmr", + "icellnet", + "italk", + "kirouac2010", + "lrdb", + "mouseconsensus", + "ramilowski2015" + ], + "default": "consensus" + }, + "gene_symbol": { + "type": "string", + "description": "Column name in var DataFrame in which gene symbol are stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene_symbol\"`. ", + "default": "gene_symbol" + }, + "expr_prop": { + "type": "number", + "description": "Minimum expression proportion for the ligands/receptors (and their subunits) in the corresponding cell identities", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + }, + "min_cells": { + "type": "integer", + "description": "Minimum cells per cell identity ('groupby') to be considered for downstream analysis.", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "aggregate_method": { + "type": "string", + "description": "Method aggregation approach, one of ['mean', 'rra'], where 'mean' represents the mean rank, while 'rra' is the RobustRankAggregate (Kolde et al., 2014) of the interactions.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rra\"`, choices: ``mean`, `rra``. ", + "enum": [ + "mean", + "rra" + ], + "default": "rra" + }, + "consensus_opts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Strategies to aggregate interactions across methods", + "help_text": "Type: `string`, multiple: `True`. " + }, + "return_all_lrs": { + "type": "boolean", + "description": "Bool whether to return all LRs, or only those that surpass the 'expr_prop' threshold", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "n_perms": { + "type": "integer", + "description": "Number of permutations for the permutation test", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/labels_transfer/knn/.config.vsh.yaml b/target/nextflow/labels_transfer/knn/.config.vsh.yaml new file mode 100644 index 00000000..ab1bcdf3 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/.config.vsh.yaml @@ -0,0 +1,502 @@ +name: "knn" +namespace: "labels_transfer" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The query data to transfer the labels to. Should be a .h5mu file." + info: + label: "Query" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's inference,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + required: false + description: "The embedding to use for the classifier's inference.\ + \ Override using the `--input_obsm_features` argument. If not provided,\ + \ the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g.\ + \ by the same model or preprocessing).\n" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to use." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's inference.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g. by the same\ + \ model or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train classifiers on." + info: + label: "Reference" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's training,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + description: "The embedding to use for the classifier's training.\ + \ Override using the `--reference_obsm_features` argument.\nMake\ + \ sure that embedding was obtained in the same way as the query\ + \ embedding (e.g. by the same model or preprocessing).\n" + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + description: "The target labels to transfer. Override using the `--reference_obs_targets`\ + \ argument." + required: true + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's training.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the query embedding (e.g. by the same model\ + \ or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_targets" + description: "The `.obs` key(s) of the target labels to tranfer." + info: null + default: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels transfered\ + \ from the reference." + info: + label: "Output data" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + obs: + - type: "string" + name: "predictions" + description: "The predicted labels. Override using the `--output_obs_predictions`\ + \ argument." + required: true + - type: "double" + name: "probability" + description: "The probability of the predicted labels. Override using\ + \ the `--output_obs_probability` argument." + required: false + obsm: + - type: "double" + name: "X_scvi" + description: "The embedding used for the classifier's inference. Could\ + \ have any name, specified by `input_obsm_features` argument.\"" + required: false + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Input dataset (query) arguments" + arguments: + - type: "string" + name: "--input_obsm_distances" + description: "The `.obsm` key of the input (query) dataset containing pre-calculated\ + \ distances. \nIf not provided, the distances will be calculated using PyNNDescent.\n\ + Make sure the distance matrix contains distances relative to the reference dataset\ + \ and were obtained in the same way as the reference embedding.\n" + info: null + example: + - "bbknn_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "string" + name: "--reference_obsm_distances" + description: "The `.obsm` key of the reference dataset containing pre-calculated\ + \ distances. \nIf not provided, the distances will be calculated using PyNNDescent.\n" + info: null + example: + - "bbknn_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "KNN label transfer arguments" + arguments: + - type: "string" + name: "--weights" + description: "Weight function used in prediction. Possible values are:\n- `uniform`\ + \ - all points in each neighborhood are weighted equally \n- `distance` - weight\ + \ points by the inverse of their distance\n- `gaussian` - weight points by the\ + \ sum of their Gaussian kernel similarities to each sample\n" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "distance" + - "gaussian" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "The number of neighbors to use in k-neighbor graph structure used\ + \ for fast approximate nearest neighbor search with PyNNDescent. \nLarger values\ + \ will result in more accurate search results at the cost of computation time.\n" + info: null + default: + - 15 + required: false + min: 5 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This component performs label transfer from reference to query using\ + \ a K-Neirest Neighbors classifier.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "pynndescent~=0.5.10" + - "numpy<2" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/labels_transfer/knn/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/labels_transfer/knn" + executable: "target/nextflow/labels_transfer/knn/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/labels_transfer/knn/compress_h5mu.py b/target/nextflow/labels_transfer/knn/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/labels_transfer/knn/helper.py b/target/nextflow/labels_transfer/knn/helper.py new file mode 100644 index 00000000..06d5d3a3 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/helper.py @@ -0,0 +1,42 @@ +def check_arguments(par): + # check output .obs predictions + if not par["output_obs_predictions"]: + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + ) + + # check output .obs uncertainty + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + ) + + return par + + +def get_reference_features(adata_reference, par, logger): + if par["reference_obsm_features"] is None: + logger.info("Using .X of reference data") + train_data = adata_reference.X + else: + logger.info(f"Using .obsm[{par['reference_obsm_features']}] of reference data") + train_data = adata_reference.obsm[par["reference_obsm_features"]] + + return train_data + + +def get_query_features(adata, par, logger): + if par["input_obsm_features"] is None: + logger.info("Using .X of query data") + query_data = adata.X + else: + logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") + query_data = adata.obsm[par["input_obsm_features"]] + + return query_data diff --git a/target/nextflow/labels_transfer/knn/main.nf b/target/nextflow/labels_transfer/knn/main.nf new file mode 100644 index 00000000..092e1bf5 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/main.nf @@ -0,0 +1,4361 @@ +// knn main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "knn", + "namespace" : "labels_transfer", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The query data to transfer the labels to. Should be a .h5mu file.", + "info" : { + "label" : "Query", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "slots" : { + "X" : { + "type" : "double", + "name" : "features", + "required" : false, + "description" : "The expression data to use for the classifier's inference, if `--input_obsm_features` argument is not provided.\n" + }, + "obsm" : [ + { + "type" : "double", + "name" : "features", + "example" : "X_scvi", + "required" : false, + "description" : "The embedding to use for the classifier's inference. Override using the `--input_obsm_features` argument. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing).\n" + } + ] + } + } + } + } + }, + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to use.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obsm_features", + "description" : "The `.obsm` key of the embedding to use for the classifier's inference. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing).\n", + "example" : [ + "X_scvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference dataset arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train classifiers on.", + "info" : { + "label" : "Reference", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "slots" : { + "X" : { + "type" : "double", + "name" : "features", + "required" : false, + "description" : "The expression data to use for the classifier's training, if `--input_obsm_features` argument is not provided.\n" + }, + "obsm" : [ + { + "type" : "double", + "name" : "features", + "example" : "X_scvi", + "description" : "The embedding to use for the classifier's training. Override using the `--reference_obsm_features` argument.\nMake sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing).\n", + "required" : true + } + ], + "obs" : [ + { + "type" : "string", + "name" : "targets", + "multiple" : true, + "example" : [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level" + ], + "description" : "The target labels to transfer. Override using the `--reference_obs_targets` argument.", + "required" : true + } + ] + } + } + } + } + }, + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obsm_features", + "description" : "The `.obsm` key of the embedding to use for the classifier's training. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing).\n", + "example" : [ + "X_scvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_targets", + "description" : "The `.obs` key(s) of the target labels to tranfer.", + "default" : [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The query data in .h5mu format with predicted labels transfered from the reference.", + "info" : { + "label" : "Output data", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "obs" : [ + { + "type" : "string", + "name" : "predictions", + "description" : "The predicted labels. Override using the `--output_obs_predictions` argument.", + "required" : true + }, + { + "type" : "double", + "name" : "probability", + "description" : "The probability of the predicted labels. Override using the `--output_obs_probability` argument.", + "required" : false + } + ], + "obsm" : [ + { + "type" : "double", + "name" : "X_scvi", + "description" : "The embedding used for the classifier's inference. Could have any name, specified by `input_obsm_features` argument.\\"", + "required" : false + } + ] + } + } + } + }, + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted information.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_pred\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_probability\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--input_obsm_distances", + "description" : "The `.obsm` key of the input (query) dataset containing pre-calculated distances. \nIf not provided, the distances will be calculated using PyNNDescent.\nMake sure the distance matrix contains distances relative to the reference dataset and were obtained in the same way as the reference embedding.\n", + "example" : [ + "bbknn_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference dataset arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--reference_obsm_distances", + "description" : "The `.obsm` key of the reference dataset containing pre-calculated distances. \nIf not provided, the distances will be calculated using PyNNDescent.\n", + "example" : [ + "bbknn_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "KNN label transfer arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--weights", + "description" : "Weight function used in prediction. Possible values are:\n- `uniform` - all points in each neighborhood are weighted equally \n- `distance` - weight points by the inverse of their distance\n- `gaussian` - weight points by the sum of their Gaussian kernel similarities to each sample\n", + "default" : [ + "uniform" + ], + "required" : false, + "choices" : [ + "uniform", + "distance", + "gaussian" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_neighbors", + "description" : "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. \nLarger values will result in more accurate search results at the cost of computation time.\n", + "default" : [ + 15 + ], + "required" : false, + "min" : 5, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/labels_transfer/utils/helper.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This component performs label transfer from reference to query using a K-Neirest Neighbors classifier.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "pkg-config", + "libhdf5-dev" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pynndescent~=0.5.10", + "numpy<2" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/labels_transfer/knn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/labels_transfer/knn", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import mudata as mu +import numpy as np +import sys +from pynndescent import PyNNDescentTransformer +from sklearn.neighbors import KNeighborsClassifier + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obsm_features': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_FEATURES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obsm_features': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_FEATURES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_targets': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGETS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obsm_distances': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_DISTANCES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_DISTANCES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obsm_distances': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_DISTANCES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_DISTANCES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'weights': $( if [ ! -z ${VIASH_PAR_WEIGHTS+x} ]; then echo "r'${VIASH_PAR_WEIGHTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger + + +# END TEMPORARY WORKAROUND setup_logger + + +def distances_to_affinities(distances): + # Apply Gaussian kernel to distances + stds = np.std(distances, axis=1) + stds = (2.0 / stds) ** 2 + stds = stds.reshape(-1, 1) + distances_tilda = np.exp(-np.true_divide(distances, stds)) + + # normalize the distances_tilda + # if the sum of a row of the distances tilda equals 0, + # set normalized distances for that row to 1 + # else divide the row values by the value of the sum of the row + distances_tilda_normalized = np.where( + np.sum(distances_tilda, axis=1, keepdims=True) == 0, + 1, + distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True), + ) + return distances_tilda_normalized + + +logger = setup_logger() + +# Reading in data +logger.info( + f"Reading in query dataset {par['input']} and reference datasets {par['reference']}" +) +q_adata = mu.read_h5ad(par["input"], mod=par["modality"]) +r_adata = mu.read_h5ad(par["reference"], mod=par["modality"]) + +# check arguments +logger.info("Checking arguments") +par = check_arguments(par) + +if par["input_obsm_distances"] and par["reference_obsm_distances"]: + logger.info( + "Using pre-calculated distances for KNN classification as provided in \\`--input_obsm_distances\\` and \\`--reference_obsm_distances\\`." + ) + + assert par["input_obsm_distances"] in q_adata.obsm, ( + f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}." + ) + assert par["reference_obsm_distances"] in r_adata.obsm, ( + f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}." + ) + + query_neighbors = q_adata.obsm[par["input_obsm_distances"]] + reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]] + + if query_neighbors.shape[1] != reference_neighbors.shape[1]: + raise ValueError( + "The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset." + ) + + # Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors + # Otherwise reduce n_neighbors for KNN + smallest_neighbor_count = min( + np.diff(query_neighbors.indptr).min(), np.diff(reference_neighbors.indptr).min() + ) + if smallest_neighbor_count < par["n_neighbors"]: + logger.warning( + f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification" + ) + par["n_neighbors"] = smallest_neighbor_count + +elif par["input_obsm_distances"] or par["reference_obsm_distances"]: + raise ValueError( + "Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification." + ) + +elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]: + logger.info( + "No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm." + ) + # Generating training and inference data + train_X = get_reference_features(r_adata, par, logger) + inference_X = get_query_features(q_adata, par, logger) + + neighbors_transformer = PyNNDescentTransformer( + n_neighbors=par["n_neighbors"], + parallel_batch_queries=True, + ) + neighbors_transformer.fit(train_X) + + # Square sparse matrix with distances to n neighbors in reference data + query_neighbors = neighbors_transformer.transform(inference_X) + reference_neighbors = neighbors_transformer.transform(train_X) + +# For each target, train a classifier and predict labels +for obs_tar, obs_pred, obs_proba in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], +): + logger.info(f"Predicting labels for {obs_tar}") + + weights_dict = { + "uniform": "uniform", + "distance": "distance", + "gaussian": distances_to_affinities, + } + + logger.info(f"Using KNN classifier with {par['weights']} weights") + train_y = r_adata.obs[obs_tar].to_numpy() + classifier = KNeighborsClassifier( + n_neighbors=par["n_neighbors"], + metric="precomputed", + weights=weights_dict[par["weights"]], + ) + classifier.fit(X=reference_neighbors, y=train_y) + predicted_labels = classifier.predict(query_neighbors) + probabilities = classifier.predict_proba(query_neighbors).max(axis=1) + + # save_results + logger.info( + f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs" + ) + q_adata.obs[obs_pred] = predicted_labels + q_adata.obs[obs_proba] = probabilities + +logger.info(f"Saving output data to {par['output']}") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], q_adata, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/labels_transfer/knn", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/labels_transfer/knn/nextflow.config b/target/nextflow/labels_transfer/knn/nextflow.config new file mode 100644 index 00000000..9a0a8dc4 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'labels_transfer/knn' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This component performs label transfer from reference to query using a K-Neirest Neighbors classifier.\n' + author = 'Dorien Roosen, Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/labels_transfer/knn/nextflow_labels.config b/target/nextflow/labels_transfer/knn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/labels_transfer/knn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/labels_transfer/knn/nextflow_schema.json b/target/nextflow/labels_transfer/knn/nextflow_schema.json new file mode 100644 index 00000000..e69de29b diff --git a/target/nextflow/labels_transfer/xgboost/.config.vsh.yaml b/target/nextflow/labels_transfer/xgboost/.config.vsh.yaml new file mode 100644 index 00000000..55c4b18c --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/.config.vsh.yaml @@ -0,0 +1,651 @@ +name: "xgboost" +namespace: "labels_transfer" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Input dataset (query) arguments" + arguments: + - type: "file" + name: "--input" + description: "The query data to transfer the labels to. Should be a .h5mu file." + info: + label: "Query" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's inference,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + required: false + description: "The embedding to use for the classifier's inference.\ + \ Override using the `--input_obsm_features` argument. If not provided,\ + \ the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g.\ + \ by the same model or preprocessing).\n" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to use." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's inference.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the reference embedding (e.g. by the same\ + \ model or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference dataset arguments" + arguments: + - type: "file" + name: "--reference" + description: "The reference data to train classifiers on." + info: + label: "Reference" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + slots: + X: + type: "double" + name: "features" + required: false + description: "The expression data to use for the classifier's training,\ + \ if `--input_obsm_features` argument is not provided.\n" + obsm: + - type: "double" + name: "features" + example: "X_scvi" + description: "The embedding to use for the classifier's training.\ + \ Override using the `--reference_obsm_features` argument.\nMake\ + \ sure that embedding was obtained in the same way as the query\ + \ embedding (e.g. by the same model or preprocessing).\n" + required: true + obs: + - type: "string" + name: "targets" + multiple: true + example: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + description: "The target labels to transfer. Override using the `--reference_obs_targets`\ + \ argument." + required: true + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obsm_features" + description: "The `.obsm` key of the embedding to use for the classifier's training.\ + \ If not provided, the `.X` slot will be used instead.\nMake sure that embedding\ + \ was obtained in the same way as the query embedding (e.g. by the same model\ + \ or preprocessing).\n" + info: null + example: + - "X_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_targets" + description: "The `.obs` key(s) of the target labels to tranfer." + info: null + default: + - "ann_level_1" + - "ann_level_2" + - "ann_level_3" + - "ann_level_4" + - "ann_level_5" + - "ann_finest_level" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels transfered\ + \ from the reference." + info: + label: "Output data" + file_format: + type: "h5mu" + mod: + rna: + description: "Modality in AnnData format containing RNA data." + required: true + obs: + - type: "string" + name: "predictions" + description: "The predicted labels. Override using the `--output_obs_predictions`\ + \ argument." + required: true + - type: "double" + name: "probability" + description: "The probability of the predicted labels. Override using\ + \ the `--output_obs_probability` argument." + required: false + obsm: + - type: "double" + name: "X_scvi" + description: "The embedding used for the classifier's inference. Could\ + \ have any name, specified by `input_obsm_features` argument.\"" + required: false + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted information.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Execution arguments" + arguments: + - type: "boolean_true" + name: "--force_retrain" + alternatives: + - "-f" + description: "Retrain models on the reference even if model_output directory already\ + \ has trained classifiers. WARNING! It will rewrite existing classifiers for\ + \ targets in the model_output directory!" + info: null + direction: "input" + - type: "boolean" + name: "--use_gpu" + description: "Use GPU during models training and inference (recommended)." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--verbosity" + alternatives: + - "-v" + description: "The verbosity level for evaluation of the classifier from the range\ + \ [0,2]" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_output" + description: "Output directory for model" + info: null + default: + - "model" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_uns_parameters" + description: "The key in `uns` slot of the output AnnData object to store the\ + \ parameters of the XGBoost classifier." + info: null + default: + - "xgboost_parameters" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Learning parameters" + arguments: + - type: "double" + name: "--learning_rate" + alternatives: + - "--eta" + description: "Step size shrinkage used in update to prevents overfitting. Range:\ + \ [0,1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_split_loss" + alternatives: + - "--gamma" + description: "Minimum loss reduction required to make a further partition on a\ + \ leaf node of the tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_depth" + alternatives: + - "-d" + description: "Maximum depth of a tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 6 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_child_weight" + description: "Minimum sum of instance weight (hessian) needed in a child. See\ + \ https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_delta_step" + description: "Maximum delta step we allow each leaf output to be. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--subsample" + description: "Subsample ratio of the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sampling_method" + description: "The method to use to sample the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "gradient_based" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bytree" + description: "Fraction of columns to be subsampled. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bylevel" + description: "Subsample ratio of columns for each level. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--colsample_bynode" + description: "Subsample ratio of columns for each node (split). Range (0, 1].\ + \ See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--reg_lambda" + alternatives: + - "--lambda" + description: "L2 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--reg_alpha" + alternatives: + - "--alpha" + description: "L1 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scale_pos_weight" + description: "Control the balance of positive and negative weights, useful for\ + \ unbalanced classes. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster\ + \ for the reference" + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "helper.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs label transfer from reference to query using XGBoost classifier" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "annotation_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + method_id: "XGBClassifier" +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "apt" + packages: + - "libopenblas-dev" + - "liblapack-dev" + - "gfortran" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "xgboost~=2.1.3" + - "scikit-learn<1.6" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/labels_transfer/xgboost/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/labels_transfer/xgboost" + executable: "target/nextflow/labels_transfer/xgboost/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/labels_transfer/xgboost/compress_h5mu.py b/target/nextflow/labels_transfer/xgboost/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/labels_transfer/xgboost/helper.py b/target/nextflow/labels_transfer/xgboost/helper.py new file mode 100644 index 00000000..06d5d3a3 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/helper.py @@ -0,0 +1,42 @@ +def check_arguments(par): + # check output .obs predictions + if not par["output_obs_predictions"]: + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + ) + + # check output .obs uncertainty + if not par["output_obs_probability"]: + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), ( + f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + ) + + return par + + +def get_reference_features(adata_reference, par, logger): + if par["reference_obsm_features"] is None: + logger.info("Using .X of reference data") + train_data = adata_reference.X + else: + logger.info(f"Using .obsm[{par['reference_obsm_features']}] of reference data") + train_data = adata_reference.obsm[par["reference_obsm_features"]] + + return train_data + + +def get_query_features(adata, par, logger): + if par["input_obsm_features"] is None: + logger.info("Using .X of query data") + query_data = adata.X + else: + logger.info(f"Using .obsm[{par['input_obsm_features']}] of query data") + query_data = adata.obsm[par["input_obsm_features"]] + + return query_data diff --git a/target/nextflow/labels_transfer/xgboost/main.nf b/target/nextflow/labels_transfer/xgboost/main.nf new file mode 100644 index 00000000..c7c03761 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/main.nf @@ -0,0 +1,4784 @@ +// xgboost main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "xgboost", + "namespace" : "labels_transfer", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input dataset (query) arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The query data to transfer the labels to. Should be a .h5mu file.", + "info" : { + "label" : "Query", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "slots" : { + "X" : { + "type" : "double", + "name" : "features", + "required" : false, + "description" : "The expression data to use for the classifier's inference, if `--input_obsm_features` argument is not provided.\n" + }, + "obsm" : [ + { + "type" : "double", + "name" : "features", + "example" : "X_scvi", + "required" : false, + "description" : "The embedding to use for the classifier's inference. Override using the `--input_obsm_features` argument. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing).\n" + } + ] + } + } + } + } + }, + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to use.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obsm_features", + "description" : "The `.obsm` key of the embedding to use for the classifier's inference. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the reference embedding (e.g. by the same model or preprocessing).\n", + "example" : [ + "X_scvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference dataset arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "The reference data to train classifiers on.", + "info" : { + "label" : "Reference", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "slots" : { + "X" : { + "type" : "double", + "name" : "features", + "required" : false, + "description" : "The expression data to use for the classifier's training, if `--input_obsm_features` argument is not provided.\n" + }, + "obsm" : [ + { + "type" : "double", + "name" : "features", + "example" : "X_scvi", + "description" : "The embedding to use for the classifier's training. Override using the `--reference_obsm_features` argument.\nMake sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing).\n", + "required" : true + } + ], + "obs" : [ + { + "type" : "string", + "name" : "targets", + "multiple" : true, + "example" : [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level" + ], + "description" : "The target labels to transfer. Override using the `--reference_obs_targets` argument.", + "required" : true + } + ] + } + } + } + } + }, + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obsm_features", + "description" : "The `.obsm` key of the embedding to use for the classifier's training. If not provided, the `.X` slot will be used instead.\nMake sure that embedding was obtained in the same way as the query embedding (e.g. by the same model or preprocessing).\n", + "example" : [ + "X_scvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_targets", + "description" : "The `.obs` key(s) of the target labels to tranfer.", + "default" : [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The query data in .h5mu format with predicted labels transfered from the reference.", + "info" : { + "label" : "Output data", + "file_format" : { + "type" : "h5mu", + "mod" : { + "rna" : { + "description" : "Modality in AnnData format containing RNA data.", + "required" : true, + "obs" : [ + { + "type" : "string", + "name" : "predictions", + "description" : "The predicted labels. Override using the `--output_obs_predictions` argument.", + "required" : true + }, + { + "type" : "double", + "name" : "probability", + "description" : "The probability of the predicted labels. Override using the `--output_obs_probability` argument.", + "required" : false + } + ], + "obsm" : [ + { + "type" : "double", + "name" : "X_scvi", + "description" : "The embedding used for the classifier's inference. Could have any name, specified by `input_obsm_features` argument.\\"", + "required" : false + } + ] + } + } + } + }, + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted information.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_pred\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_probability\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Execution arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--force_retrain", + "alternatives" : [ + "-f" + ], + "description" : "Retrain models on the reference even if model_output directory already has trained classifiers. WARNING! It will rewrite existing classifiers for targets in the model_output directory!", + "direction" : "input" + }, + { + "type" : "boolean", + "name" : "--use_gpu", + "description" : "Use GPU during models training and inference (recommended).", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--verbosity", + "alternatives" : [ + "-v" + ], + "description" : "The verbosity level for evaluation of the classifier from the range [0,2]", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_output", + "description" : "Output directory for model", + "default" : [ + "model" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_uns_parameters", + "description" : "The key in `uns` slot of the output AnnData object to store the parameters of the XGBoost classifier.", + "default" : [ + "xgboost_parameters" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Learning parameters", + "arguments" : [ + { + "type" : "double", + "name" : "--learning_rate", + "alternatives" : [ + "--eta" + ], + "description" : "Step size shrinkage used in update to prevents overfitting. Range: [0,1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_split_loss", + "alternatives" : [ + "--gamma" + ], + "description" : "Minimum loss reduction required to make a further partition on a leaf node of the tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_depth", + "alternatives" : [ + "-d" + ], + "description" : "Maximum depth of a tree. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 6 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_child_weight", + "description" : "Minimum sum of instance weight (hessian) needed in a child. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_delta_step", + "description" : "Maximum delta step we allow each leaf output to be. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--subsample", + "description" : "Subsample ratio of the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sampling_method", + "description" : "The method to use to sample the training instances. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + "uniform" + ], + "required" : false, + "choices" : [ + "uniform", + "gradient_based" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--colsample_bytree", + "description" : "Fraction of columns to be subsampled. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--colsample_bylevel", + "description" : "Subsample ratio of columns for each level. Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--colsample_bynode", + "description" : "Subsample ratio of columns for each node (split). Range (0, 1]. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--reg_lambda", + "alternatives" : [ + "--lambda" + ], + "description" : "L2 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--reg_alpha", + "alternatives" : [ + "--alpha" + ], + "description" : "L1 regularization term on weights. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--scale_pos_weight", + "description" : "Control the balance of positive and negative weights, useful for unbalanced classes. See https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster for the reference", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "../utils/helper.py" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs label transfer from reference to query using XGBoost classifier", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/" + } + ], + "info" : { + "method_id" : "XGBClassifier" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "apt", + "packages" : [ + "libopenblas-dev", + "liblapack-dev", + "gfortran" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy~=1.10.4", + "xgboost~=2.1.3", + "scikit-learn<1.6" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/labels_transfer/xgboost/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/labels_transfer/xgboost", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import json +import os +from typing import Optional +import yaml +from pathlib import Path + +import mudata +import numpy as np +import pandas as pd +import xgboost as xgb +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.preprocessing import LabelEncoder + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obsm_features': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_FEATURES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obsm_features': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBSM_FEATURES+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBSM_FEATURES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_obs_targets': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_TARGETS+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_TARGETS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'force_retrain': $( if [ ! -z ${VIASH_PAR_FORCE_RETRAIN+x} ]; then echo "r'${VIASH_PAR_FORCE_RETRAIN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'use_gpu': $( if [ ! -z ${VIASH_PAR_USE_GPU+x} ]; then echo "r'${VIASH_PAR_USE_GPU//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'verbosity': $( if [ ! -z ${VIASH_PAR_VERBOSITY+x} ]; then echo "int(r'${VIASH_PAR_VERBOSITY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'model_output': $( if [ ! -z ${VIASH_PAR_MODEL_OUTPUT+x} ]; then echo "r'${VIASH_PAR_MODEL_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_uns_parameters': $( if [ ! -z ${VIASH_PAR_OUTPUT_UNS_PARAMETERS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_UNS_PARAMETERS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'learning_rate': $( if [ ! -z ${VIASH_PAR_LEARNING_RATE+x} ]; then echo "float(r'${VIASH_PAR_LEARNING_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_split_loss': $( if [ ! -z ${VIASH_PAR_MIN_SPLIT_LOSS+x} ]; then echo "float(r'${VIASH_PAR_MIN_SPLIT_LOSS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_depth': $( if [ ! -z ${VIASH_PAR_MAX_DEPTH+x} ]; then echo "int(r'${VIASH_PAR_MAX_DEPTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_child_weight': $( if [ ! -z ${VIASH_PAR_MIN_CHILD_WEIGHT+x} ]; then echo "int(r'${VIASH_PAR_MIN_CHILD_WEIGHT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_delta_step': $( if [ ! -z ${VIASH_PAR_MAX_DELTA_STEP+x} ]; then echo "float(r'${VIASH_PAR_MAX_DELTA_STEP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'subsample': $( if [ ! -z ${VIASH_PAR_SUBSAMPLE+x} ]; then echo "float(r'${VIASH_PAR_SUBSAMPLE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sampling_method': $( if [ ! -z ${VIASH_PAR_SAMPLING_METHOD+x} ]; then echo "r'${VIASH_PAR_SAMPLING_METHOD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'colsample_bytree': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYTREE+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYTREE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'colsample_bylevel': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYLEVEL+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYLEVEL//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'colsample_bynode': $( if [ ! -z ${VIASH_PAR_COLSAMPLE_BYNODE+x} ]; then echo "float(r'${VIASH_PAR_COLSAMPLE_BYNODE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reg_lambda': $( if [ ! -z ${VIASH_PAR_REG_LAMBDA+x} ]; then echo "float(r'${VIASH_PAR_REG_LAMBDA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'reg_alpha': $( if [ ! -z ${VIASH_PAR_REG_ALPHA+x} ]; then echo "float(r'${VIASH_PAR_REG_ALPHA//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scale_pos_weight': $( if [ ! -z ${VIASH_PAR_SCALE_POS_WEIGHT+x} ]; then echo "float(r'${VIASH_PAR_SCALE_POS_WEIGHT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from helper import check_arguments, get_reference_features, get_query_features +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +# look for training params for method +argument_groups = {grp["name"]: grp["arguments"] for grp in config["argument_groups"]} +training_arg_names = [ + arg["name"].replace("--", "") for arg in argument_groups["Learning parameters"] +] +training_params = {arg_name: par[arg_name] for arg_name in training_arg_names} + + +def encode_labels(y): + labels_encoder = LabelEncoder() + labels_encoder.fit(y) + + return labels_encoder.transform(y), labels_encoder + + +def get_model_eval(xgb_model, X_test, y_test, labels_encoder): + preds = xgb_model.predict(X_test) + + cr = classification_report( + labels_encoder.inverse_transform(y_test), + labels_encoder.inverse_transform(preds), + output_dict=True, + ) + cr_df = pd.DataFrame(cr).transpose() + + return cr_df + + +def train_test_split_adata(adata, labels): + train_data = pd.DataFrame(data=adata.X, index=adata.obs_names) + + X_train, X_test, y_train, y_test = train_test_split( + train_data, labels, test_size=0.2, random_state=42, stratify=labels + ) + + return X_train, X_test, y_train, y_test + + +def train_xgb_model(X_train, y_train, gpu=True) -> xgb.XGBClassifier: + n_classes = len(np.unique(y_train)) + objective = "binary:logistic" if n_classes == 2 else "multi:softprob" + + tree_method = "gpu_hist" if gpu else "hist" + xgbc = xgb.XGBClassifier( + tree_method=tree_method, objective=objective, **training_params + ) + xgbc.fit(X_train, y_train) + + return xgbc + + +def build_classifier( + X, y, labels_encoder, label_key, eval_verbosity: Optional[int] = 1, gpu=True +) -> xgb.XGBClassifier: + # Adata prep + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, stratify=y + ) + # Note: Do we need a new train-test split for each classifier? + + # Model training + xgb_model = train_xgb_model(X_train, y_train, gpu=gpu) + + # Model eval + if eval_verbosity != 0: + cr_df = get_model_eval(xgb_model, X_test, y_test, labels_encoder) + + if eval_verbosity == 2: + print(cr_df) + + else: + overall_accuracy = cr_df["support"]["accuracy"] + low_prec_key = cr_df.precision.idxmin() + low_prec_val = cr_df.precision.min() + low_rec_key = cr_df.recall.idxmin() + low_rec_val = cr_df.recall.min() + low_f1_key = cr_df["f1-score"].idxmin() + low_f1_val = cr_df["f1-score"].min() + + print("") + print(f"Summary stats for {label_key} model:") + print(f"Overall accuracy: {overall_accuracy}") + print(f"Min. precision: {low_prec_key}: {low_prec_val}") + print(f"Min. Recall: {low_rec_key}: {low_rec_val}") + print(f"Min. F1-score: {low_f1_key}: {low_f1_val}") + print("") + + return xgb_model + + +def build_ref_classifiers( + adata_reference, + targets, + model_path, + eval_verbosity: Optional[int] = 1, + gpu: Optional[bool] = True, +) -> None: + """ + This function builds xgboost classifiers on a reference embedding for a designated number of + adata_reference.obs columns. Classifier .xgb files and a model_info.json file is written to the \\`model_path\\` + directory. Model evaluation is printed to stdout. + + Inputs: + * \\`adata_reference\\`: The AnnData object that was used to train the reference model + * \\`model_path\\`: The reference model directory where the classifiers will also be stored + * \\`eval_verbosity\\`: The verbosity level for evaluation of the classifier from the range [0;2]. + * \\`gpu\\`: Boolean indicating whether a gpu is available for classifier training + + + Example: + \\`\\`\\` + >>> adata + AnnData object with n_obs x n_vars = 700 x 765 + obs: "ann_finest_level", "ann_level_1" + + >>> os.listdir("/path/to/model") + model_params.pt* + + >>> build_ref_classifiers(adata, "path/to/model", eval_verbosity=1, gpu=True) + >>> os.listdir("/path/to/model") + classifier_ann_finest_level.xgb* model_info.json* + classifier_ann_level_1.xgb* model_params.pt* + \\`\\`\\` + """ + + # Check inputs + if not isinstance(eval_verbosity, int): + raise TypeError("\\`eval_verbosity\\` should be an integer between 0 and 2.") + + if eval_verbosity < 0 or eval_verbosity > 2: + raise ValueError("\\`eval_verbosity\\` should be an integer between 0 and 2.") + + train_data = get_reference_features(adata_reference, par, logger) + + if not os.path.exists(model_path): + os.makedirs(model_path, exist_ok=True) + + # Map from name of classifier to file names + classifiers = dict() + + for label, obs_pred in zip(targets, par["output_obs_predictions"]): + if label not in adata_reference.obs: + raise ValueError(f"{label} is not in the \\`adata\\` object passed!") + + filename = "classifier_" + label + ".xgb" + + labels, labels_encoder = encode_labels(adata_reference.obs[label]) + logger.info(f"Classes: {labels_encoder.classes_}") + + logger.info(f"Building classifier for {label}...") + xgb_model = build_classifier( + X=train_data, + y=labels, + labels_encoder=labels_encoder, + label_key=label, + eval_verbosity=eval_verbosity, + gpu=gpu, + ) + + # Save classifier + logger.info("Saving model") + xgb_model.save_model(os.path.join(model_path, filename)) + + # Store classifier info + classifiers[label] = { + "filename": filename, + "labels": labels_encoder.classes_.tolist(), + "obs_column": obs_pred, + "model_params": training_params, + } + + # Store model_info.json file + model_info = {"classifier_info": classifiers} + + logger.info("Writing model_info to the file") + # Read previous file if it exists + if os.path.exists(model_path + "/model_info.json"): + logger.info("Old model_info file found, updating") + with open(model_path + "/model_info.json", "r") as f: + old_model_info = json.loads(f.read()) + + for key in old_model_info: + if key in model_info: + old_model_info[key].update(model_info[key]) + json_string = json.dumps(old_model_info, indent=4) + + else: + logger.info("Creating a new file") + json_string = json.dumps(model_info, indent=4) + + with open(model_path + "/model_info.json", "w") as f: + f.write(json_string) + + +def project_labels( + query_dataset, + cell_type_classifier_model: xgb.XGBClassifier, + annotation_column_name="label_pred", + probability_column_name="label_probability", + probability_thresh=None, # Note: currently not passed to predict function +): + """ + A function that projects predicted labels onto the query dataset, along with probability estimations. + Performs in-place update of the adata object, adding columns to the \\`obs\\` DataFrame. + + Input: + * \\`query_dataset\\`: The query \\`AnnData\\` object + * \\`model_file\\`: Path to the classification model file + * \\`prediction_key\\`: Column name in \\`adata.obs\\` where to store the predicted labels + * \\`probability_key\\`: Column name in \\`adata.obs\\` where to store the labels probabilities + * \\`probability_thresh\\`: The probability threshold below which we call a cell 'Unknown' + + Output: + Nothing is output, the passed anndata is modified inplace + + """ + + if (probability_thresh is not None) and ( + probability_thresh < 0 or probability_thresh > 1 + ): + raise ValueError("\\`probability_thresh\\` must be \\`None\\` or between 0 and 1.") + + query_data = get_query_features(query_dataset, par, logger) + + # Predict labels and probabilities + query_dataset.obs[annotation_column_name] = cell_type_classifier_model.predict( + query_data + ) + + logger.info("Predicting probabilities") + probs = cell_type_classifier_model.predict_proba(query_data) + + # Format probabilities + df_probs = pd.DataFrame( + probs, + columns=cell_type_classifier_model.classes_, + index=query_dataset.obs_names, + ) + query_dataset.obs[probability_column_name] = df_probs.max(1) + + # Note: this is here in case we want to propose a set of values for the user to accept to seed the + # manual curation of predicted labels + if probability_thresh is not None: + logger.info("Marking uncertain predictions") + query_dataset.obs[annotation_column_name + "_filtered"] = [ + val + if query_dataset.obs[probability_column_name][i] >= probability_thresh + else "Unknown" + for i, val in enumerate(query_dataset.obs[annotation_column_name]) + ] + + return query_dataset + + +def predict( + query_dataset, + cell_type_classifier_model_path, + annotation_column_name: str, + prediction_column_name: str, + probability_column_name: str, + models_info, + use_gpu: bool = False, +) -> pd.DataFrame: + """ + Returns \\`obs\\` DataFrame with prediction columns appended + """ + + tree_method = "gpu_hist" if use_gpu else "hist" + + labels = models_info["classifier_info"][annotation_column_name]["labels"] + + objective = "binary:logistic" if len(labels) == 2 else "multi:softprob" + cell_type_classifier_model = xgb.XGBClassifier( + tree_method=tree_method, objective=objective + ) + + logger.info("Loading model") + cell_type_classifier_model.load_model(fname=cell_type_classifier_model_path) + + logger.info("Predicting labels") + project_labels( + query_dataset, + cell_type_classifier_model, + annotation_column_name=prediction_column_name, + probability_column_name=probability_column_name, + ) + + logger.info("Converting labels from numbers to classes") + labels_encoder = LabelEncoder() + labels_encoder.classes_ = np.array(labels) + query_dataset.obs[prediction_column_name] = labels_encoder.inverse_transform( + query_dataset.obs[prediction_column_name] + ) + + return query_dataset + + +def main(par): + logger.info("Checking arguments") + par = check_arguments(par) + + adata_query = mudata.read_h5ad(par["input"].strip(), mod=par["modality"]) + adata_reference = mudata.read_h5ad(par["reference"], mod=par["modality"]) + + # If classifiers for targets are in the model_output directory, simply open them and run (unless \\`retrain\\` != True) + # If some classifiers are missing, train and save them first + # Predict and save the query data + + targets_to_train = [] + + for obs_target in par["reference_obs_targets"]: + if ( + not os.path.exists(par["model_output"]) + or f"classifier_{obs_target}.xgb" not in os.listdir(par["model_output"]) + or par["force_retrain"] + ): + logger.info(f"Classifier for {obs_target} added to a training schedule") + targets_to_train.append(obs_target) + else: + logger.info(f"Found classifier for {obs_target}, no retraining required") + + build_ref_classifiers( + adata_reference, + targets_to_train, + model_path=par["model_output"], + gpu=par["use_gpu"], + eval_verbosity=par["verbosity"], + ) + + output_uns_parameters = adata_query.uns.get(par["output_uns_parameters"], {}) + + with open(par["model_output"] + "/model_info.json", "r") as f: + models_info = json.loads(f.read()) + + for obs_target, obs_pred, obs_unc in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], + ): + logger.info(f"Predicting {obs_target}") + + adata_query = predict( + query_dataset=adata_query, + cell_type_classifier_model_path=os.path.join( + par["model_output"], "classifier_" + obs_target + ".xgb" + ), + annotation_column_name=obs_target, + prediction_column_name=obs_pred, + probability_column_name=obs_unc, + models_info=models_info, + use_gpu=par["use_gpu"], + ) + + if obs_target in targets_to_train: + # Save information about the transfer to .uns + output_uns_parameters[obs_target] = { + "method": "XGBClassifier", + **training_params, + } + + adata_query.uns[par["output_uns_parameters"]] = output_uns_parameters + + logger.info("Writing output to %s", par["output"]) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata_query, None + ) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/labels_transfer/xgboost", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/labels_transfer/xgboost/nextflow.config b/target/nextflow/labels_transfer/xgboost/nextflow.config new file mode 100644 index 00000000..39107536 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'labels_transfer/xgboost' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs label transfer from reference to query using XGBoost classifier' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/labels_transfer/xgboost/nextflow_labels.config b/target/nextflow/labels_transfer/xgboost/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/labels_transfer/xgboost/nextflow_schema.json b/target/nextflow/labels_transfer/xgboost/nextflow_schema.json new file mode 100644 index 00000000..e69de29b diff --git a/target/nextflow/labels_transfer/xgboost/setup_logger.py b/target/nextflow/labels_transfer/xgboost/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/labels_transfer/xgboost/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml b/target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml new file mode 100644 index 00000000..e1bfe39f --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml @@ -0,0 +1,1174 @@ +name: "bd_rhapsody" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--reads" + description: "Reads (optional) - Path to your FASTQ.GZ formatted read files from\ + \ libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample\ + \ Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n" + info: + config_key: "Reads" + example: + - "WTALibrary_S1_L001_R1_001.fastq.gz" + - "WTALibrary_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reads_atac" + description: "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\n\ + You may specify as many R1/R2/I2 files as you want.\n" + info: + config_key: "Reads_ATAC" + example: + - "ATACLibrary_S2_L001_R1_001.fastq.gz" + - "ATACLibrary_S2_L001_R2_001.fastq.gz" + - "ATACLibrary_S2_L001_I2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "References" + description: "Assay type will be inferred from the provided reference(s).\nDo not\ + \ provide both reference_archive and targeted_reference at the same time.\n \n\ + Valid reference input combinations:\n - reference_archive: WTA only\n - reference_archive\ + \ & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference:\ + \ WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference:\ + \ WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n\ + \ - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n\ + \ - targeted_reference: Targeted only\n - targeted_reference & abseq_reference:\ + \ Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can\ + \ be generated with the bd_rhapsody_make_reference component.\nAlternatively,\ + \ BD also provides standard references which can be downloaded from these locations:\n\ + \n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n\ + \ - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n" + arguments: + - type: "file" + name: "--reference_archive" + description: "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure\ + \ of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level\ + \ folder\n - `star_index/`: sub-folder containing STAR index, that is files\ + \ created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation\ + \ e.g. \"gencode.v43.primary_assembly.annotation.gtf\"\n" + info: + config_key: "Reference_Archive" + example: + - "RhapRef_Human_WTA_2023-02.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targeted_reference" + description: "Path to the targeted reference file in FASTA format.\n" + info: + config_key: "Targeted_Reference" + example: + - "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abseq_reference" + description: "Path to the AbSeq reference file in FASTA format. Only needed if\ + \ BD AbSeq Ab-Oligos are used." + info: + config_key: "AbSeq_Reference" + example: + - "AbSeq_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--supplemental_reference" + alternatives: + - "-s" + description: "Path to the supplemental reference file in FASTA format. Only needed\ + \ if there are additional transgene sequences to be aligned against in a WTA\ + \ assay experiment." + info: + config_key: "Supplemental_Reference" + example: + - "supplemental_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + description: "Outputs for all pipeline runs" + arguments: + - type: "file" + name: "--output_dir" + alternatives: + - "-o" + description: "The unprocessed output directory containing all the outputs from\ + \ the pipeline." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_seurat" + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing\ + \ RSEC molecules data table and all cell annotation metadata." + info: + template: "sample_Seurat.rds" + example: + - "output_seurat.rds" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_mudata" + info: + template: "sample.h5mu" + example: + - "output_mudata.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--metrics_summary" + description: "Metrics Summary. Report containing sequencing, molecules, and cell\ + \ metrics." + info: + template: "sample_Metrics_Summary.csv" + example: + - "metrics_summary.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--pipeline_report" + description: "Pipeline Report. Summary report containing the results from the\ + \ sequencing analysis pipeline run." + info: + template: "sample_Pipeline_Report.html" + example: + - "pipeline_report.html" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--rsec_mols_per_cell" + description: "Molecules per bioproduct per cell bassed on RSEC" + info: + template: "sample_RSEC_MolsPerCell_MEX.zip" + example: + - "RSEC_MolsPerCell_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dbec_mols_per_cell" + description: "Molecules per bioproduct per cell bassed on DBEC. DBEC data table\ + \ is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + info: + template: "sample_DBEC_MolsPerCell_MEX.zip" + example: + - "DBEC_MolsPerCell_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--rsec_mols_per_cell_unfiltered" + description: "Unfiltered tables containing all cell labels with 10 reads." + info: + template: "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip" + example: + - "RSEC_MolsPerCell_Unfiltered_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + info: + template: "sample_Bioproduct.bam" + example: + - "BioProduct.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam_index" + description: "Index file for the alignment file." + info: + template: "sample_Bioproduct.bam.bai" + example: + - "BioProduct.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bioproduct_stats" + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier\ + \ adjustment algorithms on a per-bioproduct basis." + info: + template: "sample_Bioproduct_Stats.csv" + example: + - "Bioproduct_Stats.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dimred_tsne" + description: "t-SNE dimensionality reduction coordinates per cell index" + info: + template: "sample_assay_tSNE_coordinates.csv" + example: + - "tSNE_coordinates.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--dimred_umap" + description: "UMAP dimensionality reduction coordinates per cell index" + info: + template: "sample_assay_UMAP_coordinates.csv" + example: + - "UMAP_coordinates.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--immune_cell_classification" + description: "Immune Cell Classification. Cell type classification based on the\ + \ expression of immune cell markers." + info: + template: "sample_assay_cell_type_experimental.csv" + example: + - "Immune_Cell_Classification.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Multiplex outputs" + description: "Outputs when multiplex option is selected" + arguments: + - type: "file" + name: "--sample_tag_metrics" + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + info: + template: "sample_Sample_Tag_Metrics.csv" + example: + - "Sample_Tag_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_tag_calls" + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + info: + template: "sample_Sample_Tag_Calls.csv" + example: + - "Sample_Tag_Calls.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_tag_counts" + description: "Sample Tag Counts. Separate data tables and metric summary for cells\ + \ assigned to each sample tag. Note: For putative cells that could not be assigned\ + \ a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." + info: + template: "sample_Sample_Tag.zip" + example: + - "Sample_Tag1.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sample_tag_counts_unassigned" + description: "Sample Tag Counts Unassigned. Data table and metric summary for\ + \ cells that could not be assigned a specific Sample Tag." + info: + template: "sample_Multiplet_and_Undetermined.zip" + example: + - "Multiplet_and_Undetermined.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "VDJ Outputs" + description: "Outputs when VDJ option selected" + arguments: + - type: "file" + name: "--vdj_metrics" + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + info: + template: "sample_VDJ_Metrics.csv" + example: + - "VDJ_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_per_cell" + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments,\ + \ CDR3 sequences, paired chains, and cell type." + info: + template: "sample_VDJ_perCell.csv" + example: + - "VDJ_perCell.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_per_cell_uncorrected" + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts,\ + \ VDJ gene segments, CDR3 sequences, paired chains, and cell type." + info: + template: "sample_VDJ_perCell_uncorrected.csv" + example: + - "VDJ_perCell_uncorrected.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_dominant_contigs" + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain\ + \ type combination (putative cells only)." + info: + template: "sample_VDJ_Dominant_Contigs_AIRR.csv" + example: + - "VDJ_Dominant_Contigs_AIRR.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_unfiltered_contigs" + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated\ + \ successfully (all cells)." + info: + template: "sample_VDJ_Unfiltered_Contigs_AIRR.csv" + example: + - "VDJ_Unfiltered_Contigs_AIRR.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "ATAC-Seq outputs" + description: "Outputs when ATAC-Seq option selected" + arguments: + - type: "file" + name: "--atac_metrics" + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + info: + template: "sample_ATAC_Metrics.csv" + example: + - "ATAC_Metrics.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_metrics_json" + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in\ + \ JSON format." + info: + template: "sample_ATAC_Metrics.json" + example: + - "ATAC_Metrics.json" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_fragments" + description: "ATAC Fragments. Chromosomal location, cell index, and read support\ + \ for each fragment detected" + info: + template: "sample_ATAC_Fragments.bed.gz" + example: + - "ATAC_Fragments.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_fragments_index" + description: "Index of ATAC Fragments." + info: + template: "sample_ATAC_Fragments.bed.gz.tbi" + example: + - "ATAC_Fragments.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_transposase_sites" + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read\ + \ support for each transposase site detected" + info: + template: "sample_ATAC_Transposase_Sites.bed.gz" + example: + - "ATAC_Transposase_Sites.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_transposase_sites_index" + description: "Index of ATAC Transposase Sites." + info: + template: "sample_ATAC_Transposase_Sites.bed.gz.tbi" + example: + - "ATAC_Transposase_Sites.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peaks" + description: "ATAC Peaks. Peak regions of transposase activity" + info: + template: "sample_ATAC_Peaks.bed.gz" + example: + - "ATAC_Peaks.bed.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peaks_index" + description: "Index of ATAC Peaks." + info: + template: "sample_ATAC_Peaks.bed.gz.tbi" + example: + - "ATAC_Peaks.bed.gz.tbi" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_peak_annotation" + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + info: + template: "sample_peak_annotation.tsv.gz" + example: + - "peak_annotation.tsv.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_cell_by_peak" + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + info: + template: "sample_ATAC_Cell_by_Peak_MEX.zip" + example: + - "ATAC_Cell_by_Peak_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_cell_by_peak_unfiltered" + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell\ + \ labels with >=1 transposase sites in peaks." + info: + template: "sample_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + example: + - "ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_bam" + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations\ + \ for ATAC-Seq. Only output if the BAM generation flag is set to true." + info: + template: "sample_ATAC.bam" + example: + - "ATAC.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--atac_bam_index" + description: "Index of ATAC BAM." + info: + template: "sample_ATAC.bam.bai" + example: + - "ATAC.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "AbSeq Cell Calling outputs" + description: "Outputs when Cell Calling Abseq is selected" + arguments: + - type: "file" + name: "--protein_aggregates_experimental" + description: "Protein Aggregates Experimental" + info: + template: "sample_Protein_Aggregates_Experimental.csv" + example: + - "Protein_Aggregates_Experimental.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Putative Cell Calling Settings" + arguments: + - type: "string" + name: "--cell_calling_data" + description: "Specify the dataset to be used for putative cell calling: mRNA,\ + \ AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset,\ + \ please provide an AbSeq_Reference fasta file above.\n \nFor putative cell\ + \ calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive\ + \ file above.\n \nThe default data for putative cell calling, will be determined\ + \ the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n\ + - If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n" + info: + config_key: "Cell_Calling_Data" + example: + - "mRNA" + required: false + choices: + - "mRNA" + - "AbSeq" + - "ATAC" + - "mRNA_and_ATAC" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_bioproduct_algorithm" + description: "Specify the bioproduct algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_Bioproduct_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_atac_algorithm" + description: "Specify the ATAC-seq algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_ATAC_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--exact_cell_count" + description: "Set a specific number of cells as putative, based on those with\ + \ the highest error-corrected read count\n" + info: + config_key: "Exact_Cell_Count" + example: + - 10000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--expected_cell_count" + description: "Guide the basic putative cell calling algorithm by providing an\ + \ estimate of the number of cells expected. Usually this can be the number\ + \ of cells loaded into the Rhapsody cartridge. If there are multiple inflection\ + \ points on the second derivative cumulative curve, this will ensure the one\ + \ selected is near the expected. \n" + info: + config_key: "Expected_Cell_Count" + example: + - 20000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Intronic Reads Settings" + arguments: + - type: "boolean" + name: "--exclude_intronic_reads" + description: "By default, the flag is false, and reads aligned to exons and introns\ + \ are considered and represented in molecule counts. When the flag is set to\ + \ true, intronic reads will be excluded.\nThe value can be true or false.\n" + info: + config_key: "Exclude_Intronic_Reads" + example: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Multiplex Settings" + arguments: + - type: "string" + name: "--sample_tags_version" + description: "Specify the version of the Sample Tags used in the run:\n\n* If\ + \ Sample Tag Multiplexing was done, specify the appropriate version: human,\ + \ mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK +\ + \ Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not\ + \ an SMK + ATAC-Seq only run), choose the \"nuclei_includes_mrna\" option.\n\ + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)),\ + \ choose the \"nuclei_atac_only\" option.\n" + info: + config_key: "Sample_Tags_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "flex" + - "nuclei_includes_mrna" + - "nuclei_atac_only" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_names" + description: "Specify the tag number followed by '-' and the desired sample name\ + \ to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n" + info: + config_key: "Tag_Names" + example: + - "4-mySample" + - "9-myOtherSample" + - "6-alsoThisSample" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "VDJ arguments" + arguments: + - type: "string" + name: "--vdj_version" + description: "If VDJ was done, specify the appropriate option: human, mouse, humanBCR,\ + \ humanTCR, mouseBCR, mouseTCR\n" + info: + config_key: "VDJ_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "humanBCR" + - "humanTCR" + - "mouseBCR" + - "mouseTCR" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "ATAC options" + arguments: + - type: "file" + name: "--predefined_atac_peaks" + description: "An optional BED file containing pre-established chromatin accessibility\ + \ peak regions for generating the ATAC cell-by-peak matrix." + info: + config_key: "Predefined_ATAC_Peaks" + example: + - "predefined_peaks.bed" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Additional options" + arguments: + - type: "string" + name: "--run_name" + description: "Specify a run name to use as the output file base name. Use only\ + \ letters, numbers, or hyphens. Do not use special characters or spaces.\n" + info: + config_key: "Run_Name" + default: + - "sample" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Specify whether to create the BAM file output\n" + info: + config_key: "Generate_Bam" + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--long_reads" + description: "Use STARlong (default: undefined - i.e. autodetects based on read\ + \ lengths) - Specify if the STARlong aligner should be used instead of STAR.\ + \ Set to true if the reads are longer than 650bp.\n" + info: + config_key: "Long_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Advanced options" + description: "NOTE: Only change these if you are really sure about what you are\ + \ doing\n" + arguments: + - type: "string" + name: "--custom_star_params" + description: "Modify STAR alignment parameters - Set this parameter to fully override\ + \ default STAR mapping parameters used in the pipeline.\nFor reference this\ + \ is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread\ + \ 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq\ + \ AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin\ + \ 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax\ + \ 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT\ + \ set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`,\ + \ `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n" + info: + config_key: "Custom_STAR_Params" + example: + - "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed\ + \ 2000000" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--custom_bwa_mem2_params" + description: "Modify bwa-mem2 alignment parameters - Set this parameter to fully\ + \ override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does\ + \ not specify any custom mapping params to bwa-mem2 so program default values\ + \ are used\nThis applies to fastqs provided in the Reads_ATAC user input \n\ + Do NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2\ + \ version 2.2.1\n" + info: + config_key: "Custom_bwa_mem2_Params" + example: + - "-k 16 -w 200 -r" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "CWL-runner arguments" + arguments: + - type: "boolean" + name: "--parallel" + description: "Run jobs in parallel." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--timestamps" + description: "Add timestamps to the errors, warnings, and notifications." + info: null + direction: "input" +- name: "Undocumented arguments" + arguments: + - type: "integer" + name: "--abseq_umi" + info: + config_key: "AbSeq_UMI" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--target_analysis" + info: + config_key: "Target_analysis" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_jgene_evalue" + description: "e-value threshold for J gene. The e-value threshold for J gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_JGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_vgene_evalue" + description: "e-value threshold for V gene. The e-value threshold for V gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_VGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--write_filtered_reads" + description: "Output processed FASTQ reads." + info: + config_key: "Write_Filtered_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "rhapsody_pipeline_2.2.1_nodocker.cwl" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n \nThis pipeline\ + \ performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\n\ + sequencing libraries are those generated by the BD Rhapsody assay kits, including:\ + \ Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell\ + \ Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning\ + \ 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement'\ + \ from the YAML.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "rhapsody_cell_label.py" +- type: "file" + path: "reference.fa.gz" +- type: "file" + path: "reference.gtf.gz" +- type: "file" + path: "reference_bd_rhapsody.tar.gz" +- type: "file" + path: "raw" +info: + keywords: + - "rna-seq" + - "single-cell" + - "multiomic" + - "atac-seq" + - "targeted" + - "abseq" + - "tcr" + - "bcr" + links: + repository: "https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1" + documentation: "https://bd-rhapsody-bioinfo-docs.genomics.bd.com" + license: "Unknown" +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bdgenomics/rhapsody:2.2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "cwlref-runner" + - "cwl-runner" + - "ruamel.yaml" + - "biopython" + - "gffutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/bd_rhapsody/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/bd_rhapsody" + executable: "target/nextflow/mapping/bd_rhapsody/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/bd_rhapsody/main.nf b/target/nextflow/mapping/bd_rhapsody/main.nf new file mode 100644 index 00000000..24af458f --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/main.nf @@ -0,0 +1,5330 @@ +// bd_rhapsody main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bd_rhapsody", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--reads", + "description" : "Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n", + "info" : { + "config_key" : "Reads" + }, + "example" : [ + "WTALibrary_S1_L001_R1_001.fastq.gz", + "WTALibrary_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reads_atac", + "description" : "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\nYou may specify as many R1/R2/I2 files as you want.\n", + "info" : { + "config_key" : "Reads_ATAC" + }, + "example" : [ + "ATACLibrary_S2_L001_R1_001.fastq.gz", + "ATACLibrary_S2_L001_R2_001.fastq.gz", + "ATACLibrary_S2_L001_I2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "References", + "description" : "Assay type will be inferred from the provided reference(s).\nDo not provide both reference_archive and targeted_reference at the same time.\n \nValid reference input combinations:\n - reference_archive: WTA only\n - reference_archive & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference: WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n - targeted_reference: Targeted only\n - targeted_reference & abseq_reference: Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can be generated with the bd_rhapsody_make_reference component.\nAlternatively, BD also provides standard references which can be downloaded from these locations:\n\n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n", + "arguments" : [ + { + "type" : "file", + "name" : "--reference_archive", + "description" : "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level folder\n - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation e.g. \\"gencode.v43.primary_assembly.annotation.gtf\\"\n", + "info" : { + "config_key" : "Reference_Archive" + }, + "example" : [ + "RhapRef_Human_WTA_2023-02.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--targeted_reference", + "description" : "Path to the targeted reference file in FASTA format.\n", + "info" : { + "config_key" : "Targeted_Reference" + }, + "example" : [ + "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--abseq_reference", + "description" : "Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used.", + "info" : { + "config_key" : "AbSeq_Reference" + }, + "example" : [ + "AbSeq_reference.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--supplemental_reference", + "alternatives" : [ + "-s" + ], + "description" : "Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment.", + "info" : { + "config_key" : "Supplemental_Reference" + }, + "example" : [ + "supplemental_reference.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Outputs for all pipeline runs", + "arguments" : [ + { + "type" : "file", + "name" : "--output_dir", + "alternatives" : [ + "-o" + ], + "description" : "The unprocessed output directory containing all the outputs from the pipeline.", + "example" : [ + "output_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_seurat", + "description" : "Single-cell analysis tool inputs. Seurat (.rds) input file containing RSEC molecules data table and all cell annotation metadata.", + "info" : { + "template" : "sample_Seurat.rds" + }, + "example" : [ + "output_seurat.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_mudata", + "info" : { + "template" : "sample.h5mu" + }, + "example" : [ + "output_mudata.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--metrics_summary", + "description" : "Metrics Summary. Report containing sequencing, molecules, and cell metrics.", + "info" : { + "template" : "sample_Metrics_Summary.csv" + }, + "example" : [ + "metrics_summary.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--pipeline_report", + "description" : "Pipeline Report. Summary report containing the results from the sequencing analysis pipeline run.", + "info" : { + "template" : "sample_Pipeline_Report.html" + }, + "example" : [ + "pipeline_report.html" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--rsec_mols_per_cell", + "description" : "Molecules per bioproduct per cell bassed on RSEC", + "info" : { + "template" : "sample_RSEC_MolsPerCell_MEX.zip" + }, + "example" : [ + "RSEC_MolsPerCell_MEX.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--dbec_mols_per_cell", + "description" : "Molecules per bioproduct per cell bassed on DBEC. DBEC data table is only output if the experiment includes targeted mRNA or AbSeq bioproducts.", + "info" : { + "template" : "sample_DBEC_MolsPerCell_MEX.zip" + }, + "example" : [ + "DBEC_MolsPerCell_MEX.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--rsec_mols_per_cell_unfiltered", + "description" : "Unfiltered tables containing all cell labels with 10 reads.", + "info" : { + "template" : "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip" + }, + "example" : [ + "RSEC_MolsPerCell_Unfiltered_MEX.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam", + "description" : "Alignment file of R2 with associated R1 annotations for Bioproduct.", + "info" : { + "template" : "sample_Bioproduct.bam" + }, + "example" : [ + "BioProduct.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam_index", + "description" : "Index file for the alignment file.", + "info" : { + "template" : "sample_Bioproduct.bam.bai" + }, + "example" : [ + "BioProduct.bam.bai" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bioproduct_stats", + "description" : "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier adjustment algorithms on a per-bioproduct basis.", + "info" : { + "template" : "sample_Bioproduct_Stats.csv" + }, + "example" : [ + "Bioproduct_Stats.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--dimred_tsne", + "description" : "t-SNE dimensionality reduction coordinates per cell index", + "info" : { + "template" : "sample_assay_tSNE_coordinates.csv" + }, + "example" : [ + "tSNE_coordinates.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--dimred_umap", + "description" : "UMAP dimensionality reduction coordinates per cell index", + "info" : { + "template" : "sample_assay_UMAP_coordinates.csv" + }, + "example" : [ + "UMAP_coordinates.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--immune_cell_classification", + "description" : "Immune Cell Classification. Cell type classification based on the expression of immune cell markers.", + "info" : { + "template" : "sample_assay_cell_type_experimental.csv" + }, + "example" : [ + "Immune_Cell_Classification.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Multiplex outputs", + "description" : "Outputs when multiplex option is selected", + "arguments" : [ + { + "type" : "file", + "name" : "--sample_tag_metrics", + "description" : "Sample Tag Metrics. Metrics from the sample determination algorithm.", + "info" : { + "template" : "sample_Sample_Tag_Metrics.csv" + }, + "example" : [ + "Sample_Tag_Metrics.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_tag_calls", + "description" : "Sample Tag Calls. Assigned Sample Tag for each putative cell", + "info" : { + "template" : "sample_Sample_Tag_Calls.csv" + }, + "example" : [ + "Sample_Tag_Calls.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_tag_counts", + "description" : "Sample Tag Counts. Separate data tables and metric summary for cells assigned to each sample tag. Note: For putative cells that could not be assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output.", + "info" : { + "template" : "sample_Sample_Tag.zip" + }, + "example" : [ + "Sample_Tag1.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_tag_counts_unassigned", + "description" : "Sample Tag Counts Unassigned. Data table and metric summary for cells that could not be assigned a specific Sample Tag.", + "info" : { + "template" : "sample_Multiplet_and_Undetermined.zip" + }, + "example" : [ + "Multiplet_and_Undetermined.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "VDJ Outputs", + "description" : "Outputs when VDJ option selected", + "arguments" : [ + { + "type" : "file", + "name" : "--vdj_metrics", + "description" : "VDJ Metrics. Overall metrics from the VDJ analysis.", + "info" : { + "template" : "sample_VDJ_Metrics.csv" + }, + "example" : [ + "VDJ_Metrics.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_per_cell", + "description" : "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type.", + "info" : { + "template" : "sample_VDJ_perCell.csv" + }, + "example" : [ + "VDJ_perCell.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_per_cell_uncorrected", + "description" : "VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type.", + "info" : { + "template" : "sample_VDJ_perCell_uncorrected.csv" + }, + "example" : [ + "VDJ_perCell_uncorrected.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_dominant_contigs", + "description" : "VDJ Dominant Contigs. Dominant contig for each cell label chain type combination (putative cells only).", + "info" : { + "template" : "sample_VDJ_Dominant_Contigs_AIRR.csv" + }, + "example" : [ + "VDJ_Dominant_Contigs_AIRR.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_unfiltered_contigs", + "description" : "VDJ Unfiltered Contigs. All contigs that were assembled and annotated successfully (all cells).", + "info" : { + "template" : "sample_VDJ_Unfiltered_Contigs_AIRR.csv" + }, + "example" : [ + "VDJ_Unfiltered_Contigs_AIRR.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "ATAC-Seq outputs", + "description" : "Outputs when ATAC-Seq option selected", + "arguments" : [ + { + "type" : "file", + "name" : "--atac_metrics", + "description" : "ATAC Metrics. Overall metrics from the ATAC-Seq analysis.", + "info" : { + "template" : "sample_ATAC_Metrics.csv" + }, + "example" : [ + "ATAC_Metrics.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_metrics_json", + "description" : "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON format.", + "info" : { + "template" : "sample_ATAC_Metrics.json" + }, + "example" : [ + "ATAC_Metrics.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_fragments", + "description" : "ATAC Fragments. Chromosomal location, cell index, and read support for each fragment detected", + "info" : { + "template" : "sample_ATAC_Fragments.bed.gz" + }, + "example" : [ + "ATAC_Fragments.bed.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_fragments_index", + "description" : "Index of ATAC Fragments.", + "info" : { + "template" : "sample_ATAC_Fragments.bed.gz.tbi" + }, + "example" : [ + "ATAC_Fragments.bed.gz.tbi" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_transposase_sites", + "description" : "ATAC Transposase Sites. Chromosomal location, cell index, and read support for each transposase site detected", + "info" : { + "template" : "sample_ATAC_Transposase_Sites.bed.gz" + }, + "example" : [ + "ATAC_Transposase_Sites.bed.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_transposase_sites_index", + "description" : "Index of ATAC Transposase Sites.", + "info" : { + "template" : "sample_ATAC_Transposase_Sites.bed.gz.tbi" + }, + "example" : [ + "ATAC_Transposase_Sites.bed.gz.tbi" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_peaks", + "description" : "ATAC Peaks. Peak regions of transposase activity", + "info" : { + "template" : "sample_ATAC_Peaks.bed.gz" + }, + "example" : [ + "ATAC_Peaks.bed.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_peaks_index", + "description" : "Index of ATAC Peaks.", + "info" : { + "template" : "sample_ATAC_Peaks.bed.gz.tbi" + }, + "example" : [ + "ATAC_Peaks.bed.gz.tbi" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_peak_annotation", + "description" : "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections", + "info" : { + "template" : "sample_peak_annotation.tsv.gz" + }, + "example" : [ + "peak_annotation.tsv.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_cell_by_peak", + "description" : "ATAC Cell by Peak. Peak regions of transposase activity per cell", + "info" : { + "template" : "sample_ATAC_Cell_by_Peak_MEX.zip" + }, + "example" : [ + "ATAC_Cell_by_Peak_MEX.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_cell_by_peak_unfiltered", + "description" : "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels with >=1 transposase sites in peaks.", + "info" : { + "template" : "sample_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + }, + "example" : [ + "ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_bam", + "description" : "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations for ATAC-Seq. Only output if the BAM generation flag is set to true.", + "info" : { + "template" : "sample_ATAC.bam" + }, + "example" : [ + "ATAC.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--atac_bam_index", + "description" : "Index of ATAC BAM.", + "info" : { + "template" : "sample_ATAC.bam.bai" + }, + "example" : [ + "ATAC.bam.bai" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "AbSeq Cell Calling outputs", + "description" : "Outputs when Cell Calling Abseq is selected", + "arguments" : [ + { + "type" : "file", + "name" : "--protein_aggregates_experimental", + "description" : "Protein Aggregates Experimental", + "info" : { + "template" : "sample_Protein_Aggregates_Experimental.csv" + }, + "example" : [ + "Protein_Aggregates_Experimental.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Putative Cell Calling Settings", + "arguments" : [ + { + "type" : "string", + "name" : "--cell_calling_data", + "description" : "Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above.\n \nFor putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above.\n \nThe default data for putative cell calling, will be determined the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n- If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n", + "info" : { + "config_key" : "Cell_Calling_Data" + }, + "example" : [ + "mRNA" + ], + "required" : false, + "choices" : [ + "mRNA", + "AbSeq", + "ATAC", + "mRNA_and_ATAC" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_calling_bioproduct_algorithm", + "description" : "Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "info" : { + "config_key" : "Cell_Calling_Bioproduct_Algorithm" + }, + "example" : [ + "Basic" + ], + "required" : false, + "choices" : [ + "Basic", + "Refined" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_calling_atac_algorithm", + "description" : "Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "info" : { + "config_key" : "Cell_Calling_ATAC_Algorithm" + }, + "example" : [ + "Basic" + ], + "required" : false, + "choices" : [ + "Basic", + "Refined" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--exact_cell_count", + "description" : "Set a specific number of cells as putative, based on those with the highest error-corrected read count\n", + "info" : { + "config_key" : "Exact_Cell_Count" + }, + "example" : [ + 10000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--expected_cell_count", + "description" : "Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. \n", + "info" : { + "config_key" : "Expected_Cell_Count" + }, + "example" : [ + 20000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Intronic Reads Settings", + "arguments" : [ + { + "type" : "boolean", + "name" : "--exclude_intronic_reads", + "description" : "By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded.\nThe value can be true or false.\n", + "info" : { + "config_key" : "Exclude_Intronic_Reads" + }, + "example" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Multiplex Settings", + "arguments" : [ + { + "type" : "string", + "name" : "--sample_tags_version", + "description" : "Specify the version of the Sample Tags used in the run:\n\n* If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the \\"nuclei_includes_mrna\\" option.\n* If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the \\"nuclei_atac_only\\" option.\n", + "info" : { + "config_key" : "Sample_Tags_Version" + }, + "example" : [ + "human" + ], + "required" : false, + "choices" : [ + "human", + "mouse", + "flex", + "nuclei_includes_mrna", + "nuclei_atac_only" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_names", + "description" : "Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n", + "info" : { + "config_key" : "Tag_Names" + }, + "example" : [ + "4-mySample", + "9-myOtherSample", + "6-alsoThisSample" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "VDJ arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--vdj_version", + "description" : "If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR\n", + "info" : { + "config_key" : "VDJ_Version" + }, + "example" : [ + "human" + ], + "required" : false, + "choices" : [ + "human", + "mouse", + "humanBCR", + "humanTCR", + "mouseBCR", + "mouseTCR" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "ATAC options", + "arguments" : [ + { + "type" : "file", + "name" : "--predefined_atac_peaks", + "description" : "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix.", + "info" : { + "config_key" : "Predefined_ATAC_Peaks" + }, + "example" : [ + "predefined_peaks.bed" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Additional options", + "arguments" : [ + { + "type" : "string", + "name" : "--run_name", + "description" : "Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces.\n", + "info" : { + "config_key" : "Run_Name" + }, + "default" : [ + "sample" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--generate_bam", + "description" : "Specify whether to create the BAM file output\n", + "info" : { + "config_key" : "Generate_Bam" + }, + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--long_reads", + "description" : "Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp.\n", + "info" : { + "config_key" : "Long_Reads" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Advanced options", + "description" : "NOTE: Only change these if you are really sure about what you are doing\n", + "arguments" : [ + { + "type" : "string", + "name" : "--custom_star_params", + "description" : "Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline.\nFor reference this is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n", + "info" : { + "config_key" : "Custom_STAR_Params" + }, + "example" : [ + "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--custom_bwa_mem2_params", + "description" : "Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used\nThis applies to fastqs provided in the Reads_ATAC user input \nDo NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2 version 2.2.1\n", + "info" : { + "config_key" : "Custom_bwa_mem2_Params" + }, + "example" : [ + "-k 16 -w 200 -r" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "CWL-runner arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--parallel", + "description" : "Run jobs in parallel.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--timestamps", + "description" : "Add timestamps to the errors, warnings, and notifications.", + "direction" : "input" + } + ] + }, + { + "name" : "Undocumented arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--abseq_umi", + "info" : { + "config_key" : "AbSeq_UMI" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--target_analysis", + "info" : { + "config_key" : "Target_analysis" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--vdj_jgene_evalue", + "description" : "e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001\n", + "info" : { + "config_key" : "VDJ_JGene_Evalue" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--vdj_vgene_evalue", + "description" : "e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001\n", + "info" : { + "config_key" : "VDJ_VGene_Evalue" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--write_filtered_reads", + "description" : "Output processed FASTQ reads.", + "info" : { + "config_key" : "Write_Filtered_Reads" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "rhapsody_pipeline_2.2.1_nodocker.cwl" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n \nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "rhapsody_cell_label.py" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/reference.fa.gz" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/reference.gtf.gz" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz" + }, + { + "type" : "file", + "path" : "/resources_test/bdrhap_5kjrt/raw" + } + ], + "info" : { + "keywords" : [ + "rna-seq", + "single-cell", + "multiomic", + "atac-seq", + "targeted", + "abseq", + "tcr", + "bcr" + ], + "links" : { + "repository" : "https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1", + "documentation" : "https://bd-rhapsody-bioinfo-docs.genomics.bd.com" + }, + "license" : "Unknown" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "bdgenomics/rhapsody:2.2.1", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "cwlref-runner", + "cwl-runner", + "ruamel.yaml", + "biopython", + "gffutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/bd_rhapsody/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/bd_rhapsody", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil +import glob + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'reads': $( if [ ! -z ${VIASH_PAR_READS+x} ]; then echo "r'${VIASH_PAR_READS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reads_atac': $( if [ ! -z ${VIASH_PAR_READS_ATAC+x} ]; then echo "r'${VIASH_PAR_READS_ATAC//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference_archive': $( if [ ! -z ${VIASH_PAR_REFERENCE_ARCHIVE+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ARCHIVE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'targeted_reference': $( if [ ! -z ${VIASH_PAR_TARGETED_REFERENCE+x} ]; then echo "r'${VIASH_PAR_TARGETED_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'abseq_reference': $( if [ ! -z ${VIASH_PAR_ABSEQ_REFERENCE+x} ]; then echo "r'${VIASH_PAR_ABSEQ_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'supplemental_reference': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTAL_REFERENCE+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTAL_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_dir': $( if [ ! -z ${VIASH_PAR_OUTPUT_DIR+x} ]; then echo "r'${VIASH_PAR_OUTPUT_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_seurat': $( if [ ! -z ${VIASH_PAR_OUTPUT_SEURAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SEURAT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_mudata': $( if [ ! -z ${VIASH_PAR_OUTPUT_MUDATA+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MUDATA//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'metrics_summary': $( if [ ! -z ${VIASH_PAR_METRICS_SUMMARY+x} ]; then echo "r'${VIASH_PAR_METRICS_SUMMARY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pipeline_report': $( if [ ! -z ${VIASH_PAR_PIPELINE_REPORT+x} ]; then echo "r'${VIASH_PAR_PIPELINE_REPORT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rsec_mols_per_cell': $( if [ ! -z ${VIASH_PAR_RSEC_MOLS_PER_CELL+x} ]; then echo "r'${VIASH_PAR_RSEC_MOLS_PER_CELL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'dbec_mols_per_cell': $( if [ ! -z ${VIASH_PAR_DBEC_MOLS_PER_CELL+x} ]; then echo "r'${VIASH_PAR_DBEC_MOLS_PER_CELL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'rsec_mols_per_cell_unfiltered': $( if [ ! -z ${VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED+x} ]; then echo "r'${VIASH_PAR_RSEC_MOLS_PER_CELL_UNFILTERED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bam': $( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "r'${VIASH_PAR_BAM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bam_index': $( if [ ! -z ${VIASH_PAR_BAM_INDEX+x} ]; then echo "r'${VIASH_PAR_BAM_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bioproduct_stats': $( if [ ! -z ${VIASH_PAR_BIOPRODUCT_STATS+x} ]; then echo "r'${VIASH_PAR_BIOPRODUCT_STATS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'dimred_tsne': $( if [ ! -z ${VIASH_PAR_DIMRED_TSNE+x} ]; then echo "r'${VIASH_PAR_DIMRED_TSNE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'dimred_umap': $( if [ ! -z ${VIASH_PAR_DIMRED_UMAP+x} ]; then echo "r'${VIASH_PAR_DIMRED_UMAP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'immune_cell_classification': $( if [ ! -z ${VIASH_PAR_IMMUNE_CELL_CLASSIFICATION+x} ]; then echo "r'${VIASH_PAR_IMMUNE_CELL_CLASSIFICATION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sample_tag_metrics': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_METRICS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_METRICS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sample_tag_calls': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_CALLS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_CALLS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sample_tag_counts': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_COUNTS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_COUNTS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sample_tag_counts_unassigned': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAG_COUNTS_UNASSIGNED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_metrics': $( if [ ! -z ${VIASH_PAR_VDJ_METRICS+x} ]; then echo "r'${VIASH_PAR_VDJ_METRICS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_per_cell': $( if [ ! -z ${VIASH_PAR_VDJ_PER_CELL+x} ]; then echo "r'${VIASH_PAR_VDJ_PER_CELL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_per_cell_uncorrected': $( if [ ! -z ${VIASH_PAR_VDJ_PER_CELL_UNCORRECTED+x} ]; then echo "r'${VIASH_PAR_VDJ_PER_CELL_UNCORRECTED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_dominant_contigs': $( if [ ! -z ${VIASH_PAR_VDJ_DOMINANT_CONTIGS+x} ]; then echo "r'${VIASH_PAR_VDJ_DOMINANT_CONTIGS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_unfiltered_contigs': $( if [ ! -z ${VIASH_PAR_VDJ_UNFILTERED_CONTIGS+x} ]; then echo "r'${VIASH_PAR_VDJ_UNFILTERED_CONTIGS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_metrics': $( if [ ! -z ${VIASH_PAR_ATAC_METRICS+x} ]; then echo "r'${VIASH_PAR_ATAC_METRICS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_metrics_json': $( if [ ! -z ${VIASH_PAR_ATAC_METRICS_JSON+x} ]; then echo "r'${VIASH_PAR_ATAC_METRICS_JSON//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_fragments': $( if [ ! -z ${VIASH_PAR_ATAC_FRAGMENTS+x} ]; then echo "r'${VIASH_PAR_ATAC_FRAGMENTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_fragments_index': $( if [ ! -z ${VIASH_PAR_ATAC_FRAGMENTS_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_FRAGMENTS_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_transposase_sites': $( if [ ! -z ${VIASH_PAR_ATAC_TRANSPOSASE_SITES+x} ]; then echo "r'${VIASH_PAR_ATAC_TRANSPOSASE_SITES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_transposase_sites_index': $( if [ ! -z ${VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_TRANSPOSASE_SITES_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_peaks': $( if [ ! -z ${VIASH_PAR_ATAC_PEAKS+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAKS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_peaks_index': $( if [ ! -z ${VIASH_PAR_ATAC_PEAKS_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAKS_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_peak_annotation': $( if [ ! -z ${VIASH_PAR_ATAC_PEAK_ANNOTATION+x} ]; then echo "r'${VIASH_PAR_ATAC_PEAK_ANNOTATION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_cell_by_peak': $( if [ ! -z ${VIASH_PAR_ATAC_CELL_BY_PEAK+x} ]; then echo "r'${VIASH_PAR_ATAC_CELL_BY_PEAK//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_cell_by_peak_unfiltered': $( if [ ! -z ${VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED+x} ]; then echo "r'${VIASH_PAR_ATAC_CELL_BY_PEAK_UNFILTERED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_bam': $( if [ ! -z ${VIASH_PAR_ATAC_BAM+x} ]; then echo "r'${VIASH_PAR_ATAC_BAM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'atac_bam_index': $( if [ ! -z ${VIASH_PAR_ATAC_BAM_INDEX+x} ]; then echo "r'${VIASH_PAR_ATAC_BAM_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'protein_aggregates_experimental': $( if [ ! -z ${VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL+x} ]; then echo "r'${VIASH_PAR_PROTEIN_AGGREGATES_EXPERIMENTAL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cell_calling_data': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_DATA+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_DATA//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cell_calling_bioproduct_algorithm': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_BIOPRODUCT_ALGORITHM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cell_calling_atac_algorithm': $( if [ ! -z ${VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM+x} ]; then echo "r'${VIASH_PAR_CELL_CALLING_ATAC_ALGORITHM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'exact_cell_count': $( if [ ! -z ${VIASH_PAR_EXACT_CELL_COUNT+x} ]; then echo "int(r'${VIASH_PAR_EXACT_CELL_COUNT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'expected_cell_count': $( if [ ! -z ${VIASH_PAR_EXPECTED_CELL_COUNT+x} ]; then echo "int(r'${VIASH_PAR_EXPECTED_CELL_COUNT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'exclude_intronic_reads': $( if [ ! -z ${VIASH_PAR_EXCLUDE_INTRONIC_READS+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_INTRONIC_READS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'sample_tags_version': $( if [ ! -z ${VIASH_PAR_SAMPLE_TAGS_VERSION+x} ]; then echo "r'${VIASH_PAR_SAMPLE_TAGS_VERSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'tag_names': $( if [ ! -z ${VIASH_PAR_TAG_NAMES+x} ]; then echo "r'${VIASH_PAR_TAG_NAMES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'vdj_version': $( if [ ! -z ${VIASH_PAR_VDJ_VERSION+x} ]; then echo "r'${VIASH_PAR_VDJ_VERSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'predefined_atac_peaks': $( if [ ! -z ${VIASH_PAR_PREDEFINED_ATAC_PEAKS+x} ]; then echo "r'${VIASH_PAR_PREDEFINED_ATAC_PEAKS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'run_name': $( if [ ! -z ${VIASH_PAR_RUN_NAME+x} ]; then echo "r'${VIASH_PAR_RUN_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'generate_bam': $( if [ ! -z ${VIASH_PAR_GENERATE_BAM+x} ]; then echo "r'${VIASH_PAR_GENERATE_BAM//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'long_reads': $( if [ ! -z ${VIASH_PAR_LONG_READS+x} ]; then echo "r'${VIASH_PAR_LONG_READS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'custom_star_params': $( if [ ! -z ${VIASH_PAR_CUSTOM_STAR_PARAMS+x} ]; then echo "r'${VIASH_PAR_CUSTOM_STAR_PARAMS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'custom_bwa_mem2_params': $( if [ ! -z ${VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS+x} ]; then echo "r'${VIASH_PAR_CUSTOM_BWA_MEM2_PARAMS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'parallel': $( if [ ! -z ${VIASH_PAR_PARALLEL+x} ]; then echo "r'${VIASH_PAR_PARALLEL//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'timestamps': $( if [ ! -z ${VIASH_PAR_TIMESTAMPS+x} ]; then echo "r'${VIASH_PAR_TIMESTAMPS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'abseq_umi': $( if [ ! -z ${VIASH_PAR_ABSEQ_UMI+x} ]; then echo "int(r'${VIASH_PAR_ABSEQ_UMI//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'target_analysis': $( if [ ! -z ${VIASH_PAR_TARGET_ANALYSIS+x} ]; then echo "r'${VIASH_PAR_TARGET_ANALYSIS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'vdj_jgene_evalue': $( if [ ! -z ${VIASH_PAR_VDJ_JGENE_EVALUE+x} ]; then echo "float(r'${VIASH_PAR_VDJ_JGENE_EVALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'vdj_vgene_evalue': $( if [ ! -z ${VIASH_PAR_VDJ_VGENE_EVALUE+x} ]; then echo "float(r'${VIASH_PAR_VDJ_VGENE_EVALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'write_filtered_reads': $( if [ ! -z ${VIASH_PAR_WRITE_FILTERED_READS+x} ]; then echo "r'${VIASH_PAR_WRITE_FILTERED_READS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\\\\n?)[ \\\\t]*\\\\|", "\\\\\\\\1", text) + + +def process_params(par: dict[str, Any], config, temp_dir: str) -> str: + # check input parameters + assert par["reads"] or par["reads_atac"], ( + "Pass at least one set of inputs to --reads or --reads_atac." + ) + + # output to temp dir if output_dir was not passed + if not par["output_dir"]: + par["output_dir"] = os.path.join(temp_dir, "output") + + # checking sample prefix + if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): + print( + "--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", + flush=True, + ) + par["run_name"] = re.sub("[^A-Za-z0-9\\\\\\\\-]", "-", par["run_name"]) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], config) -> str: + content_list = [ + strip_margin("""\\\\ +#!/usr/bin/env cwl-runner + +cwl:tool: rhapsody +""") + ] + + for argument in config["arguments"]: + arg_info = argument.get("info") or {} + config_key = arg_info.get("config_key") + if par[argument["clean_name"]] and config_key: + if argument["type"] == "file": + str = strip_margin(f"""\\\\ +{config_key}: +""") + if isinstance(par[argument["clean_name"]], list): + for file in par[argument["clean_name"]]: + str += strip_margin(f"""\\\\ + - class: File + location: "{file}" +""") + else: + str += strip_margin(f"""\\\\ + class: File + location: "{par[argument["clean_name"]]}" +""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\\\\ +{config_key}: {par[argument["clean_name"]]} +""") + ) + + ## Write config to file + return "".join(content_list) + + +def generate_config_file( + par: dict[str, Any], config: dict[str, Any], temp_dir: str +) -> str: + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, config) + with open(config_file, "w") as f: + f.write(config_content) + return config_file + + +def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: + # create cwl file (if need be) + orig_cwl_file = os.path.join( + meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl" + ) + + # Inject computational requirements into pipeline + if meta["memory_mb"] or meta["cpus"]: + cwl_file = os.path.join(dir, "pipeline.cwl") + + # Read in the file + with open(orig_cwl_file, "r") as file: + cwl_data = file.read() + + # Inject computational requirements into pipeline + if meta["memory_mb"]: + memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS + cwl_data = re.sub( + '"ramMin": [^\\\\n]*[^,](,?)\\\\n', f'"ramMin": {memory}\\\\\\\\1\\\\n', cwl_data + ) + if meta["cpus"]: + cwl_data = re.sub( + '"coresMin": [^\\\\n]*[^,](,?)\\\\n', + f'"coresMin": {meta["cpus"]}\\\\\\\\1\\\\n', + cwl_data, + ) + + # Write the file out again + with open(cwl_file, "w") as file: + file.write(cwl_data) + else: + cwl_file = orig_cwl_file + + return cwl_file + + +def copy_outputs(par: dict[str, Any], config: dict[str, Any]): + for arg in config["arguments"]: + par_value = par[arg["clean_name"]] + if par_value and arg["type"] == "file" and arg["direction"] == "output": + # example template: '[sample_name]_(assay)_cell_type_experimental.csv' + template = (arg.get("info") or {}).get("template") + if template: + template_glob = ( + template.replace("sample", par["run_name"]) + .replace("assay", "*") + .replace("number", "*") + ) + files = glob.glob(os.path.join(par["output_dir"], template_glob)) + if len(files) == 0 and arg["required"]: + raise ValueError( + f"Expected output file '{template_glob}' not found." + ) + elif len(files) > 1 and not arg["multiple"]: + raise ValueError( + f"Expected single output file '{template_glob}', but found multiple." + ) + + if not arg["multiple"]: + try: + shutil.copy(files[0], par_value) + print(f"Copied {files[0]} to {par_value}") + except IndexError: + print(f"Unable to copy {template_glob} to {par_value}") + else: + # replace '*' in par_value with index + for i, file in enumerate(files): + shutil.copy(file, par_value.replace("*", str(i))) + + +def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config, temp_dir) + + ## Process parameters + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + par["output_dir"], + ] + + if par["parallel"]: + cmd.append("--parallel") + + if par["timestamps"]: + cmd.append("--timestamps") + + # Create cwl file (if need be) + cwl_file = generate_cwl_file(meta, temp_dir) + cmd.append(cwl_file) + + # Create params file + config_file = generate_config_file(par, config, temp_dir) + cmd.append(config_file) + + # keep environment variables but set TMPDIR to temp_dir + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + # Create output dir if not exists + if not os.path.exists(par["output_dir"]): + os.makedirs(par["output_dir"]) + + # Run command + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + # Copy outputs + copy_outputs(par, config) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"] + ) as temp_dir: + main(par, meta, temp_dir) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/bd_rhapsody", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/bd_rhapsody/nextflow.config b/target/nextflow/mapping/bd_rhapsody/nextflow.config new file mode 100644 index 00000000..426c7649 --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/bd_rhapsody' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n \nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning \'https://bitbucket.org/CRSwDev/cwl\' and removing all objects with class \'DockerRequirement\' from the YAML.\n' + author = 'Robrecht Cannoodt, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/bd_rhapsody/nextflow_labels.config b/target/nextflow/mapping/bd_rhapsody/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/bd_rhapsody/nextflow_schema.json b/target/nextflow/mapping/bd_rhapsody/nextflow_schema.json new file mode 100644 index 00000000..06ceb80d --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/nextflow_schema.json @@ -0,0 +1,645 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bd_rhapsody", + "description": "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n \nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "reads": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"WTALibrary_S1_L001_R1_001.fastq.gz\";\"WTALibrary_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reads_atac": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\nYou may specify as many R1/R2/I2 files as you want.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"ATACLibrary_S2_L001_R1_001.fastq.gz\";\"ATACLibrary_S2_L001_R2_001.fastq.gz\";\"ATACLibrary_S2_L001_I2_001.fastq.gz\"]`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Outputs for all pipeline runs", + "properties": { + "output_dir": { + "type": "string", + "format": "path", + "description": "The unprocessed output directory containing all the outputs from the pipeline.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_dir\"`, direction: `output`, example: `\"output_dir\"`. ", + "default": "$id.$key.output_dir" + }, + "output_seurat": { + "type": "string", + "format": "path", + "description": "Single-cell analysis tool inputs", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_seurat.rds\"`, direction: `output`, example: `\"output_seurat.rds\"`. ", + "default": "$id.$key.output_seurat.rds" + }, + "output_mudata": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_mudata.h5mu\"`, direction: `output`, example: `\"output_mudata.h5mu\"`. ", + "default": "$id.$key.output_mudata.h5mu" + }, + "metrics_summary": { + "type": "string", + "format": "path", + "description": "Metrics Summary", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.metrics_summary.csv\"`, direction: `output`, example: `\"metrics_summary.csv\"`. ", + "default": "$id.$key.metrics_summary.csv" + }, + "pipeline_report": { + "type": "string", + "format": "path", + "description": "Pipeline Report", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.pipeline_report.html\"`, direction: `output`, example: `\"pipeline_report.html\"`. ", + "default": "$id.$key.pipeline_report.html" + }, + "rsec_mols_per_cell": { + "type": "string", + "format": "path", + "description": "Molecules per bioproduct per cell bassed on RSEC", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.rsec_mols_per_cell.zip\"`, direction: `output`, example: `\"RSEC_MolsPerCell_MEX.zip\"`. ", + "default": "$id.$key.rsec_mols_per_cell.zip" + }, + "dbec_mols_per_cell": { + "type": "string", + "format": "path", + "description": "Molecules per bioproduct per cell bassed on DBEC", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.dbec_mols_per_cell.zip\"`, direction: `output`, example: `\"DBEC_MolsPerCell_MEX.zip\"`. ", + "default": "$id.$key.dbec_mols_per_cell.zip" + }, + "rsec_mols_per_cell_unfiltered": { + "type": "string", + "format": "path", + "description": "Unfiltered tables containing all cell labels with 10 reads.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.rsec_mols_per_cell_unfiltered.zip\"`, direction: `output`, example: `\"RSEC_MolsPerCell_Unfiltered_MEX.zip\"`. ", + "default": "$id.$key.rsec_mols_per_cell_unfiltered.zip" + }, + "bam": { + "type": "string", + "format": "path", + "description": "Alignment file of R2 with associated R1 annotations for Bioproduct.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.bam.bam\"`, direction: `output`, example: `\"BioProduct.bam\"`. ", + "default": "$id.$key.bam.bam" + }, + "bam_index": { + "type": "string", + "format": "path", + "description": "Index file for the alignment file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.bam_index.bai\"`, direction: `output`, example: `\"BioProduct.bam.bai\"`. ", + "default": "$id.$key.bam_index.bai" + }, + "bioproduct_stats": { + "type": "string", + "format": "path", + "description": "Bioproduct Stats", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.bioproduct_stats.csv\"`, direction: `output`, example: `\"Bioproduct_Stats.csv\"`. ", + "default": "$id.$key.bioproduct_stats.csv" + }, + "dimred_tsne": { + "type": "string", + "format": "path", + "description": "t-SNE dimensionality reduction coordinates per cell index", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.dimred_tsne.csv\"`, direction: `output`, example: `\"tSNE_coordinates.csv\"`. ", + "default": "$id.$key.dimred_tsne.csv" + }, + "dimred_umap": { + "type": "string", + "format": "path", + "description": "UMAP dimensionality reduction coordinates per cell index", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.dimred_umap.csv\"`, direction: `output`, example: `\"UMAP_coordinates.csv\"`. ", + "default": "$id.$key.dimred_umap.csv" + }, + "immune_cell_classification": { + "type": "string", + "format": "path", + "description": "Immune Cell Classification", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.immune_cell_classification.csv\"`, direction: `output`, example: `\"Immune_Cell_Classification.csv\"`. ", + "default": "$id.$key.immune_cell_classification.csv" + } + } + }, + "references": { + "title": "References", + "type": "object", + "description": "Assay type will be inferred from the provided reference(s).\nDo not provide both reference_archive and targeted_reference at the same time.\n \nValid reference input combinations:\n - reference_archive: WTA only\n - reference_archive & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference: WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n - targeted_reference: Targeted only\n - targeted_reference & abseq_reference: Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can be generated with the bd_rhapsody_make_reference component.\nAlternatively, BD also provides standard references which can be downloaded from these locations:\n\n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n", + "properties": { + "reference_archive": { + "type": "string", + "format": "path", + "description": "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level folder\n - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation e.g", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"RhapRef_Human_WTA_2023-02.tar.gz\"`. " + }, + "targeted_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the targeted reference file in FASTA format.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"BD_Rhapsody_Immune_Response_Panel_Hs.fasta\"]`. " + }, + "abseq_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the AbSeq reference file in FASTA format", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"AbSeq_reference.fasta\"]`. " + }, + "supplemental_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the supplemental reference file in FASTA format", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"supplemental_reference.fasta\"]`. " + } + } + }, + "multiplex outputs": { + "title": "Multiplex outputs", + "type": "object", + "description": "Outputs when multiplex option is selected", + "properties": { + "sample_tag_metrics": { + "type": "string", + "format": "path", + "description": "Sample Tag Metrics", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.sample_tag_metrics.csv\"`, direction: `output`, example: `\"Sample_Tag_Metrics.csv\"`. ", + "default": "$id.$key.sample_tag_metrics.csv" + }, + "sample_tag_calls": { + "type": "string", + "format": "path", + "description": "Sample Tag Calls", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.sample_tag_calls.csv\"`, direction: `output`, example: `\"Sample_Tag_Calls.csv\"`. ", + "default": "$id.$key.sample_tag_calls.csv" + }, + "sample_tag_counts": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Sample Tag Counts", + "help_text": "Type: `file`, multiple: `True`, default: `\"$id.$key.sample_tag_counts_*.zip\"`, direction: `output`, example: `[\"Sample_Tag1.zip\"]`. ", + "default": "$id.$key.sample_tag_counts_*.zip" + }, + "sample_tag_counts_unassigned": { + "type": "string", + "format": "path", + "description": "Sample Tag Counts Unassigned", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.sample_tag_counts_unassigned.zip\"`, direction: `output`, example: `\"Multiplet_and_Undetermined.zip\"`. ", + "default": "$id.$key.sample_tag_counts_unassigned.zip" + } + } + }, + "vdj outputs": { + "title": "VDJ Outputs", + "type": "object", + "description": "Outputs when VDJ option selected", + "properties": { + "vdj_metrics": { + "type": "string", + "format": "path", + "description": "VDJ Metrics", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.vdj_metrics.csv\"`, direction: `output`, example: `\"VDJ_Metrics.csv\"`. ", + "default": "$id.$key.vdj_metrics.csv" + }, + "vdj_per_cell": { + "type": "string", + "format": "path", + "description": "VDJ Per Cell", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.vdj_per_cell.csv\"`, direction: `output`, example: `\"VDJ_perCell.csv\"`. ", + "default": "$id.$key.vdj_per_cell.csv" + }, + "vdj_per_cell_uncorrected": { + "type": "string", + "format": "path", + "description": "VDJ Per Cell Uncorrected", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.vdj_per_cell_uncorrected.csv\"`, direction: `output`, example: `\"VDJ_perCell_uncorrected.csv\"`. ", + "default": "$id.$key.vdj_per_cell_uncorrected.csv" + }, + "vdj_dominant_contigs": { + "type": "string", + "format": "path", + "description": "VDJ Dominant Contigs", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.vdj_dominant_contigs.csv\"`, direction: `output`, example: `\"VDJ_Dominant_Contigs_AIRR.csv\"`. ", + "default": "$id.$key.vdj_dominant_contigs.csv" + }, + "vdj_unfiltered_contigs": { + "type": "string", + "format": "path", + "description": "VDJ Unfiltered Contigs", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.vdj_unfiltered_contigs.csv\"`, direction: `output`, example: `\"VDJ_Unfiltered_Contigs_AIRR.csv\"`. ", + "default": "$id.$key.vdj_unfiltered_contigs.csv" + } + } + }, + "atac-seq outputs": { + "title": "ATAC-Seq outputs", + "type": "object", + "description": "Outputs when ATAC-Seq option selected", + "properties": { + "atac_metrics": { + "type": "string", + "format": "path", + "description": "ATAC Metrics", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_metrics.csv\"`, direction: `output`, example: `\"ATAC_Metrics.csv\"`. ", + "default": "$id.$key.atac_metrics.csv" + }, + "atac_metrics_json": { + "type": "string", + "format": "path", + "description": "ATAC Metrics JSON", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_metrics_json.json\"`, direction: `output`, example: `\"ATAC_Metrics.json\"`. ", + "default": "$id.$key.atac_metrics_json.json" + }, + "atac_fragments": { + "type": "string", + "format": "path", + "description": "ATAC Fragments", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_fragments.gz\"`, direction: `output`, example: `\"ATAC_Fragments.bed.gz\"`. ", + "default": "$id.$key.atac_fragments.gz" + }, + "atac_fragments_index": { + "type": "string", + "format": "path", + "description": "Index of ATAC Fragments.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_fragments_index.tbi\"`, direction: `output`, example: `\"ATAC_Fragments.bed.gz.tbi\"`. ", + "default": "$id.$key.atac_fragments_index.tbi" + }, + "atac_transposase_sites": { + "type": "string", + "format": "path", + "description": "ATAC Transposase Sites", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_transposase_sites.gz\"`, direction: `output`, example: `\"ATAC_Transposase_Sites.bed.gz\"`. ", + "default": "$id.$key.atac_transposase_sites.gz" + }, + "atac_transposase_sites_index": { + "type": "string", + "format": "path", + "description": "Index of ATAC Transposase Sites.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_transposase_sites_index.tbi\"`, direction: `output`, example: `\"ATAC_Transposase_Sites.bed.gz.tbi\"`. ", + "default": "$id.$key.atac_transposase_sites_index.tbi" + }, + "atac_peaks": { + "type": "string", + "format": "path", + "description": "ATAC Peaks", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_peaks.gz\"`, direction: `output`, example: `\"ATAC_Peaks.bed.gz\"`. ", + "default": "$id.$key.atac_peaks.gz" + }, + "atac_peaks_index": { + "type": "string", + "format": "path", + "description": "Index of ATAC Peaks.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_peaks_index.tbi\"`, direction: `output`, example: `\"ATAC_Peaks.bed.gz.tbi\"`. ", + "default": "$id.$key.atac_peaks_index.tbi" + }, + "atac_peak_annotation": { + "type": "string", + "format": "path", + "description": "ATAC Peak Annotation", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_peak_annotation.gz\"`, direction: `output`, example: `\"peak_annotation.tsv.gz\"`. ", + "default": "$id.$key.atac_peak_annotation.gz" + }, + "atac_cell_by_peak": { + "type": "string", + "format": "path", + "description": "ATAC Cell by Peak", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_cell_by_peak.zip\"`, direction: `output`, example: `\"ATAC_Cell_by_Peak_MEX.zip\"`. ", + "default": "$id.$key.atac_cell_by_peak.zip" + }, + "atac_cell_by_peak_unfiltered": { + "type": "string", + "format": "path", + "description": "ATAC Cell by Peak Unfiltered", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_cell_by_peak_unfiltered.zip\"`, direction: `output`, example: `\"ATAC_Cell_by_Peak_Unfiltered_MEX.zip\"`. ", + "default": "$id.$key.atac_cell_by_peak_unfiltered.zip" + }, + "atac_bam": { + "type": "string", + "format": "path", + "description": "ATAC BAM", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_bam.bam\"`, direction: `output`, example: `\"ATAC.bam\"`. ", + "default": "$id.$key.atac_bam.bam" + }, + "atac_bam_index": { + "type": "string", + "format": "path", + "description": "Index of ATAC BAM.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.atac_bam_index.bai\"`, direction: `output`, example: `\"ATAC.bam.bai\"`. ", + "default": "$id.$key.atac_bam_index.bai" + } + } + }, + "abseq cell calling outputs": { + "title": "AbSeq Cell Calling outputs", + "type": "object", + "description": "Outputs when Cell Calling Abseq is selected", + "properties": { + "protein_aggregates_experimental": { + "type": "string", + "format": "path", + "description": "Protein Aggregates Experimental", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.protein_aggregates_experimental.csv\"`, direction: `output`, example: `\"Protein_Aggregates_Experimental.csv\"`. ", + "default": "$id.$key.protein_aggregates_experimental.csv" + } + } + }, + "putative cell calling settings": { + "title": "Putative Cell Calling Settings", + "type": "object", + "description": "No description", + "properties": { + "cell_calling_data": { + "type": "string", + "description": "Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above.\n \nFor putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above.\n \nThe default data for putative cell calling, will be determined the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n- If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"mRNA\"`, choices: ``mRNA`, `AbSeq`, `ATAC`, `mRNA_and_ATAC``. ", + "enum": [ + "mRNA", + "AbSeq", + "ATAC", + "mRNA_and_ATAC" + ] + }, + "cell_calling_bioproduct_algorithm": { + "type": "string", + "description": "Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`, choices: ``Basic`, `Refined``. ", + "enum": [ + "Basic", + "Refined" + ] + }, + "cell_calling_atac_algorithm": { + "type": "string", + "description": "Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`, choices: ``Basic`, `Refined``. ", + "enum": [ + "Basic", + "Refined" + ] + }, + "exact_cell_count": { + "type": "integer", + "description": "Set a specific number of cells as putative, based on those with the highest error-corrected read count\n", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "expected_cell_count": { + "type": "integer", + "description": "Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected", + "help_text": "Type: `integer`, multiple: `False`, example: `20000`. " + } + } + }, + "intronic reads settings": { + "title": "Intronic Reads Settings", + "type": "object", + "description": "No description", + "properties": { + "exclude_intronic_reads": { + "type": "boolean", + "description": "By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts", + "help_text": "Type: `boolean`, multiple: `False`, example: `false`. " + } + } + }, + "multiplex settings": { + "title": "Multiplex Settings", + "type": "object", + "description": "No description", + "properties": { + "sample_tags_version": { + "type": "string", + "description": "Specify the version of the Sample Tags used in the run:\n\n* If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the \"nuclei_includes_mrna\" option.\n* If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the \"nuclei_atac_only\" option.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"human\"`, choices: ``human`, `mouse`, `flex`, `nuclei_includes_mrna`, `nuclei_atac_only``. ", + "enum": [ + "human", + "mouse", + "flex", + "nuclei_includes_mrna", + "nuclei_atac_only" + ] + }, + "tag_names": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"4-mySample\";\"9-myOtherSample\";\"6-alsoThisSample\"]`. " + } + } + }, + "vdj arguments": { + "title": "VDJ arguments", + "type": "object", + "description": "No description", + "properties": { + "vdj_version": { + "type": "string", + "description": "If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"human\"`, choices: ``human`, `mouse`, `humanBCR`, `humanTCR`, `mouseBCR`, `mouseTCR``. ", + "enum": [ + "human", + "mouse", + "humanBCR", + "humanTCR", + "mouseBCR", + "mouseTCR" + ] + } + } + }, + "atac options": { + "title": "ATAC options", + "type": "object", + "description": "No description", + "properties": { + "predefined_atac_peaks": { + "type": "string", + "format": "path", + "description": "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"predefined_peaks.bed\"`. " + } + } + }, + "additional options": { + "title": "Additional options", + "type": "object", + "description": "No description", + "properties": { + "run_name": { + "type": "string", + "description": "Specify a run name to use as the output file base name", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample\"`. ", + "default": "sample" + }, + "generate_bam": { + "type": "boolean", + "description": "Specify whether to create the BAM file output\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "long_reads": { + "type": "boolean", + "description": "Use STARlong (default: undefined - i.e", + "help_text": "Type: `boolean`, multiple: `False`. " + } + } + }, + "advanced options": { + "title": "Advanced options", + "type": "object", + "description": "NOTE: Only change these if you are really sure about what you are doing\n", + "properties": { + "custom_star_params": { + "type": "string", + "description": "Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline.\nFor reference this is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000\"`. " + }, + "custom_bwa_mem2_params": { + "type": "string", + "description": "Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used\nThis applies to fastqs provided in the Reads_ATAC user input \nDo NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2 version 2.2.1\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"-k 16 -w 200 -r\"`. " + } + } + }, + "cwl-runner arguments": { + "title": "CWL-runner arguments", + "type": "object", + "description": "No description", + "properties": { + "parallel": { + "type": "boolean", + "description": "Run jobs in parallel.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "timestamps": { + "type": "boolean", + "description": "Add timestamps to the errors, warnings, and notifications.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "undocumented arguments": { + "title": "Undocumented arguments", + "type": "object", + "description": "No description", + "properties": { + "abseq_umi": { + "type": "integer", + "description": "", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "target_analysis": { + "type": "boolean", + "description": "", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "vdj_jgene_evalue": { + "type": "number", + "description": "e-value threshold for J gene", + "help_text": "Type: `double`, multiple: `False`. " + }, + "vdj_vgene_evalue": { + "type": "number", + "description": "e-value threshold for V gene", + "help_text": "Type: `double`, multiple: `False`. " + }, + "write_filtered_reads": { + "type": "boolean", + "description": "Output processed FASTQ reads.", + "help_text": "Type: `boolean`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/references" + }, + { + "$ref": "#/$defs/multiplex outputs" + }, + { + "$ref": "#/$defs/vdj outputs" + }, + { + "$ref": "#/$defs/atac-seq outputs" + }, + { + "$ref": "#/$defs/abseq cell calling outputs" + }, + { + "$ref": "#/$defs/putative cell calling settings" + }, + { + "$ref": "#/$defs/intronic reads settings" + }, + { + "$ref": "#/$defs/multiplex settings" + }, + { + "$ref": "#/$defs/vdj arguments" + }, + { + "$ref": "#/$defs/atac options" + }, + { + "$ref": "#/$defs/additional options" + }, + { + "$ref": "#/$defs/advanced options" + }, + { + "$ref": "#/$defs/cwl-runner arguments" + }, + { + "$ref": "#/$defs/undocumented arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl b/target/nextflow/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl new file mode 100755 index 00000000..30009f05 --- /dev/null +++ b/target/nextflow/mapping/bd_rhapsody/rhapsody_pipeline_2.2.1_nodocker.cwl @@ -0,0 +1,6146 @@ +#!/usr/bin/env cwl-runner +{ + "$graph": [ + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "baseCommand": "ATAC_Cell_by_Peak.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Fragments" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#ATAC_Cell_by_Peak.cwl/GTF" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Peaks" + }, + { + "type": "File", + "loadContents": true, + "id": "#ATAC_Cell_by_Peak.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--transposase-sites" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Transposase_Sites" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "id": "#ATAC_Cell_by_Peak.cwl", + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Cell_Calling_Data.csv" + }, + "id": "#ATAC_Cell_by_Peak.cwl/ATAC_Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Initial_Seurat.rds" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Initial_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Total_Fragment_Metrics.json" + }, + "id": "#ATAC_Cell_by_Peak.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#ATAC_Cell_by_Peak.cwl/output" + } + ] + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + } + ], + "baseCommand": "ATAC_Compile_Results.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--biop-putative-data-table" + }, + "id": "#ATAC_Compile_Results.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--cell-order-subsampled" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--fragments" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#ATAC_Compile_Results.cwl/Fragments" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--initial-seurat" + }, + "id": "#ATAC_Compile_Results.cwl/Initial_Seurat_RDS" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-metrics-tar" + }, + "id": "#ATAC_Compile_Results.cwl/Input_Metrics_tar" + }, + { + "type": "string", + "inputBinding": { + "prefix": "--genome-size" + }, + "id": "#ATAC_Compile_Results.cwl/Reference_Genome_Size" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#ATAC_Compile_Results.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--total-fragment-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Total_Fragment_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--unified-metrics" + }, + "id": "#ATAC_Compile_Results.cwl/Unified_Metrics" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*/*_coordinates.csv" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_ATAC_Seurat.rds" + }, + "id": "#ATAC_Compile_Results.cwl/ATAC_Seurat_RDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#ATAC_Compile_Results.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#ATAC_Compile_Results.cwl/Metrics_tar" + }, + { + "type": "File", + "outputBinding": { + "glob": "mist_atac_compile_results.log" + }, + "id": "#ATAC_Compile_Results.cwl/output" + } + ], + "id": "#ATAC_Compile_Results.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if (inputs.Assay == 'ATAC') { return 2; } else { return 4; } }", + "ramMin": "${ if (inputs.Assay == 'ATAC') { return 4000; } else { return 32000; } }" + } + ], + "baseCommand": [ + "mist_add_to_bam.py" + ], + "inputs": [ + { + "type": "string", + "inputBinding": { + "prefix": "--assay" + }, + "id": "#AddtoBam.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--input-bam" + }, + "id": "#AddtoBam.cwl/Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--cell-order-json" + }, + "id": "#AddtoBam.cwl/Cell_Order" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--corrected-mols-list", + "itemSeparator": "," + }, + "id": "#AddtoBam.cwl/Corrected_Mols" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#AddtoBam.cwl/Generate_Bam" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata-json" + }, + "id": "#AddtoBam.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#AddtoBam.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--target-gene-mapping" + }, + "id": "#AddtoBam.cwl/Target_Gene_Mapping" + } + ], + "arguments": [ + { + "prefix": "--bamIO-threads", + "valueFrom": "$(runtime.cores)" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*.bam" + }, + "id": "#AddtoBam.cwl/Annotated_Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AddtoBam.cwl/output" + } + ], + "id": "#AddtoBam.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "AlignmentAnalysis stage of the Rain pipeline annotates aligned reads and collects a myriad of metrics on the aligned reads. Additional annotation is performed to the reads\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 24000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "AlignmentAnalysisAndCountCB.sh" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--assay" + }, + "type": [ + "null", + "string" + ], + "id": "#AlignmentAnalysis.cwl/Assay" + }, + { + "inputBinding": { + "prefix": "--exclude-intronic-reads" + }, + "type": [ + "null", + "boolean" + ], + "id": "#AlignmentAnalysis.cwl/Exclude_Intronic_Reads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Extra_Seqs" + }, + { + "inputBinding": { + "prefix": "--gtf" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/GTF" + }, + { + "inputBinding": { + "prefix": "--threads" + }, + "type": [ + "null", + "int" + ], + "id": "#AlignmentAnalysis.cwl/Maximum_Threads" + }, + { + "inputBinding": { + "prefix": "--r2-bam", + "itemSeparator": "," + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#AlignmentAnalysis.cwl/R2_BAM" + }, + { + "inputBinding": { + "prefix": "--quality-metrics" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/ReadQualityMetrics" + }, + { + "inputBinding": { + "prefix": "--run-metadata" + }, + "type": "File", + "id": "#AlignmentAnalysis.cwl/Run_Metadata" + }, + { + "inputBinding": { + "prefix": "--transcript-length" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Transcript_Length" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*.annotated.*.bam" + }, + "id": "#AlignmentAnalysis.cwl/Annotated_Bam_Files" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": [ + "null", + "File" + ], + "id": "#AlignmentAnalysis.cwl/Logs" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_SeqMetrics.csv" + }, + "id": "#AlignmentAnalysis.cwl/Seq_Metrics" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*Sorted_Valid_Reads.csv.*" + }, + "id": "#AlignmentAnalysis.cwl/Sorted_Valid_Reads_CSV" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 30000; } return parseInt(JSON.parse(self[0].contents).num_bioproducts); }" + }, + "id": "#AlignmentAnalysis.cwl/num_bioproducts" + }, + { + "type": "int", + "outputBinding": { + "glob": "count_estimates.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 10000; } return parseInt(JSON.parse(self[0].contents).num_cell_estimate); }" + }, + "id": "#AlignmentAnalysis.cwl/num_cell_estimate" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).BCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_ig_reads" + }, + { + "type": "int", + "outputBinding": { + "glob": "num_vdj_reads.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return 0; } return parseInt(JSON.parse(self[0].contents).TCR); }" + }, + "id": "#AlignmentAnalysis.cwl/num_valid_tcr_reads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_BCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validIgReads" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_TCR_Valid_Reads.fastq.gz" + }, + "id": "#AlignmentAnalysis.cwl/validTcrReads" + } + ], + "id": "#AlignmentAnalysis.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_annotate_molecules.py" + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--umi-option" + }, + "id": "#AnnotateMolecules.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "string" + ], + "id": "#AnnotateMolecules.cwl/Assay" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#AnnotateMolecules.cwl/Run_Metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--valid-annot" + }, + "id": "#AnnotateMolecules.cwl/Valids" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "*_bioproduct_stats*.json" + }, + "id": "#AnnotateMolecules.cwl/Bioproduct_Stats_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_CellBiopSummary.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Cell_Biop_Summary_List" + }, + { + "type": "File", + "outputBinding": { + "glob": "*_Annotation_Molecule_corrected.csv.*" + }, + "id": "#AnnotateMolecules.cwl/Corrected_Mols_List" + }, + { + "type": "int", + "outputBinding": { + "glob": "stats.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_molecules)\n" + }, + "id": "#AnnotateMolecules.cwl/Total_Molecules" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#AnnotateMolecules.cwl/output" + } + ], + "id": "#AnnotateMolecules.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "boolean", + "id": "#Assay_Settings.cwl/AbSeq_Reference_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reads_RNA_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Reference_Archive_Present" + }, + { + "type": "boolean", + "id": "#Assay_Settings.cwl/Targeted_Reference_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_ATAC" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Assay_Settings.cwl/Assay_RNA" + } + ], + "expression": "${\n var assay_rna = null;\n var assay_atac = null;\n\n if (!inputs.Reads_ATAC_Present && !inputs.Reads_RNA_Present)\n {\n throw new Error('Invalid pipeline inputs: Please provide Reads for at least 1 of RNA or ATAC analysis.')\n }\n if (inputs.Targeted_Reference_Present && inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Do not provide both Targeted Reference and Reference Archive.')\n }\n if (!inputs.Targeted_Reference_Present && !inputs.AbSeq_Reference_Present && !inputs.Reference_Archive_Present) {\n throw new Error('Invalid pipeline inputs: Please provide either a Reference Archive or a Targeted Reference or an AbSeq Reference.')\n }\n\n if ( inputs.Reads_ATAC_Present )\n {\n assay_atac = \"ATAC\"\n }\n\n if (inputs.Reads_RNA_Present && inputs.Reference_Archive_Present) {\n assay_rna = \"WTA\"\n }\n else if (inputs.Reads_RNA_Present && (inputs.Targeted_Reference_Present || inputs.AbSeq_Reference_Present)) {\n assay_rna = \"Targeted\"\n }\n\n return ({Assay_RNA: assay_rna, Assay_ATAC: assay_atac})\n}", + "id": "#Assay_Settings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/_Generate_Bam" + } + ], + "outputs": [ + { + "type": [ + "null", + "boolean" + ], + "id": "#BamSettings.cwl/Generate_Bam" + } + ], + "expression": "${\n // the create bam flag defaults to false\n var generateBam = false;\n // the user can set this flag to true, to enable creation of the bam file.\n if (inputs._Generate_Bam != null) {\n generateBam = inputs._Generate_Bam;\n }\n return ({\n Generate_Bam: generateBam,\n });\n}", + "id": "#BamSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#BundleLogs.cwl/log_files" + } + ], + "outputs": [ + { + "type": "Directory", + "id": "#BundleLogs.cwl/logs_dir" + } + ], + "expression": "${\n /* shamelly cribbed from https://gist.github.com/jcxplorer/823878 */\n function uuid() {\n var uuid = \"\", i, random;\n for (i = 0; i < 32; i++) {\n random = Math.random() * 16 | 0;\n if (i == 8 || i == 12 || i == 16 || i == 20) {\n uuid += \"-\";\n }\n uuid += (i == 12 ? 4 : (i == 16 ? (random & 3 | 8) : random)).toString(16);\n }\n return uuid;\n }\n var listing = [];\n for (var i = 0; i < inputs.log_files.length; i++) {\n var log_file = inputs.log_files[i];\n /*\n Checking here for null in case a Node was skipped because of conditional execution.\n For e.g. Generate_Bam is used to skip the AddToBam, MergeBam and IndexBam nodes\n */\n if (log_file != null) {\n log_file.basename = uuid() + \"-\" + log_file.basename;\n listing.push(log_file);\n }\n }\n return ({\n logs_dir: {\n class: \"Directory\",\n basename: \"Logs\",\n listing: listing\n }\n });\n}", + "id": "#BundleLogs.cwl" + }, + { + "requirements": [ + { + "listing": [ + "$(inputs.Reference_Archive)" + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_check_references.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--abseq-reference" + }, + "id": "#CheckReference.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "inputBinding": { + "itemSeparator": ",", + "prefix": "--assay" + }, + "id": "#CheckReference.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--putative-cell-call" + }, + "id": "#CheckReference.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "id": "#CheckReference.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "inputBinding": { + "prefix": "--reference-archive" + }, + "id": "#CheckReference.cwl/Reference_Archive" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--sample-tags-version" + }, + "id": "#CheckReference.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--supplemental-reference" + }, + "id": "#CheckReference.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--targeted-reference" + }, + "id": "#CheckReference.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#CheckReference.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#CheckReference.cwl/Checked_Predefined_Peaks_Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "combined_extra_seq.fasta" + }, + "id": "#CheckReference.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "full-gene-list.json" + }, + "id": "#CheckReference.cwl/Full_Genes" + }, + { + "type": "File", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files/*.gtf" + }, + "id": "#CheckReference.cwl/GTF" + }, + { + "type": "Directory", + "outputBinding": { + "glob": "BD_Rhapsody_Reference_Files" + }, + "id": "#CheckReference.cwl/Index" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "target-gene.json" + }, + "id": "#CheckReference.cwl/Target_Gene_Mapping" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "transcript_length.json" + }, + "id": "#CheckReference.cwl/Transcript_Length" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#CheckReference.cwl/output" + } + ], + "id": "#CheckReference.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_generate_H5MU.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-cell-by-peak" + }, + "id": "#GenerateH5MU.cwl/Atac_Datatables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--atac-metrics" + }, + "id": "#GenerateH5MU.cwl/Atac_Metrics" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateH5MU.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateH5MU.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateH5MU.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateH5MU.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--metrics-summary" + }, + "id": "#GenerateH5MU.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peak_annotation" + }, + "id": "#GenerateH5MU.cwl/Peak_Annotation" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateH5MU.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateH5MU.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GenerateH5MU.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-files" + }, + "id": "#GenerateH5MU.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateH5MU.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateH5MU.cwl/VDJ_Per_Cell" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.h5mu" + }, + "id": "#GenerateH5MU.cwl/H5MU" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateH5MU.cwl/output" + } + ], + "id": "#GenerateH5MU.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "StepInputExpressionRequirement" + } + ], + "hints": [], + "baseCommand": "GenerateSeurat.R", + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-seurat-rds" + }, + "id": "#GenerateSeurat.cwl/ATAC_Seurat" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--bioproduct-stats" + }, + "id": "#GenerateSeurat.cwl/Bioproduct_Stats" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-experimental", + "itemSeparator": "," + }, + "id": "#GenerateSeurat.cwl/Cell_Type_Experimental" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--data-tables" + }, + "id": "#GenerateSeurat.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--coordinates-file-list" + }, + "id": "#GenerateSeurat.cwl/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--protein-aggregates-experimental" + }, + "id": "#GenerateSeurat.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--putative-cells-origin" + }, + "id": "#GenerateSeurat.cwl/Putative_Cells_Origin" + }, + { + "type": "File", + "loadContents": true, + "id": "#GenerateSeurat.cwl/Run_Metadata" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--sample-tag-csvs" + }, + "id": "#GenerateSeurat.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--sample-tag-calls" + }, + "id": "#GenerateSeurat.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell" + }, + "id": "#GenerateSeurat.cwl/VDJ_Per_Cell" + } + ], + "arguments": [ + { + "prefix": "--base-name", + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.rds" + }, + "id": "#GenerateSeurat.cwl/SeuratRDS" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.log" + }, + "id": "#GenerateSeurat.cwl/output" + } + ], + "id": "#GenerateSeurat.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 48000 + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_get_datatables.py" + ], + "inputs": [ + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--atac-cell-calling-data-file" + }, + "id": "#GetDataTable.cwl/ATAC_Cell_Calling_Input" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--bioproduct-stats-list" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats_List" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "itemSeparator": ",", + "prefix": "--cell-biop-summary-list" + }, + "id": "#GetDataTable.cwl/Cell_Biop_Summary_List" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--full-gene-list" + }, + "id": "#GetDataTable.cwl/Full_Genes" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#GetDataTable.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--seq-metrics" + }, + "id": "#GetDataTable.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "int" + } + ], + "id": "#GetDataTable.cwl/Total_Molecules" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_bioproducts" + }, + { + "type": [ + "null", + "int" + ], + "id": "#GetDataTable.cwl/num_cell_estimate" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_RSEC_MolsPerCell_MEX.zip" + }, + "id": "#GetDataTable.cwl/Biop_putative_data_table" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Bioproduct_Stats.csv" + }, + "id": "#GetDataTable.cwl/Bioproduct_Stats" + }, + { + "type": "File", + "outputBinding": { + "glob": "cell_order.json" + }, + "id": "#GetDataTable.cwl/Cell_Order" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cell_order_subsampled.json" + }, + "id": "#GetDataTable.cwl/Cell_Order_Subsampled" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*cell_type_experimental.csv" + }, + "id": "#GetDataTable.cwl/Cell_Type_Predictions" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_MEX.zip" + }, + "id": "#GetDataTable.cwl/Data_Tables" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "*_coordinates.csv" + }, + "id": "#GetDataTable.cwl/Dim_Reduction_Coord" + }, + { + "type": "File", + "outputBinding": { + "glob": "metrics-files.tar.gz" + }, + "id": "#GetDataTable.cwl/Metrics_tar" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Protein_Agg/*_Protein_Aggregates_Experimental.csv" + }, + "id": "#GetDataTable.cwl/Protein_Aggregates_Experimental" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "Cell_Label_Filtering/*_Putative_Cells_Origin.csv" + }, + "id": "#GetDataTable.cwl/Putative_Cells_Origin" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTag/*csv" + }, + "id": "#GetDataTable.cwl/SampleTag_CSVs" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "SampleTag/*_Sample_Tag_Calls.csv" + }, + "id": "#GetDataTable.cwl/SampleTag_Calls" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": "SampleTagArchives/*zip" + }, + "id": "#GetDataTable.cwl/SampleTag_perTagZips" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#GetDataTable.cwl/output" + } + ], + "id": "#GetDataTable.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "GetMachineResources gets available resources (current only the number of cpus) on the machine that is running the local deployment of the MIST pipeline.\n", + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_machine_resources.py" + ], + "inputs": [], + "outputs": [ + { + "type": "int", + "outputBinding": { + "glob": "resources.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).total_cpus_avail)" + }, + "id": "#GetMachineResources.cwl/Total_CPUs_Avail" + } + ], + "id": "#GetMachineResources.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/_MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/_NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/_Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/_VDJ_VGene_Evalue" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/AbSeq_UMI" + }, + { + "type": [ + "null", + "int" + ], + "id": "#InternalSettings.cwl/MinChunkSize" + }, + { + "type": [ + "null", + "long" + ], + "id": "#InternalSettings.cwl/NumRecordsPerSplit" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/Subsample_Sample_Tags" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#InternalSettings.cwl/Target_analysis" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#InternalSettings.cwl/VDJ_VGene_Evalue" + } + ], + "expression": "${\n var internalInputs = [\n '_AbSeq_UMI',\n '_MinChunkSize',\n '_NumRecordsPerSplit',\n '_Target_analysis',\n '_Subsample_Sample_Tags',\n '_VDJ_VGene_Evalue',\n '_VDJ_JGene_Evalue',\n ];\n var internalOutputs = {}\n for (var i = 0; i < internalInputs.length; i++) {\n var internalInput = internalInputs[i];\n var internalOutput = internalInput.slice(1); // remove leading underscore\n if (inputs.hasOwnProperty(internalInput)) {\n internalOutputs[internalOutput] = inputs[internalInput]; // if input specified, redirect to output\n } else {\n internalOutputs[internalOutput] = null; // if input not specified, provide a null\n }\n }\n return internalOutputs;\n}", + "id": "#InternalSettings.cwl" + }, + { + "class": "Workflow", + "label": "BD Rhapsody™ Sequence Analysis Pipeline", + "doc": "The BD Rhapsody™ assays are used to create sequencing libraries from single cell transcriptomes.\n\nAfter sequencing, the analysis pipeline takes the FASTQ files and a reference file for gene alignment. The pipeline generates molecular counts per cell, read counts per cell, metrics, and an alignment file.", + "requirements": [ + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "MultipleInputFeatureRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "AbSeq Reference", + "id": "#main/AbSeq_Reference" + }, + { + "type": [ + "null", + "int" + ], + "id": "#main/AbSeq_UMI" + }, + { + "label": "Cell Calling ATAC Algorithm", + "doc": "Specify the ATAC algorithm to be used for ATAC putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm", + "symbols": [ + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Basic", + "#main/Cell_Calling_ATAC_Algorithm/Cell_Calling_ATAC_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_ATAC_Algorithm" + }, + { + "label": "Cell Calling Bioproduct Algorithm", + "doc": "Specify the bioproduct algorithm to be used for mRNA/AbSeq putative cell calling. The Basic algorithm is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm", + "symbols": [ + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Basic", + "#main/Cell_Calling_Bioproduct_Algorithm/Cell_Calling_Bioproduct_Algorithm/Refined" + ] + } + ], + "id": "#main/Cell_Calling_Bioproduct_Algorithm" + }, + { + "label": "Cell Calling Data", + "doc": "Specify the data to be used for putative cell calling.\nThe default data for putative cell calling will be determined the following way:\n - If mRNA and ATAC Reads exist, mRNA_and_ATAC is the default.\n - If only ATAC Reads exist, ATAC is the default.\n - Otherwise, mRNA is the default.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Cell_Calling_Data/Cell_Calling_Data", + "symbols": [ + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA", + "#main/Cell_Calling_Data/Cell_Calling_Data/AbSeq", + "#main/Cell_Calling_Data/Cell_Calling_Data/ATAC", + "#main/Cell_Calling_Data/Cell_Calling_Data/mRNA_and_ATAC" + ] + } + ], + "id": "#main/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom STAR Params", + "doc": "Allows you to specify custom STAR aligner mapping parameters. Only the mapping parameters you provide here will be used with STAR, meaning that you must provide the complete list of parameters that you want to take effect. For reference, the parameters used by default in the pipeline are:\n\n 1. Short Reads: --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000\n 2. Long Reads: Same options as short reads + --seedPerReadNmax 10000\n\n\nExample input: --alignIntronMax 500000 --outFilterScoreMinOverLread 0 --limitOutSJcollapsed 2000000\n\nImportant:\n 1. This applies to fastqs provided in the Reads user input\n 2. Please do not specify any non-mapping related params like: --runThreadN, --genomeDir --outSAMtype, etc.\n 3. Please only use params supported by STAR version 2.7.10b\n", + "id": "#main/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "label": "Custom bwa-mem2 Params", + "doc": "Allows you to specify custom bwa-mem2 mapping parameters. Only the mapping parameters you provide here will be used with bwa-mem2, meaning that you must provide the complete list of parameters that you want to take effect. The pipeline uses program default mapping parameters.\n\nExample input: -k 15 -w 200 -r 2\n\nImportant:\n 1. This applies to fastqs provided in the Reads_ATAC user input\n 2. Please do not specify any non-mapping related params like: -C, -t, etc.\n 3. Please only use params supported by bwa-mem2 version 2.2.1\n", + "id": "#main/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "label": "Exact Cell Count", + "doc": "Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count", + "id": "#main/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Exclude Intronic Reads", + "doc": "By default, reads aligned to exons and introns are considered and represented in molecule counts. Including intronic reads may increase sensitivity, resulting in an increase in molecule counts and the number of genes per cell for both cellular and nuclei samples. Intronic reads may indicate unspliced mRNAs and are also useful, for example, in the study of nuclei and RNA velocity. When set to true, intronic reads will be excluded.", + "id": "#main/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Expected Cell Count", + "doc": "Optional. Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected.", + "id": "#main/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Generate Bam Output", + "doc": "Default: false. A Bam read alignment file contains reads from all the input libraries, but creating it can consume a lot of compute and disk resources. By setting this field to true, the Bam file will be created. This option is shared for both Bioproduct and ATAC libraries.\n", + "id": "#main/Generate_Bam" + }, + { + "type": [ + "null", + "boolean" + ], + "label": "Long Reads (>=650bp)", + "doc": "By default, we detect if there are any reads longer than 650bp and then flag QualCLAlign to use STARlong instead of STAR. This flag can be explicitly set if it is known in advance that there are reads longer than 650bp.\n", + "id": "#main/Long_Reads" + }, + { + "type": [ + "null", + "int" + ], + "label": "Maximum Number of Threads", + "doc": "The maximum number of threads to use in the pipeline. By default, all available cores are used.", + "id": "#main/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "label": "ATAC Predefined Peak Regions", + "doc": "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. Only applies to ATAC assays.", + "id": "#main/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads", + "doc": "FASTQ files from libraries that may include WTA mRNA, Targeted mRNA, AbSeq, Sample Multiplexing, and related technologies", + "id": "#main/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Reads-ATAC", + "doc": "FASTQ files from libraries generated using the ATAC assay protocol. Each lane of a library is expected to have 3 FASTQs - R1, R2 and I1/I2, where the index read contains the Cell Barcode and UMI sequence. Only applies to ATAC assays.", + "id": "#main/Reads_ATAC" + }, + { + "type": [ + "null", + "File" + ], + "label": "Reference Files Archive", + "id": "#main/Reference_Archive" + }, + { + "label": "Run Name", + "type": [ + "null", + "string" + ], + "doc": "This is a name for output files, for example Experiment1_Metrics_Summary.csv. Default if left empty is to name run based on a library. Any non-alpha numeric characters will be changed to a hyphen.", + "id": "#main/Run_Name" + }, + { + "label": "Sample Tags Version", + "doc": "The sample multiplexing kit version. This option should only be set for a multiplexed experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/Sample_Tags_Version/Sample_Tags_Version", + "symbols": [ + "#main/Sample_Tags_Version/Sample_Tags_Version/human", + "#main/Sample_Tags_Version/Sample_Tags_Version/hs", + "#main/Sample_Tags_Version/Sample_Tags_Version/mouse", + "#main/Sample_Tags_Version/Sample_Tags_Version/mm", + "#main/Sample_Tags_Version/Sample_Tags_Version/flex", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_includes_mrna", + "#main/Sample_Tags_Version/Sample_Tags_Version/nuclei_atac_only", + "#main/Sample_Tags_Version/Sample_Tags_Version/custom" + ] + } + ], + "id": "#main/Sample_Tags_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Supplemental Reference", + "id": "#main/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "label": "Sample Tag Names", + "doc": "Specify the Sample Tag number followed by - (hyphen) and a sample name to appear in the output files. For example: 4-Ramos. Should be alpha numeric, with + - and _ allowed. Any special characters: &, (), [], {}, <>, ?, | will be corrected to underscores. \n", + "id": "#main/Tag_Names" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Target_analysis" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "label": "Targeted Reference", + "id": "#main/Targeted_Reference" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for J gene", + "doc": "The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "label": "e-value threshold for V gene", + "doc": "The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001\n", + "id": "#main/VDJ_VGene_Evalue" + }, + { + "label": "VDJ Species Version", + "doc": "The VDJ species and chain types. This option should only be set for VDJ experiment.", + "type": [ + "null", + { + "type": "enum", + "name": "#main/VDJ_Version/VDJ_Version", + "symbols": [ + "#main/VDJ_Version/VDJ_Version/human", + "#main/VDJ_Version/VDJ_Version/hs", + "#main/VDJ_Version/VDJ_Version/mouse", + "#main/VDJ_Version/VDJ_Version/mm", + "#main/VDJ_Version/VDJ_Version/humanBCR", + "#main/VDJ_Version/VDJ_Version/humanTCR", + "#main/VDJ_Version/VDJ_Version/mouseBCR", + "#main/VDJ_Version/VDJ_Version/mouseTCR" + ] + } + ], + "id": "#main/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#main/Write_Filtered_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeATAC/ATAC_out", + "id": "#main/ATAC" + }, + { + "label": "BAM files and indices", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Bam" + }, + { + "label": "Bioproduct Statistics", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/Bioproduct_Stats" + }, + { + "label": "Data Tables", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/GetDataTable/Data_Tables", + "id": "#main/Data_Tables" + }, + { + "label": "Dimensionality Reduction Coordinates", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Dim_Reduction_Coord" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_ATAC" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/QualCLAlign_RNA/Failed_Reads_CSVs", + "id": "#main/Failed_Reads_CSVs_RNA" + }, + { + "label": "Scanpy-Muon H5MU File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateH5MU/H5MU", + "id": "#main/H5MU" + }, + { + "label": "Immune Cell Classification (Experimental)", + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/Immune_Cell_Classification(Experimental)" + }, + { + "label": "Pipeline Logs", + "type": "Directory", + "outputSource": "#main/BundleLogs/logs_dir", + "id": "#main/Logs" + }, + { + "label": "Metrics Summary", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Metrics_Summary", + "id": "#main/Metrics_Summary" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#main/MergeMultiplex/Multiplex_out", + "id": "#main/Multiplexing" + }, + { + "label": "Pipeline Report HTML", + "type": [ + "null", + "File" + ], + "outputSource": "#main/Metrics/Pipeline_Report_HTML", + "id": "#main/Pipeline_Report_HTML" + }, + { + "label": "Protein Aggregates (Experimental)", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/Protein_Aggregates_Experimental" + }, + { + "label": "Seurat RDS File", + "type": [ + "null", + "File" + ], + "outputSource": "#main/GenerateSeurat/SeuratRDS", + "id": "#main/Seurat" + }, + { + "label": "vdjCellsDatatable", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/vdjCellsDatatable" + }, + { + "label": "vdjCellsDatatableUncorrected", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "id": "#main/vdjCellsDatatableUncorrected" + }, + { + "label": "vdjDbecFilterImages", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDbecFilterImages", + "id": "#main/vdjDbecFilterImages" + }, + { + "label": "vdjDominantContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "id": "#main/vdjDominantContigsAIRR" + }, + { + "label": "vdjMetricsCsv", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjMetricsCsv", + "id": "#main/vdjMetricsCsv" + }, + { + "label": "vdjUnfilteredContigsAIRR", + "type": [ + "null", + "File" + ], + "outputSource": "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "id": "#main/vdjUnfilteredContigsAIRR" + } + ], + "steps": [ + { + "run": "#ATAC_Cell_by_Peak.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Cell_by_Peak/Assay" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Cell_by_Peak/Fragments" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/ATAC_Cell_by_Peak/GTF" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/ATAC_Cell_by_Peak/Peaks" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Cell_by_Peak/Run_Metadata" + }, + { + "source": "#main/QualCLAlign_ATAC/Transposase_Sites", + "id": "#main/ATAC_Cell_by_Peak/Transposase_Sites" + } + ], + "out": [ + "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "#main/ATAC_Cell_by_Peak/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Cell_by_Peak" + }, + { + "run": "#ATAC_Compile_Results.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/ATAC_Compile_Results/Assay" + }, + { + "source": "#main/GetDataTable/Biop_putative_data_table", + "id": "#main/ATAC_Compile_Results/Biop_putative_data_table" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/ATAC_Compile_Results/Cell_Order" + }, + { + "source": "#main/GetDataTable/Cell_Order_Subsampled", + "id": "#main/ATAC_Compile_Results/Cell_Order_Subsampled" + }, + { + "source": "#main/QualCLAlign_ATAC/Fragments", + "id": "#main/ATAC_Compile_Results/Fragments" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Initial_Seurat_RDS", + "id": "#main/ATAC_Compile_Results/Initial_Seurat_RDS" + }, + { + "source": "#main/GetDataTable/Metrics_tar", + "id": "#main/ATAC_Compile_Results/Input_Metrics_tar" + }, + { + "source": "#main/QualCLAlign_ATAC/Reference_Genome_Size", + "id": "#main/ATAC_Compile_Results/Reference_Genome_Size" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/ATAC_Compile_Results/Run_Metadata" + }, + { + "source": "#main/ATAC_Cell_by_Peak/Total_Fragment_Metrics", + "id": "#main/ATAC_Compile_Results/Total_Fragment_Metrics" + }, + { + "source": "#main/QualCLAlign_ATAC/UnifiedMetrics", + "id": "#main/ATAC_Compile_Results/Unified_Metrics" + } + ], + "out": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "#main/ATAC_Compile_Results/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/ATAC_Compile_Results" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/AddtoBam_ATAC/Assay" + }, + { + "source": [ + "#main/QualCLAlign_ATAC/BAMFiles" + ], + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_ATAC/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_ATAC/Cell_Order" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_ATAC/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_ATAC/SampleTag_Calls" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/AddtoBam_ATAC/Annotated_Bam", + "#main/AddtoBam_ATAC/output" + ], + "scatter": [ + "#main/AddtoBam_ATAC/Bam" + ], + "id": "#main/AddtoBam_ATAC" + }, + { + "run": "#AddtoBam.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AddtoBam_RNA/Assay" + }, + { + "source": "#main/AlignmentAnalysis/Annotated_Bam_Files", + "default": [ + "does_not_exist" + ], + "id": "#main/AddtoBam_RNA/Bam" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/AddtoBam_RNA/Cell_Order" + }, + { + "source": "#main/AnnotateMolecules/Corrected_Mols_List", + "id": "#main/AddtoBam_RNA/Corrected_Mols" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/AddtoBam_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AddtoBam_RNA/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/AddtoBam_RNA/SampleTag_Calls" + }, + { + "source": "#main/CheckReference/Target_Gene_Mapping", + "id": "#main/AddtoBam_RNA/Target_Gene_Mapping" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/AddtoBam_RNA/Annotated_Bam", + "#main/AddtoBam_RNA/output" + ], + "scatter": [ + "#main/AddtoBam_RNA/Bam" + ], + "id": "#main/AddtoBam_RNA" + }, + { + "run": "#AlignmentAnalysis.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AlignmentAnalysis/Assay" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/AlignmentAnalysis/Exclude_Intronic_Reads" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/AlignmentAnalysis/Extra_Seqs" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/AlignmentAnalysis/GTF" + }, + { + "source": "#main/Maximum_Threads", + "id": "#main/AlignmentAnalysis/Maximum_Threads" + }, + { + "source": "#main/QualCLAlign_RNA/BAMFiles", + "id": "#main/AlignmentAnalysis/R2_BAM" + }, + { + "source": "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "id": "#main/AlignmentAnalysis/ReadQualityMetrics" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AlignmentAnalysis/Run_Metadata" + }, + { + "source": "#main/CheckReference/Transcript_Length", + "id": "#main/AlignmentAnalysis/Transcript_Length" + } + ], + "out": [ + "#main/AlignmentAnalysis/Seq_Metrics", + "#main/AlignmentAnalysis/Annotated_Bam_Files", + "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "#main/AlignmentAnalysis/num_valid_ig_reads", + "#main/AlignmentAnalysis/num_valid_tcr_reads", + "#main/AlignmentAnalysis/validIgReads", + "#main/AlignmentAnalysis/validTcrReads", + "#main/AlignmentAnalysis/num_cell_estimate", + "#main/AlignmentAnalysis/num_bioproducts" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AlignmentAnalysis" + }, + { + "requirements": [ + { + "ramMin": 32000, + "class": "ResourceRequirement" + } + ], + "run": "#AnnotateMolecules.cwl", + "in": [ + { + "source": "#main/Internal_Settings/AbSeq_UMI", + "id": "#main/AnnotateMolecules/AbSeq_UMI" + }, + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/AnnotateMolecules/Assay" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/AnnotateMolecules/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Sorted_Valid_Reads_CSV", + "default": [ + "does_not_exist" + ], + "id": "#main/AnnotateMolecules/Valids" + } + ], + "out": [ + "#main/AnnotateMolecules/Bioproduct_Stats_List", + "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "#main/AnnotateMolecules/Corrected_Mols_List", + "#main/AnnotateMolecules/Total_Molecules", + "#main/AnnotateMolecules/output" + ], + "scatter": [ + "#main/AnnotateMolecules/Valids" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/AnnotateMolecules" + }, + { + "run": "#Assay_Settings.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/AbSeq_Reference_Present" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Assay_Settings/Reads_RNA_Present" + }, + { + "source": "#main/Reference_Archive", + "valueFrom": "${ if (self){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Reference_Archive_Present" + }, + { + "source": "#main/Targeted_Reference", + "valueFrom": "${ if (self && self.length > 0){ return true; } else{ return false; } }", + "id": "#main/Assay_Settings/Targeted_Reference_Present" + } + ], + "out": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "id": "#main/Assay_Settings" + }, + { + "label": "Bam Settings", + "run": "#BamSettings.cwl", + "in": [ + { + "source": "#main/Generate_Bam", + "id": "#main/Bam_Settings/_Generate_Bam" + } + ], + "out": [ + "#main/Bam_Settings/Generate_Bam" + ], + "id": "#main/Bam_Settings" + }, + { + "run": "#BundleLogs.cwl", + "in": [ + { + "source": [ + "#main/CheckReference/output", + "#main/GetDataTable/output", + "#main/ATAC_Cell_by_Peak/output", + "#main/ATAC_Compile_Results/output", + "#main/Metrics/output", + "#main/AddtoBam_RNA/output", + "#main/AddtoBam_ATAC/output", + "#main/AnnotateMolecules/output", + "#main/MergeBAM_RNA/log", + "#main/MergeBAM_ATAC/log", + "#main/GenerateH5MU/output", + "#main/GenerateSeurat/output", + "#main/Peak_Annotation/output" + ], + "pickValue": "all_non_null", + "linkMerge": "merge_flattened", + "id": "#main/BundleLogs/log_files" + } + ], + "out": [ + "#main/BundleLogs/logs_dir" + ], + "id": "#main/BundleLogs" + }, + { + "requirements": [ + { + "ramMin": 10000, + "class": "ResourceRequirement" + } + ], + "run": "#CheckReference.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/CheckReference/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/CheckReference/Assay" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/CheckReference/Cell_Calling_Data" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/CheckReference/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/CheckReference/Reference_Archive" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/CheckReference/Sample_Tags_Version" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/CheckReference/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/CheckReference/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/CheckReference/VDJ_Version" + } + ], + "out": [ + "#main/CheckReference/Index", + "#main/CheckReference/Extra_Seqs", + "#main/CheckReference/Full_Genes", + "#main/CheckReference/output", + "#main/CheckReference/Transcript_Length", + "#main/CheckReference/GTF", + "#main/CheckReference/Target_Gene_Mapping", + "#main/CheckReference/Checked_Predefined_Peaks", + "#main/CheckReference/Checked_Predefined_Peaks_Index" + ], + "id": "#main/CheckReference" + }, + { + "run": "#GenerateH5MU.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "id": "#main/GenerateH5MU/Atac_Datatables" + }, + { + "source": "#main/Metrics/Metrics_ATAC", + "id": "#main/GenerateH5MU/Atac_Metrics" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateH5MU/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateH5MU/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateH5MU/Dim_Reduction_Coord" + }, + { + "source": "#main/Metrics/Metrics_Summary", + "id": "#main/GenerateH5MU/Metrics_Summary" + }, + { + "source": "#main/Peak_Annotation/Peak_Annotation_TSV", + "id": "#main/GenerateH5MU/Peak_Annotation" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateH5MU/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateH5MU/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateH5MU/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateH5MU/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateH5MU/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateH5MU/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateH5MU/H5MU", + "#main/GenerateH5MU/output" + ], + "id": "#main/GenerateH5MU" + }, + { + "run": "#GenerateSeurat.cwl", + "in": [ + { + "source": "#main/ATAC_Compile_Results/ATAC_Seurat_RDS", + "id": "#main/GenerateSeurat/ATAC_Seurat" + }, + { + "source": "#main/GetDataTable/Bioproduct_Stats", + "id": "#main/GenerateSeurat/Bioproduct_Stats" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Cell_Type_Experimental" + }, + { + "source": "#main/GetDataTable/Data_Tables", + "id": "#main/GenerateSeurat/Data_Tables" + }, + { + "source": [ + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/ATAC_Compile_Results/ATAC_Dim_Reduction_Coord" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/GenerateSeurat/Dim_Reduction_Coord" + }, + { + "source": "#main/GetDataTable/Protein_Aggregates_Experimental", + "id": "#main/GenerateSeurat/Protein_Aggregates_Experimental" + }, + { + "source": "#main/GetDataTable/Putative_Cells_Origin", + "id": "#main/GenerateSeurat/Putative_Cells_Origin" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GenerateSeurat/Run_Metadata" + }, + { + "source": "#main/GetDataTable/SampleTag_CSVs", + "id": "#main/GenerateSeurat/SampleTag_CSVs" + }, + { + "source": "#main/GetDataTable/SampleTag_Calls", + "id": "#main/GenerateSeurat/SampleTag_Calls" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/GenerateSeurat/VDJ_Per_Cell" + } + ], + "out": [ + "#main/GenerateSeurat/SeuratRDS", + "#main/GenerateSeurat/output" + ], + "id": "#main/GenerateSeurat" + }, + { + "run": "#GetDataTable.cwl", + "in": [ + { + "source": "#main/ATAC_Cell_by_Peak/ATAC_Cell_Calling_Data", + "id": "#main/GetDataTable/ATAC_Cell_Calling_Input" + }, + { + "source": "#main/AnnotateMolecules/Bioproduct_Stats_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Bioproduct_Stats_List" + }, + { + "source": "#main/AnnotateMolecules/Cell_Biop_Summary_List", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Cell_Biop_Summary_List" + }, + { + "source": "#main/CheckReference/Full_Genes", + "id": "#main/GetDataTable/Full_Genes" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/GetDataTable/Run_Metadata" + }, + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/GetDataTable/Seq_Metrics" + }, + { + "source": "#main/AnnotateMolecules/Total_Molecules", + "pickValue": "all_non_null", + "id": "#main/GetDataTable/Total_Molecules" + }, + { + "source": "#main/AlignmentAnalysis/num_bioproducts", + "id": "#main/GetDataTable/num_bioproducts" + }, + { + "source": "#main/AlignmentAnalysis/num_cell_estimate", + "id": "#main/GetDataTable/num_cell_estimate" + } + ], + "out": [ + "#main/GetDataTable/Metrics_tar", + "#main/GetDataTable/Bioproduct_Stats", + "#main/GetDataTable/Cell_Order", + "#main/GetDataTable/Cell_Order_Subsampled", + "#main/GetDataTable/Cell_Type_Predictions", + "#main/GetDataTable/Data_Tables", + "#main/GetDataTable/Dim_Reduction_Coord", + "#main/GetDataTable/output", + "#main/GetDataTable/Protein_Aggregates_Experimental", + "#main/GetDataTable/Putative_Cells_Origin", + "#main/GetDataTable/SampleTag_Calls", + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips", + "#main/GetDataTable/Biop_putative_data_table" + ], + "id": "#main/GetDataTable" + }, + { + "label": "Get Machine Resources", + "run": "#GetMachineResources.cwl", + "in": [], + "out": [ + "#main/GetMachineResources/Total_CPUs_Avail" + ], + "id": "#main/GetMachineResources" + }, + { + "label": "Internal Settings", + "run": "#InternalSettings.cwl", + "in": [ + { + "source": "#main/AbSeq_UMI", + "id": "#main/Internal_Settings/_AbSeq_UMI" + }, + { + "source": "#main/Target_analysis", + "id": "#main/Internal_Settings/_Target_analysis" + }, + { + "source": "#main/VDJ_JGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_JGene_Evalue" + }, + { + "source": "#main/VDJ_VGene_Evalue", + "id": "#main/Internal_Settings/_VDJ_VGene_Evalue" + } + ], + "out": [ + "#main/Internal_Settings/AbSeq_UMI", + "#main/Internal_Settings/Target_analysis", + "#main/Internal_Settings/VDJ_VGene_Evalue", + "#main/Internal_Settings/VDJ_JGene_Evalue" + ], + "id": "#main/Internal_Settings" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeATAC/run/ATAC_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeATAC/run/ATAC_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.ATAC_Files.length; i++) {\n var fp = inputs.ATAC_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"ATAC_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Peaks", + "#main/Metrics/Metrics_ATAC", + "#main/ATAC_Compile_Results/ATAC_Data_Tables", + "#main/Peak_Annotation/Peak_Annotation_TSV" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeATAC/ATAC_Files" + } + ], + "out": [ + "#main/MergeATAC/ATAC_out" + ], + "id": "#main/MergeATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/MergeBAM_ATAC/Assay" + }, + { + "source": "#main/AddtoBam_ATAC/Annotated_Bam", + "id": "#main/MergeBAM_ATAC/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_ATAC/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_ATAC/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && inputs.Assay == \"ATAC\")", + "out": [ + "#main/MergeBAM_ATAC/Bam", + "#main/MergeBAM_ATAC/BamIndex", + "#main/MergeBAM_ATAC/log" + ], + "id": "#main/MergeBAM_ATAC" + }, + { + "run": "#MergeBAM.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/MergeBAM_RNA/Assay" + }, + { + "source": "#main/AddtoBam_RNA/Annotated_Bam", + "id": "#main/MergeBAM_RNA/BamFiles" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/MergeBAM_RNA/Generate_Bam" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/MergeBAM_RNA/Run_Metadata" + } + ], + "when": "$(inputs.Generate_Bam == true && (inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\"))", + "out": [ + "#main/MergeBAM_RNA/Bam", + "#main/MergeBAM_RNA/BamIndex", + "#main/MergeBAM_RNA/log" + ], + "id": "#main/MergeBAM_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": { + "items": [ + "null", + "File" + ], + "type": "array" + }, + "id": "#main/MergeMultiplex/run/SampleTag_Files" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#main/MergeMultiplex/run/Multiplex_out" + } + ], + "expression": "${\n var fp_array = [];\n for (var i = 0; i < inputs.SampleTag_Files.length; i++) {\n var fp = inputs.SampleTag_Files[i];\n if (fp != null) {\n fp_array.push(fp);\n }\n }\n return({\"Multiplex_out\": fp_array});\n}" + }, + "in": [ + { + "source": [ + "#main/GetDataTable/SampleTag_CSVs", + "#main/GetDataTable/SampleTag_perTagZips" + ], + "linkMerge": "merge_flattened", + "id": "#main/MergeMultiplex/SampleTag_Files" + } + ], + "out": [ + "#main/MergeMultiplex/Multiplex_out" + ], + "id": "#main/MergeMultiplex" + }, + { + "run": "#Metadata.cwl", + "in": [ + { + "source": "#main/AbSeq_Reference", + "id": "#main/Metadata_Settings/AbSeq_Reference" + }, + { + "source": [ + "#main/Assay_Settings/Assay_RNA", + "#main/Assay_Settings/Assay_ATAC" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Assay" + }, + { + "source": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_ATAC/Bead_Version" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Bead_Version" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Metadata_Settings/Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "id": "#main/Metadata_Settings/Cell_Calling_Data" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/Metadata_Settings/Custom_STAR_Params" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/Metadata_Settings/Custom_bwa_mem2_Params" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "id": "#main/Metadata_Settings/Exact_Cell_Count" + }, + { + "source": "#main/Misc_Settings/Exclude_Intronic_Reads", + "id": "#main/Metadata_Settings/Exclude_Intronic_Reads" + }, + { + "source": "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count", + "id": "#main/Metadata_Settings/Expected_Cell_Count" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/Metadata_Settings/Generate_Bam" + }, + { + "source": "#main/QualCLAlign_RNA/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries" + }, + { + "source": "#main/QualCLAlign_ATAC/Libraries", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Libraries_ATAC" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/Metadata_Settings/Long_Reads" + }, + { + "valueFrom": "BD Rhapsody Sequence Analysis Pipeline", + "id": "#main/Metadata_Settings/Pipeline_Name" + }, + { + "source": "#main/Version/version", + "id": "#main/Metadata_Settings/Pipeline_Version" + }, + { + "source": "#main/Predefined_ATAC_Peaks", + "id": "#main/Metadata_Settings/Predefined_ATAC_Peaks" + }, + { + "source": "#main/QualCLAlign_RNA/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads" + }, + { + "source": "#main/QualCLAlign_ATAC/ReadsList", + "pickValue": "all_non_null", + "id": "#main/Metadata_Settings/Reads_ATAC" + }, + { + "source": "#main/Reference_Archive", + "id": "#main/Metadata_Settings/Reference_Archive" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/Metadata_Settings/Run_Base_Name" + }, + { + "source": "#main/Name_Settings/Run_Name", + "id": "#main/Metadata_Settings/Run_Name" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tag_Names", + "id": "#main/Metadata_Settings/Sample_Tag_Names" + }, + { + "source": "#main/Multiplexing_Settings/Sample_Tags_Version", + "id": "#main/Metadata_Settings/Sample_Tags_Version" + }, + { + "source": "#main/Start_Time/Start_Time", + "id": "#main/Metadata_Settings/Start_Time" + }, + { + "source": "#main/Supplemental_Reference", + "id": "#main/Metadata_Settings/Supplemental_Reference" + }, + { + "source": "#main/Targeted_Reference", + "id": "#main/Metadata_Settings/Targeted_Reference" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/Metadata_Settings/VDJ_Version" + } + ], + "out": [ + "#main/Metadata_Settings/Run_Metadata" + ], + "id": "#main/Metadata_Settings" + }, + { + "requirements": [ + { + "ramMin": 4000, + "class": "ResourceRequirement" + } + ], + "run": "#Metrics.cwl", + "in": [ + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/Metrics/Cell_Type_Predictions" + }, + { + "source": [ + "#main/ATAC_Compile_Results/Metrics_tar", + "#main/GetDataTable/Metrics_tar" + ], + "linkMerge": "merge_flattened", + "pickValue": "first_non_null", + "id": "#main/Metrics/Metrics_tar" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Metrics/Run_Metadata" + }, + { + "source": "#main/VDJ_Compile_Results/vdjCellsDatatable", + "id": "#main/Metrics/vdjCellsDatatable" + }, + { + "source": "#main/VDJ_Compile_Results/vdjMetricsJson", + "id": "#main/Metrics/vdjMetricsJson" + } + ], + "out": [ + "#main/Metrics/Metrics_Summary", + "#main/Metrics/Metrics_Archive", + "#main/Metrics/Metrics_ATAC", + "#main/Metrics/Pipeline_Report_JSON", + "#main/Metrics/Pipeline_Report_HTML", + "#main/Metrics/output" + ], + "id": "#main/Metrics" + }, + { + "label": "Miscellaneous Settings", + "run": "#MiscSettings.cwl", + "in": [ + { + "source": "#main/Custom_STAR_Params", + "id": "#main/Misc_Settings/_Custom_STAR_Params" + }, + { + "source": "#main/Custom_bwa_mem2_Params", + "id": "#main/Misc_Settings/_Custom_bwa_mem2_Params" + }, + { + "source": "#main/Exclude_Intronic_Reads", + "id": "#main/Misc_Settings/_Exclude_Intronic_Reads" + }, + { + "source": "#main/Long_Reads", + "id": "#main/Misc_Settings/_Long_Reads" + } + ], + "out": [ + "#main/Misc_Settings/Exclude_Intronic_Reads", + "#main/Misc_Settings/Long_Reads", + "#main/Misc_Settings/Custom_STAR_Params", + "#main/Misc_Settings/Custom_bwa_mem2_Params" + ], + "id": "#main/Misc_Settings" + }, + { + "label": "Multiplexing Settings", + "run": "#MultiplexingSettings.cwl", + "in": [ + { + "source": "#main/Tag_Names", + "id": "#main/Multiplexing_Settings/_Sample_Tag_Names" + }, + { + "source": "#main/Sample_Tags_Version", + "id": "#main/Multiplexing_Settings/_Sample_Tags_Version" + } + ], + "out": [ + "#main/Multiplexing_Settings/Sample_Tag_Names", + "#main/Multiplexing_Settings/Sample_Tags_Version" + ], + "id": "#main/Multiplexing_Settings" + }, + { + "label": "Name Settings", + "run": "#NameSettings.cwl", + "in": [ + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/ATAC_Fastqs" + }, + { + "source": "#main/Reads", + "valueFrom": "${ var fastqs = []; if(self) { for(var i = 0; i < self.length; i++) { fastqs.push(self[i].basename); } } return fastqs; }", + "id": "#main/Name_Settings/Bioproduct_Fastqs" + }, + { + "source": "#main/Run_Name", + "id": "#main/Name_Settings/_Run_Name" + } + ], + "out": [ + "#main/Name_Settings/Run_Name", + "#main/Name_Settings/Run_Base_Name" + ], + "id": "#main/Name_Settings" + }, + { + "run": "#PeakAnnotation.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/Peak_Annotation/Assay" + }, + { + "source": "#main/CheckReference/GTF", + "id": "#main/Peak_Annotation/Gtf" + }, + { + "source": "#main/QualCLAlign_ATAC/Peaks", + "id": "#main/Peak_Annotation/Peaks_bed" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/Peak_Annotation/Run_Metadata" + } + ], + "out": [ + "#main/Peak_Annotation/Peak_Annotation_TSV", + "#main/Peak_Annotation/output" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/Peak_Annotation" + }, + { + "label": "Putative Cell Calling Settings", + "run": "#PutativeCellSettings.cwl", + "in": [ + { + "source": "#main/Cell_Calling_ATAC_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_ATAC_Algorithm" + }, + { + "source": "#main/Cell_Calling_Bioproduct_Algorithm", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "source": "#main/Cell_Calling_Data", + "id": "#main/Putative_Cell_Calling_Settings/_Cell_Calling_Data" + }, + { + "source": "#main/Exact_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Exact_Cell_Count" + }, + { + "source": "#main/Expected_Cell_Count", + "id": "#main/Putative_Cell_Calling_Settings/_Expected_Cell_Count" + }, + { + "source": "#main/Reads_ATAC", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_ATAC_Present" + }, + { + "source": "#main/Reads", + "valueFrom": "${ if (self && self.length > 0) { return true; } else { return false; } }", + "id": "#main/Putative_Cell_Calling_Settings/_Reads_RNA_Present" + } + ], + "out": [ + "#main/Putative_Cell_Calling_Settings/Cell_Calling_ATAC_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Bioproduct_Algorithm", + "#main/Putative_Cell_Calling_Settings/Cell_Calling_Data", + "#main/Putative_Cell_Calling_Settings/Exact_Cell_Count", + "#main/Putative_Cell_Calling_Settings/Expected_Cell_Count" + ], + "id": "#main/Putative_Cell_Calling_Settings" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_ATAC", + "id": "#main/QualCLAlign_ATAC/Assay" + }, + { + "source": "#main/Bam_Settings/Generate_Bam", + "id": "#main/QualCLAlign_ATAC/Generate_Bam" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_ATAC/Index" + }, + { + "source": "#main/CheckReference/Checked_Predefined_Peaks", + "id": "#main/QualCLAlign_ATAC/Predefined_ATAC_Peaks" + }, + { + "source": "#main/Reads_ATAC", + "id": "#main/QualCLAlign_ATAC/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_ATAC/Run_Base_Name" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_RNA" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_rna] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_rna == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_ATAC/Threads" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_ATAC/Write_Filtered_Reads" + }, + { + "source": "#main/Misc_Settings/Custom_bwa_mem2_Params", + "id": "#main/QualCLAlign_ATAC/bwa_mem2_Params" + } + ], + "out": [ + "#main/QualCLAlign_ATAC/Bead_Version", + "#main/QualCLAlign_ATAC/Libraries", + "#main/QualCLAlign_ATAC/ReadsList", + "#main/QualCLAlign_ATAC/BAMFiles", + "#main/QualCLAlign_ATAC/Fragments", + "#main/QualCLAlign_ATAC/Fragments_Index", + "#main/QualCLAlign_ATAC/Transposase_Sites", + "#main/QualCLAlign_ATAC/Transposase_Sites_Index", + "#main/QualCLAlign_ATAC/Peaks", + "#main/QualCLAlign_ATAC/Peaks_Index", + "#main/QualCLAlign_ATAC/QualCLAlignMetrics", + "#main/QualCLAlign_ATAC/UnifiedMetrics", + "#main/QualCLAlign_ATAC/Logs", + "#main/QualCLAlign_ATAC/Failed_Reads_CSVs", + "#main/QualCLAlign_ATAC/Reference_Genome_Size" + ], + "when": "$(inputs.Assay == \"ATAC\")", + "id": "#main/QualCLAlign_ATAC" + }, + { + "run": "#QualCLAlign.cwl", + "in": [ + { + "source": "#main/Assay_Settings/Assay_RNA", + "id": "#main/QualCLAlign_RNA/Assay" + }, + { + "source": "#main/CheckReference/Extra_Seqs", + "id": "#main/QualCLAlign_RNA/Extra_Seqs" + }, + { + "source": "#main/CheckReference/Index", + "id": "#main/QualCLAlign_RNA/Index" + }, + { + "source": "#main/Misc_Settings/Long_Reads", + "id": "#main/QualCLAlign_RNA/Long_Reads" + }, + { + "source": "#main/Reads", + "id": "#main/QualCLAlign_RNA/Reads" + }, + { + "source": "#main/Name_Settings/Run_Base_Name", + "id": "#main/QualCLAlign_RNA/Run_Base_Name" + }, + { + "source": "#main/Misc_Settings/Custom_STAR_Params", + "id": "#main/QualCLAlign_RNA/STAR_Params" + }, + { + "source": [ + "#main/Maximum_Threads", + "#main/GetMachineResources/Total_CPUs_Avail", + "#main/Assay_Settings/Assay_ATAC" + ], + "valueFrom": "${ var [max_threads, total_cpus_avail, assay_atac] = self;\nif (max_threads != null) { return max_threads; }\nif (assay_atac == null) { return total_cpus_avail; }\nif (total_cpus_avail >= 48) { return parseInt(total_cpus_avail / 2); } else { return total_cpus_avail; } }", + "id": "#main/QualCLAlign_RNA/Threads" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/QualCLAlign_RNA/VDJ_Version" + }, + { + "source": "#main/Write_Filtered_Reads", + "id": "#main/QualCLAlign_RNA/Write_Filtered_Reads" + } + ], + "out": [ + "#main/QualCLAlign_RNA/Bead_Version", + "#main/QualCLAlign_RNA/Libraries", + "#main/QualCLAlign_RNA/ReadsList", + "#main/QualCLAlign_RNA/BAMFiles", + "#main/QualCLAlign_RNA/QualCLAlignMetrics", + "#main/QualCLAlign_RNA/Logs", + "#main/QualCLAlign_RNA/Failed_Reads_CSVs" + ], + "when": "$(inputs.Assay == \"WTA\" || inputs.Assay == \"Targeted\")", + "id": "#main/QualCLAlign_RNA" + }, + { + "run": { + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [], + "outputs": [ + { + "type": "string", + "id": "#main/Start_Time/run/Start_Time" + } + ], + "expression": "${\n var today = new Date();\n var date = today.toDateString();\n var time = today.toLocaleTimeString('en-us', {timeZoneName: 'short'});\n return ({Start_Time: date + ' ' + time});\n} " + }, + "in": [], + "out": [ + "#main/Start_Time/Start_Time" + ], + "id": "#main/Start_Time" + }, + { + "run": "#VDJ_Analyze_Reads_IG.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanTCR\" && inputs.VDJ_Version != \"mouseTCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_IG/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_ig_reads", + "id": "#main/VDJ_Analyze_Reads_IG/Num_Valid_Reads_IG" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_IG/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validIgReads", + "id": "#main/VDJ_Analyze_Reads_IG/Valid_Reads_Fastq_IG" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_IG/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_IG" + }, + { + "run": "#VDJ_Analyze_Reads_TCR.cwl", + "when": "$(inputs.VDJ_Version != null && inputs.VDJ_Version != \"humanBCR\" && inputs.VDJ_Version != \"mouseBCR\")", + "in": [ + { + "source": "#main/Maximum_Threads", + "id": "#main/VDJ_Analyze_Reads_TCR/Maximum_Threads" + }, + { + "source": "#main/AlignmentAnalysis/num_valid_tcr_reads", + "id": "#main/VDJ_Analyze_Reads_TCR/Num_Valid_Reads_TCR" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Analyze_Reads_TCR/VDJ_Version" + }, + { + "source": "#main/AlignmentAnalysis/validTcrReads", + "id": "#main/VDJ_Analyze_Reads_TCR/Valid_Reads_Fastq_TCR" + } + ], + "out": [ + "#main/VDJ_Analyze_Reads_TCR/gatheredCalls" + ], + "id": "#main/VDJ_Analyze_Reads_TCR" + }, + { + "run": "#VDJ_Compile_Results.cwl", + "when": "$(inputs.VDJ_Version != null)", + "in": [ + { + "source": "#main/AlignmentAnalysis/Seq_Metrics", + "id": "#main/VDJ_Compile_Results/Seq_Metrics" + }, + { + "source": "#main/VDJ_Settings/VDJ_Version", + "id": "#main/VDJ_Compile_Results/VDJ_Version" + }, + { + "source": [ + "#main/GetDataTable/Cell_Type_Predictions", + "#main/ATAC_Compile_Results/Cell_Type_Predictions" + ], + "linkMerge": "merge_flattened", + "pickValue": "all_non_null", + "id": "#main/VDJ_Compile_Results/cellTypeMapping" + }, + { + "valueFrom": "$([])", + "id": "#main/VDJ_Compile_Results/chainsToIgnore" + }, + { + "source": "#main/Internal_Settings/VDJ_JGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueJgene" + }, + { + "source": "#main/Internal_Settings/VDJ_VGene_Evalue", + "id": "#main/VDJ_Compile_Results/evalueVgene" + }, + { + "source": "#main/VDJ_Analyze_Reads_IG/gatheredCalls", + "id": "#main/VDJ_Compile_Results/igCalls" + }, + { + "source": "#main/Metadata_Settings/Run_Metadata", + "id": "#main/VDJ_Compile_Results/metadata" + }, + { + "source": "#main/GetDataTable/Cell_Order", + "id": "#main/VDJ_Compile_Results/putativeCells" + }, + { + "source": "#main/VDJ_Analyze_Reads_TCR/gatheredCalls", + "id": "#main/VDJ_Compile_Results/tcrCalls" + } + ], + "out": [ + "#main/VDJ_Compile_Results/vdjCellsDatatable", + "#main/VDJ_Compile_Results/vdjCellsDatatableUncorrected", + "#main/VDJ_Compile_Results/vdjDominantContigsAIRR", + "#main/VDJ_Compile_Results/vdjUnfilteredContigsAIRR", + "#main/VDJ_Compile_Results/vdjMetricsJson", + "#main/VDJ_Compile_Results/vdjMetricsCsv", + "#main/VDJ_Compile_Results/vdjDbecFilterImages" + ], + "id": "#main/VDJ_Compile_Results" + }, + { + "label": "VDJ Settings", + "run": "#VDJ_Settings.cwl", + "in": [ + { + "source": "#main/VDJ_Version", + "id": "#main/VDJ_Settings/_VDJ_Version" + } + ], + "out": [ + "#main/VDJ_Settings/VDJ_Version" + ], + "id": "#main/VDJ_Settings" + }, + { + "run": "#Version.cwl", + "in": [], + "out": [ + "#main/Version/version" + ], + "id": "#main/Version" + } + ], + "id": "#main" + }, + { + "requirements": [ + { + "listing": [ + { + "entryname": "bam_files.txt", + "entry": "${\n function getBamsInSortedOrder(inputs) {\n // Create an associative array to hold the mapping from basename to full path\n var fileMap = {};\n\n // Extract basenames and map them to their full paths\n for (var i = 0; i < inputs.BamFiles.length; i++) {\n var file = inputs.BamFiles[i].path;\n var basename = file.split('/').pop();\n fileMap[basename] = file;\n }\n\n // Sort the basenames numerically\n // This works because all bams share the same prefix and have a numerical id :\n // i.e. foobar.0001.tagged.bam, foobar.0002.tagged.bam, foobar.0010.tagged.bam, etc.\n // so will be sorted by numerical ids\n var sortedBasenames = Object.keys(fileMap).sort(function(a, b) {\n return a.localeCompare(b, undefined, { numeric: true });\n });\n\n // Reconstruct the sorted full paths\n var sortedBamFiles = sortedBasenames.map(function(basename) {\n return fileMap[basename];\n });\n\n // Create a file of file names - 1 per line\n return sortedBamFiles.join('\\n');\n }\n\n // For ATAC we cat the bams so need them in a particular order\n if (inputs.Assay == \"ATAC\") {\n return getBamsInSortedOrder(inputs);\n }\n else {\n return inputs.BamFiles.map(function(file) {\n return file.path;\n }).join('\\n');\n }\n}", + "writable": false + } + ], + "class": "InitialWorkDirRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8, + "ramMin": 16000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "MergeBam.sh" + ], + "stderr": "merge_bam.log", + "inputs": [ + { + "type": "string", + "inputBinding": { + "position": 1 + }, + "id": "#MergeBAM.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#MergeBAM.cwl/BamFiles" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MergeBAM.cwl/Generate_Bam" + }, + { + "type": "File", + "loadContents": true, + "id": "#MergeBAM.cwl/Run_Metadata" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "${\n var st_version = JSON.parse(inputs.Run_Metadata.contents).Sample_Tags_Version\n if (st_version)\n {\n return st_version\n } else\n {\n return \"None\"\n }\n}" + }, + { + "position": 3, + "valueFrom": "$(JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name)" + }, + { + "position": 4, + "valueFrom": "$(runtime.cores)" + }, + { + "position": 5, + "valueFrom": "bam_files.txt" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam\" }" + }, + "id": "#MergeBAM.cwl/Bam" + }, + { + "type": "File", + "outputBinding": { + "glob": "${ return \"*\" + JSON.parse(inputs.Run_Metadata.contents).Run_Base_Name + \"*.bam.bai\" }" + }, + "id": "#MergeBAM.cwl/BamIndex" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#MergeBAM.cwl/log" + } + ], + "id": "#MergeBAM.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/AbSeq_Reference" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "id": "#Metadata.cwl/Assay" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#Metadata.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#Metadata.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "id": "#Metadata.cwl/Bead_Version" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#Metadata.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#Metadata.cwl/Expected_Cell_Count" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#Metadata.cwl/Generate_Bam" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Libraries_ATAC" + }, + { + "type": "string", + "id": "#Metadata.cwl/Long_Reads" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Name" + }, + { + "type": "string", + "id": "#Metadata.cwl/Pipeline_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#Metadata.cwl/Predefined_ATAC_Peaks" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Reads_ATAC" + }, + { + "type": [ + "null", + "File", + "Directory" + ], + "id": "#Metadata.cwl/Reference_Archive" + }, + { + "type": "string", + "id": "#Metadata.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Run_Name" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#Metadata.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Sample_Tags_Version" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/Start_Time" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Supplemental_Reference" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#Metadata.cwl/Targeted_Reference" + }, + { + "type": [ + "null", + "string" + ], + "id": "#Metadata.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": "File", + "outputBinding": { + "glob": "run_metadata.json" + }, + "id": "#Metadata.cwl/Run_Metadata" + } + ], + "stdout": "run_metadata.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var metadata = inputs;\n var all_bv = {};\n var customer_bv = \"Original (V1)\";\n var detected_bv = \"V1\";\n for (var i = 0; i < inputs.Bead_Version.length; i++) {\n var BeadVer = inputs.Bead_Version[i];\n var Library = BeadVer[\"Library\"];\n var bead_version = BeadVer[\"bead_version\"];\n all_bv[Library] = bead_version \n var short_bv = bead_version.substring(0, 5);\n if (short_bv == \"Enh\") {\n customer_bv = \"Enhanced\";\n detected_bv = \"Enh\";\n }\n else if (short_bv == \"EnhV2\") {\n customer_bv = \"Enhanced V2/V3\";\n detected_bv = \"EnhV2\";\n }\n }\n metadata[\"Bead_Version\"] = all_bv;\n metadata[\"Bead_Version_Detected\"] = detected_bv;\n\n var pipeline_name = inputs.Pipeline_Name;\n var version = inputs.Pipeline_Version;\n var time = inputs.Start_Time;\n var libraries = inputs.Libraries;\n if(libraries == null){\n libraries = [\"None\"];\n }\n var libraries_atac = inputs.Libraries_ATAC\n if(libraries_atac == null){\n libraries_atac = [\"None\"];\n }\n\n var i = 0;\n var refs_mrna_inputs = [];\n var mrna_name = \"mRNA Reference\";\n if (inputs.Targeted_Reference != null) {\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Targeted_Reference);\n mrna_name = \"Targeted Reference\";\n }\n if(inputs.Reference_Archive != null){\n refs_mrna_inputs = refs_mrna_inputs.concat(inputs.Reference_Archive);\n mrna_name = \"Reference Archive\";\n }\n var refs_mrna = [];\n if (refs_mrna_inputs.length > 0) {\n for (i = 0; i < refs_mrna_inputs.length; i++) {\n if (refs_mrna_inputs[i] != null) {\n refs_mrna.push(refs_mrna_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_mrna = [\"None\"];\n }\n\n var refs_abseq_inputs = [];\n if (inputs.AbSeq_Reference != null) {\n refs_abseq_inputs = refs_abseq_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_abseq = [];\n if (refs_abseq_inputs.length > 0) {\n for (i = 0; i < refs_abseq_inputs.length; i++) {\n if (refs_abseq_inputs[i] != null) {\n refs_abseq.push(refs_abseq_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_abseq = [\"None\"];\n }\n\n var refs_supp_inputs = [];\n if (inputs.Supplemental_Reference != null) {\n refs_supp_inputs = refs_supp_inputs.concat(inputs.AbSeq_Reference);\n }\n var refs_supp = [];\n if (refs_supp_inputs.length > 0) {\n for (i = 0; i < refs_supp_inputs.length; i++) {\n if (refs_supp_inputs[i] != null) {\n refs_supp.push(refs_supp_inputs[i][\"basename\"]);\n }\n }\n }\n else {\n refs_supp = [\"None\"];\n }\n\n if (inputs.Predefined_ATAC_Peaks != null) {\n var predef_atac_peaks = inputs.Predefined_ATAC_Peaks[\"basename\"];\n } else {\n var predef_atac_peaks = \"None\";\n }\n\n var parameters = [];\n if(inputs.Sample_Tags_Version != null){\n var tags = \"Sample Tag Version: \" + inputs.Sample_Tags_Version;\n } else{ \n var tags = \"Sample Tag Version: None\";\n }\n parameters.push(tags);\n\n if(inputs.Sample_Tag_Names != null){\n var tag_names = inputs.Sample_Tag_Names.join(\"; \")\n var tag_list = \"Sample Tag Names: \" + tag_names;\n } else{\n var tag_list = \"Sample Tag Names: None\";\n }\n parameters.push(tag_list);\n \n if(inputs.VDJ_Version != null){\n var vdj = \"VDJ Version: \" + inputs.VDJ_Version;\n } else{ \n var vdj = \"VDJ Version: None\";\n }\n parameters.push(vdj)\n\n if (inputs.Cell_Calling_Data == 0) {\n var call = \"Putative Cell Calling Data: mRNA\";\n } else if (inputs.Cell_Calling_Data == 1) {\n var call = \"Putative Cell Calling Data: AbSeq\";\n } else if (inputs.Cell_Calling_Data == 2) {\n var call = \"Putative Cell Calling Data: mRNA_and_AbSeq\";\n } else if (inputs.Cell_Calling_Data == 3) {\n var call = \"Putative Cell Calling Data: mRNA_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 4) {\n var call = \"Putative Cell Calling Data: AbSeq_and_ATAC\";\n } else if (inputs.Cell_Calling_Data == 5) {\n var call = \"Putative Cell Calling Data: ATAC\";\n } else {\n var call = \"Putative Cell Calling Data: None\";\n }\n parameters.push(call)\n\n if (inputs.Cell_Calling_Bioproduct_Algorithm != null) {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: \" + inputs.Cell_Calling_Bioproduct_Algorithm;\n } else {\n var bioproduct_alg = \"Bioproduct Cell Calling Algorithm: None\";\n }\n parameters.push(bioproduct_alg)\n\n if (inputs.Cell_Calling_ATAC_Algorithm != null) {\n var atac_alg = \"ATAC Cell Calling Algorithm: \" + inputs.Cell_Calling_ATAC_Algorithm;\n } else {\n var atac_alg = \"ATAC Cell Calling Algorithm: None\";\n }\n parameters.push(atac_alg)\n\n if(inputs.Exclude_Intronic_Reads){\n var introns = \"Exclude Intronic Reads: On\";\n } else{\n var introns = \"Exclude Intronic Reads: Off\";\n }\n parameters.push(introns)\n\n if(inputs.Generate_Bam){\n var generateBam = \"Generate Bam: On\";\n } else{\n var generateBam = \"Generate Bam: Off\";\n }\n parameters.push(generateBam)\n\n if(inputs.Exact_Cell_Count != null){\n var exactCells = \"Exact Cell Count: \" + inputs.Exact_Cell_Count;\n } else{\n var exactCells = \"Exact Cell Count: None\";\n }\n parameters.push(exactCells)\n\n if(inputs.Expected_Cell_Count != null){\n var expectedCells = \"Expected Cell Count: \" + inputs.Expected_Cell_Count;\n } else{\n var expectedCells = \"Expected Cell Count: None\";\n }\n parameters.push(expectedCells);\n\n var longReads = \"Long Reads: \" + inputs.Long_Reads;\n parameters.push(longReads);\n\n if (inputs.Custom_STAR_Params != null)\n {\n var starParams = \"Custom STAR Params: \" + inputs.Custom_STAR_Params;\n } else {\n var starParams = \"Custom STAR Params: None\"; \n }\n parameters.push(starParams);\n\n if (inputs.Custom_bwa_mem2_Params != null)\n {\n var bwaParams = \"Custom bwa-mem2 Params: \" + inputs.Custom_bwa_mem2_Params;\n } else {\n var bwaParams = \"Custom bwa-mem2 Params: None\";\n }\n parameters.push(bwaParams);\n\n var run_name = inputs.Run_Name;\n var run_base_name = inputs.Run_Base_Name;\n\n var header = [\"####################\"];\n header.push(\"## \" + pipeline_name + \" Version \" + version);\n header.push(\"## Analysis Date - \" + time);\n header.push(\"## Libraries - Bioproduct Libraries: \" + libraries.join('; ') + \" | ATAC Libraries: \" + libraries_atac.join('; ') + \" | Bead version detected: \" + customer_bv);\n header.push(\"## References - \" + mrna_name + \": \" + refs_mrna.join('; ') + \" | AbSeq Reference: \" + refs_abseq.join('; ') + \" | Supplemental Reference: \" + refs_supp.join('; ') + \" | ATAC Predefined Peak Regions: \" + predef_atac_peaks);\n header.push(\"## Parameters - \" + parameters.join(' | '));\n header.push(\"####################\");\n metadata[\"Output_Header\"] = header;\n metadata[\"Run_Name\"] = run_name \n metadata[\"Run_Base_Name\"] = run_base_name;\n\n var metadata_json = JSON.stringify(metadata, null, 2);\n\n return metadata_json;\n}\n" + } + ], + "id": "#Metadata.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_metrics.py" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "prefix": "--cell-type-file", + "itemSeparator": "," + }, + "id": "#Metrics.cwl/Cell_Type_Predictions" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--metrics-tar" + }, + "id": "#Metrics.cwl/Metrics_tar" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#Metrics.cwl/Run_Metadata" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-per-cell-file" + }, + "id": "#Metrics.cwl/vdjCellsDatatable" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--vdj-metrics-fp" + }, + "id": "#Metrics.cwl/vdjMetricsJson" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputBinding": { + "glob": [ + "*_ATAC_Metrics.json", + "*_ATAC_Metrics.csv" + ] + }, + "id": "#Metrics.cwl/Metrics_ATAC" + }, + { + "type": "File", + "outputBinding": { + "glob": "internal-metrics-archive.tar.gz" + }, + "id": "#Metrics.cwl/Metrics_Archive" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Metrics_Summary.csv" + }, + "id": "#Metrics.cwl/Metrics_Summary" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.html" + }, + "id": "#Metrics.cwl/Pipeline_Report_HTML" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_Pipeline_Report.json" + }, + "id": "#Metrics.cwl/Pipeline_Report_JSON" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#Metrics.cwl/output" + } + ], + "id": "#Metrics.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/_Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/_Long_Reads" + } + ], + "outputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_STAR_Params" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Custom_bwa_mem2_Params" + }, + { + "type": [ + "null", + "boolean" + ], + "id": "#MiscSettings.cwl/Exclude_Intronic_Reads" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MiscSettings.cwl/Long_Reads" + } + ], + "expression": "${\n // the exclude intronic reads flag defaults to false\n var excludeIntronicReads = false;\n // the user can set the flag to exclude intronic reads\n if (inputs._Exclude_Intronic_Reads) {\n excludeIntronicReads = inputs._Exclude_Intronic_Reads;\n }\n\n // Use_Long_Reads default is autodetect, which happens in CheckFastqs\n // User can set this explicitly true or false\n // Convert boolean results to string. null -> \"auto\", true -> \"true\", false -> \"false\"\n var longReads = \"auto\";\n if (inputs._Long_Reads !== null) {\n if (inputs._Long_Reads) {\n longReads = \"true\";\n }\n else {\n longReads = \"false\";\n }\n }\n\n return ({\n Exclude_Intronic_Reads: excludeIntronicReads,\n Long_Reads: longReads,\n Custom_STAR_Params: inputs._Custom_STAR_Params,\n Custom_bwa_mem2_Params: inputs._Custom_bwa_mem2_Params\n });\n\n\n}", + "id": "#MiscSettings.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": "string", + "default": "Targeted", + "id": "#MultiplexingSettings.cwl/Assay" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tag_Names" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#MultiplexingSettings.cwl/_Sample_Tags_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "id": "#MultiplexingSettings.cwl/Sample_Tag_Names" + }, + { + "type": [ + "null", + "string" + ], + "id": "#MultiplexingSettings.cwl/Sample_Tags_Version" + } + ], + "expression": "${\n var enumifiedSampleTagsVersion = null;\n if (inputs._Sample_Tags_Version) {\n var _Sample_Tags_Version = inputs._Sample_Tags_Version.toLowerCase();\n\n if (_Sample_Tags_Version.indexOf('human') >= 0 || _Sample_Tags_Version === 'hs')\n {\n enumifiedSampleTagsVersion = 'hs';\n }\n else if (_Sample_Tags_Version.indexOf('mouse') >= 0 || _Sample_Tags_Version === 'mm')\n {\n enumifiedSampleTagsVersion = 'mm';\n }\n else if (_Sample_Tags_Version.indexOf('flex') >= 0)\n {\n enumifiedSampleTagsVersion = 'flex';\n }\n else if (_Sample_Tags_Version.indexOf('nuclei') >= 0)\n {\n if (_Sample_Tags_Version.indexOf('atac') >= 0)\n {\n enumifiedSampleTagsVersion = 'nuclei_atac_only';\n }\n else\n {\n enumifiedSampleTagsVersion = 'nuclei_includes_mrna';\n }\n }\n else if (_Sample_Tags_Version === 'no multiplexing')\n {\n enumifiedSampleTagsVersion = null;\n }\n else\n {\n throw new Error(\"Cannot parse Sample Tag Version: \" + inputs._Sample_Tags_Version);\n }\n }\n var newTagNames = null;\n if (inputs._Sample_Tag_Names) {\n var listTagNames = inputs._Sample_Tag_Names\n var newTagNames = []\n for (var num in listTagNames) {\n var tag = listTagNames[num].replace(/[^A-Za-z0-9-+]/g,\"_\");\n newTagNames.push(tag);\n }\n } \n return ({\n Sample_Tag_Names: newTagNames,\n Sample_Tags_Version: enumifiedSampleTagsVersion\n });\n}", + "id": "#MultiplexingSettings.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "NameSettings sets the Run_Name variable that is used as a common prefix to name output files. If the user has specified a Run_Name, it is cleaned up or one is set based on the Bioproduct/ATAC fastq filenames.\n", + "hints": [], + "baseCommand": [ + "python", + "-c" + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/ATAC_Fastqs" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "default": [], + "id": "#NameSettings.cwl/Bioproduct_Fastqs" + }, + { + "type": [ + "null", + "string" + ], + "default": "", + "id": "#NameSettings.cwl/_Run_Name" + } + ], + "arguments": [ + { + "position": 2, + "valueFrom": "import sys\nfrom mist.apps import CheckFastqs\nCheckFastqs.write_run_name_json(sys.argv[1], sys.argv[2], sys.argv[3])\n", + "shellQuote": true + }, + { + "position": 3, + "valueFrom": "$(inputs._Run_Name)", + "shellQuote": true + }, + { + "position": 4, + "valueFrom": "$(inputs.Bioproduct_Fastqs.join(\",\"))", + "shellQuote": true + }, + { + "position": 5, + "valueFrom": "$(inputs.ATAC_Fastqs.join(\",\"))", + "shellQuote": true + } + ], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "run_base_name.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents)['Run_Base_Name'])" + }, + "id": "#NameSettings.cwl/Run_Base_Name" + }, + { + "type": [ + "null", + "string" + ], + "outputBinding": { + "outputEval": "$(inputs._Run_Name)" + }, + "id": "#NameSettings.cwl/Run_Name" + } + ], + "id": "#NameSettings.cwl" + }, + { + "requirements": [ + { + "class": "ResourceRequirement", + "ramMin": 16000 + } + ], + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_peak_annotation.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--gtf" + }, + "id": "#PeakAnnotation.cwl/Gtf" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--peaks_bed_file" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#PeakAnnotation.cwl/Peaks_bed" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--run-metadata" + }, + "id": "#PeakAnnotation.cwl/Run_Metadata" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*.tsv.gz" + }, + "id": "#PeakAnnotation.cwl/Peak_Annotation_TSV" + }, + { + "type": "File", + "outputBinding": { + "glob": "*.log" + }, + "id": "#PeakAnnotation.cwl/output" + } + ], + "id": "#PeakAnnotation.cwl" + }, + { + "class": "CommandLineTool", + "baseCommand": "echo", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "id": "#PutativeCellSettings.cwl/_Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "id": "#PutativeCellSettings.cwl/_Expected_Cell_Count" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_ATAC_Present" + }, + { + "type": "boolean", + "id": "#PutativeCellSettings.cwl/_Reads_RNA_Present" + } + ], + "outputs": [ + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_ATAC_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_ATAC_Algorithm" + }, + { + "type": [ + "null", + "Any" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Bioproduct_Algorithm; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Bioproduct_Algorithm" + }, + { + "type": "int", + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Cell_Calling_Data; }" + }, + "id": "#PutativeCellSettings.cwl/Cell_Calling_Data" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Exact_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Exact_Cell_Count" + }, + { + "type": [ + "null", + "int" + ], + "outputBinding": { + "glob": "putative_cell_settings.json", + "loadContents": true, + "outputEval": "${ return JSON.parse(self[0].contents).Expected_Cell_Count; }" + }, + "id": "#PutativeCellSettings.cwl/Expected_Cell_Count" + }, + { + "type": "File", + "outputBinding": { + "glob": "putative_cell_settings.json" + }, + "id": "#PutativeCellSettings.cwl/PutativeCellSettings" + } + ], + "stdout": "putative_cell_settings.json", + "arguments": [ + { + "prefix": "" + }, + { + "shellQuote": true, + "valueFrom": "${\n var settings = inputs;\n var errorMessage = \"No error\";\n\n var cellCallingATACAlg = null;\n // the default cell calling algorithm for ATAC is basic\n if (inputs._Reads_ATAC_Present) {\n cellCallingATACAlg = \"Basic\";\n }\n // the user can choose the ATAC cell calling algorithm\n if (inputs._Cell_Calling_ATAC_Algorithm) {\n cellCallingATACAlg = inputs._Cell_Calling_ATAC_Algorithm;\n }\n\n var cellCallingBioproductAlg = null;\n // the default cell calling algorithm for bioproducts is basic\n if (inputs._Reads_RNA_Present) {\n cellCallingBioproductAlg = \"Basic\";\n }\n // the user can choose the bioproducts cell calling algorithm\n if (inputs._Cell_Calling_Bioproduct_Algorithm) {\n cellCallingBioproductAlg = inputs._Cell_Calling_Bioproduct_Algorithm;\n }\n\n // the default cell calling data depends on the data that is provided\n // the overall default is mRNA data\n var cellCallingDataInt = 0;\n // if mRNA and ATAC reads are present, then default to joint cell calling\n if (inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 3;\n }\n // if no WTA data is present, but ATAC data is, then default to ATAC\n else if (!inputs._Reads_RNA_Present && inputs._Reads_ATAC_Present) {\n cellCallingDataInt = 5;\n }\n\n // convert the Cell_Calling_Data from a string to an integer\n if (inputs._Cell_Calling_Data) {\n if (inputs._Cell_Calling_Data === \"mRNA\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA\" was selected but no mRNA Reads were provided.';\n } else {\n cellCallingDataInt = 0;\n }\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq\" was selected but no AbSeq Reads were provided.';\n }\n cellCallingDataInt = 1;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_AbSeq\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_AbSeq\" was selected but no mRNA/AbSeq Reads were provided.';\n }\n cellCallingDataInt = 2;\n }\n else if (inputs._Cell_Calling_Data === \"mRNA_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no mRNA Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"mRNA_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 3;\n }\n else if (inputs._Cell_Calling_Data === \"AbSeq_and_ATAC\") {\n if (!inputs._Reads_RNA_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no AbSeq Reads were provided.';\n } else if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"AbSeq_and_ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 4;\n }\n else if (inputs._Cell_Calling_Data === \"ATAC\") {\n if (!inputs._Reads_ATAC_Present) {\n errorMessage = 'The \"Cell Calling Data\" option \"ATAC\" was selected but no ATAC Reads were provided.';\n }\n cellCallingDataInt = 5;\n }\n }\n // check the exact cell count\n if (inputs._Exact_Cell_Count) {\n if (inputs._Exact_Cell_Count < 1) {\n errorMessage = \"Exact cell count must be an integer greater than 0, value received: \" + inputs._Exact_Cell_Count;\n }\n }\n // check if there is an error\n if (errorMessage != \"No error\") {\n // If there is an error, force CWL to show it:\n // - \"Cell_Calling_Data\" is a required output\n // - setting it to null will cause a CWL error\n // - the error message will be shown in the json\n cellCallingDataInt = null;\n }\n\n settings[\"Cell_Calling_ATAC_Algorithm\"] = cellCallingATACAlg;\n settings[\"Cell_Calling_Bioproduct_Algorithm\"] = cellCallingBioproductAlg;\n settings[\"Cell_Calling_Data\"] = cellCallingDataInt;\n settings[\"Expected_Cell_Count\"] = inputs._Expected_Cell_Count;\n settings[\"Exact_Cell_Count\"] = inputs._Exact_Cell_Count;\n settings[\"Error\"] = errorMessage;\n\n var settings_json = JSON.stringify(settings, null, 2);\n\n return settings_json;\n }\n" + } + ], + "id": "#PutativeCellSettings.cwl" + }, + { + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "doc": "CheckFastqs does several quality control routines including: ensuring that read pair file names are formatted correctly and contain a read pair mate; QualCLAlign stage of the Rain pipeline overlaps read pairs and then performs a series of filters and mappings to reduce valid reads into a single FastQ file to be fed into the aligner. The R2 reads are annotated with cell index and UMI information derived from the R1 read.\n", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": "${ if(inputs.Threads){ return inputs.Threads; } else{ return 8; } }", + "ramMin": 48000 + } + ], + "class": "CommandLineTool", + "baseCommand": [ + "mist_run_qualclalign.py" + ], + "inputs": [ + { + "inputBinding": { + "prefix": "--alignment-compression-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Alignment_Compression_threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--assay" + }, + "id": "#QualCLAlign.cwl/Assay" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--bgzf-threads" + }, + "id": "#QualCLAlign.cwl/BGZF_Threads" + }, + { + "inputBinding": { + "prefix": "--extra-seqs" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Extra_Seqs" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--split-atac-bam" + }, + "id": "#QualCLAlign.cwl/Generate_Bam" + }, + { + "inputBinding": { + "prefix": "--index" + }, + "type": "Directory", + "id": "#QualCLAlign.cwl/Index" + }, + { + "inputBinding": { + "prefix": "--use-star-long" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Long_Reads" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "prefix": "--predefined-peaks" + }, + "secondaryFiles": { + "pattern": ".tbi" + }, + "id": "#QualCLAlign.cwl/Predefined_ATAC_Peaks" + }, + { + "inputBinding": { + "prefix": "--reader-annotation-threads" + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/Reader_Annotation_Threads" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--reads", + "itemSeparator": "," + }, + "id": "#QualCLAlign.cwl/Reads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--run-name" + }, + "id": "#QualCLAlign.cwl/Run_Base_Name" + }, + { + "inputBinding": { + "prefix": "--star-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/STAR_Params" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--threads" + }, + "id": "#QualCLAlign.cwl/Threads" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version" + }, + "id": "#QualCLAlign.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "boolean" + ], + "inputBinding": { + "prefix": "--write-filtered-read-pairs" + }, + "id": "#QualCLAlign.cwl/Write_Filtered_Reads" + }, + { + "inputBinding": { + "prefix": "--bwa-mem2-params", + "shellQuote": true + }, + "type": [ + "null", + "string" + ], + "id": "#QualCLAlign.cwl/bwa_mem2_Params" + } + ], + "outputs": [ + { + "outputBinding": { + "glob": "*.bam" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/BAMFiles" + }, + { + "type": { + "type": "array", + "items": { + "type": "record", + "fields": [ + { + "name": "#QualCLAlign.cwl/Bead_Version/Library", + "type": "string" + }, + { + "name": "#QualCLAlign.cwl/Bead_Version/bead_version", + "type": "string" + } + ] + } + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "$(JSON.parse(self[0].contents).BeadVersion)\n" + }, + "id": "#QualCLAlign.cwl/Bead_Version" + }, + { + "outputBinding": { + "glob": "*.failedReads.csv.gz" + }, + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#QualCLAlign.cwl/Failed_Reads_CSVs" + }, + { + "outputBinding": { + "glob": "fastq_read_pairs.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/Fastq_read_pairs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Fragments" + }, + { + "outputBinding": { + "glob": "*_ATAC_Fragments.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Fragments_Index" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "glob": "bead_version.json", + "loadContents": true, + "outputEval": "${\n var obj = JSON.parse(self[0].contents);\n var libraries = [];\n var beadLibs = obj.BeadVersion\n for (var i in beadLibs){\n if (libraries.indexOf(beadLibs[i][\"Library\"]) == -1){ \n libraries.push(beadLibs[i][\"Library\"]);\n }\n }\n libraries.sort();\n return libraries\n}\n" + }, + "id": "#QualCLAlign.cwl/Libraries" + }, + { + "outputBinding": { + "glob": "*logs.tar.gz" + }, + "type": { + "type": "array", + "items": "File" + }, + "id": "#QualCLAlign.cwl/Logs" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Peaks" + }, + { + "outputBinding": { + "glob": "*_ATAC_Peaks.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Peaks_Index" + }, + { + "outputBinding": { + "glob": "*ReadQualityMetrics.json" + }, + "type": "File", + "id": "#QualCLAlign.cwl/QualCLAlignMetrics" + }, + { + "type": { + "type": "array", + "items": "string" + }, + "outputBinding": { + "outputEval": "${ \n var reads = []; \n var files = inputs.Reads\n for (var i in files){\n reads.push(files[i][\"basename\"]);\n }\n reads.sort();\n return(reads)\n}\n" + }, + "id": "#QualCLAlign.cwl/ReadsList" + }, + { + "type": "string", + "outputBinding": { + "glob": "genome_size.json", + "loadContents": true, + "outputEval": "${ if (!self[0]) { return \"0\"; } return JSON.parse(self[0].contents); }" + }, + "id": "#QualCLAlign.cwl/Reference_Genome_Size" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz" + }, + "type": [ + "null", + "File" + ], + "secondaryFiles": [ + { + "pattern": ".tbi" + } + ], + "id": "#QualCLAlign.cwl/Transposase_Sites" + }, + { + "outputBinding": { + "glob": "*_ATAC_Transposase_Sites.bed.gz.tbi" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/Transposase_Sites_Index" + }, + { + "outputBinding": { + "glob": "*_ATAC_UnifiedMetrics.json" + }, + "type": [ + "null", + "File" + ], + "id": "#QualCLAlign.cwl/UnifiedMetrics" + } + ], + "id": "#QualCLAlign.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/igCalls", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_GatherIGCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Valid_Reads_Fastq_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_IG.cwl/Num_Valid_Reads_IG", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_valid_reads" + }, + { + "valueFrom": "BCR", + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_splits", + "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG/num_cores" + ], + "id": "#VDJ_Analyze_Reads_IG.cwl/VDJ_Preprocess_Reads_IG" + } + ], + "id": "#VDJ_Analyze_Reads_IG.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads" + }, + { + "type": "int", + "id": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/gatheredCalls" + } + ], + "requirements": [ + { + "class": "SubworkflowFeatureRequirement" + }, + { + "class": "InlineJavascriptRequirement" + } + ], + "steps": [ + { + "run": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/num_cores" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR" + }, + { + "run": "#VDJ_GatherCalls.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Version", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/VDJ_Version" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/tcrCalls", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/theCalls" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls/gatheredCalls" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_GatherTCRCalls" + }, + { + "run": "#VDJ_Preprocess_Reads.cwl", + "in": [ + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Maximum_Threads", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Maximum_Threads" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Valid_Reads_Fastq_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/Valid_Reads_Fastq" + }, + { + "source": "#VDJ_Analyze_Reads_TCR.cwl/Num_Valid_Reads_TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_valid_reads" + }, + { + "valueFrom": "TCR", + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/vdj_type" + } + ], + "out": [ + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/RSEC_Reads_Fastq", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_splits", + "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR/num_cores" + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl/VDJ_Preprocess_Reads_TCR" + } + ], + "id": "#VDJ_Analyze_Reads_TCR.cwl" + }, + { + "class": "CommandLineTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 1, + "ramMin": 1024 + } + ], + "baseCommand": [ + "AssembleAndAnnotate.sh" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/RSEC_Reads_Fastq" + }, + { + "type": "string", + "inputBinding": { + "position": 2 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/Read_Limit" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "position": 3 + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_pruned.csv.gz" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl/PyirCall" + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/igCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl/VDJ_Assemble_and_Annotate_Contigs_IG/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_IG.cwl" + }, + { + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ScatterFeatureRequirement" + }, + { + "class": "StepInputExpressionRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "inputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/num_cores" + } + ], + "outputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "outputSource": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/tcrCalls" + } + ], + "steps": [ + { + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR", + "run": "#VDJ_Assemble_and_Annotate_Contigs.cwl", + "in": [ + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/RSEC_Reads_Fastq", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + }, + { + "valueFrom": "75000", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/Read_Limit" + }, + { + "source": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Version", + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/VDJ_Version" + } + ], + "out": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/PyirCall" + ], + "scatter": [ + "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl/VDJ_Assemble_and_Annotate_Contigs_TCR/RSEC_Reads_Fastq" + ] + } + ], + "id": "#VDJ_Assemble_and_Annotate_Contigs_TCR.cwl" + }, + { + "class": "CommandLineTool", + "hints": [ + { + "class": "ResourceRequirement", + "ramMin": 32000 + } + ], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "baseCommand": [ + "mist_vdj_compile_results.py" + ], + "inputs": [ + { + "type": "File", + "inputBinding": { + "prefix": "--seq-metrics", + "position": 10 + }, + "id": "#VDJ_Compile_Results.cwl/Seq_Metrics" + }, + { + "type": [ + "null", + "string" + ], + "inputBinding": { + "prefix": "--vdj-version", + "position": 2 + }, + "id": "#VDJ_Compile_Results.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "inputBinding": { + "position": 0, + "itemSeparator": ",", + "prefix": "--cell-type-mapping-fp" + }, + "id": "#VDJ_Compile_Results.cwl/cellTypeMapping" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "string" + } + ], + "inputBinding": { + "position": 4, + "prefix": "--ignore", + "itemSeparator": "," + }, + "id": "#VDJ_Compile_Results.cwl/chainsToIgnore" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 8, + "prefix": "--e-value-for-j" + }, + "id": "#VDJ_Compile_Results.cwl/evalueJgene" + }, + { + "type": [ + "null", + "float" + ], + "inputBinding": { + "position": 7, + "prefix": "--e-value-for-v" + }, + "id": "#VDJ_Compile_Results.cwl/evalueVgene" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 5 + }, + "id": "#VDJ_Compile_Results.cwl/igCalls" + }, + { + "type": "File", + "inputBinding": { + "position": 9, + "prefix": "--metadata-fp" + }, + "id": "#VDJ_Compile_Results.cwl/metadata" + }, + { + "type": "File", + "inputBinding": { + "prefix": "--putative-cells-json-fp", + "position": 3 + }, + "id": "#VDJ_Compile_Results.cwl/putativeCells" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 6 + }, + "id": "#VDJ_Compile_Results.cwl/tcrCalls" + } + ], + "outputs": [ + { + "doc": "VDJ data per cell, with distribution based error correction", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatable" + }, + { + "doc": "VDJ data per cell, including non-putative cells, no error correction applied", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_perCell_uncorrected.csv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjCellsDatatableUncorrected" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_DBEC_images.tar.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDbecFilterImages" + }, + { + "doc": "AIRR compatible output that only reports the Dominant contigs, counts are DBEC corrected", + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_Dominant_Contigs_AIRR.tsv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjDominantContigsAIRR" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.csv" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsCsv" + }, + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_VDJ_metrics.json" + }, + "id": "#VDJ_Compile_Results.cwl/vdjMetricsJson" + }, + { + "type": [ + "null", + "File" + ], + "doc": "AIRR compatible output that reports all the congits, counts are not DBEC corrected", + "outputBinding": { + "glob": "*_VDJ_Unfiltered_Contigs_AIRR.tsv.gz" + }, + "id": "#VDJ_Compile_Results.cwl/vdjUnfilteredContigsAIRR" + } + ], + "id": "#VDJ_Compile_Results.cwl" + }, + { + "doc": "VDJ_GatherCalls collect the outputs from the multi-processed VDJ step into one file.\n", + "class": "Workflow", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_Version" + }, + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/theCalls" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputSource": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls", + "id": "#VDJ_GatherCalls.cwl/gatheredCalls" + } + ], + "steps": [ + { + "in": [ + { + "source": "#VDJ_GatherCalls.cwl/theCalls", + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/theCalls" + } + ], + "out": [ + "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/gatheredCalls" + ], + "run": { + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR", + "cwlVersion": "v1.2", + "class": "CommandLineTool", + "hints": [], + "requirements": [ + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "ShellCommandRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + { + "type": "array", + "items": "File" + } + ], + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/theCalls" + } + ], + "arguments": [ + { + "shellQuote": false, + "valueFrom": "${\n if (!inputs.theCalls[0] ) {\n return (\"echo \\\"No outputs from PyIR detected in VDJ_GatherCalls\\\"\")\n }\n var inputFiles = \"\"\n if (!inputs.theCalls[0].path.split(\"_PrunePyIR\")[1]){\n inputFiles = \"zcat\"\n for (var i = 0; i < inputs.theCalls.length; i++) {\n inputFiles += \" \" + inputs.theCalls[i].path\n }\n inputFiles += \" | \"\n } else {\n inputFiles = \"zcat \" + inputs.theCalls[0].path.split(\"VDJ\")[0] + \"*\" + inputs.theCalls[0].path.split(\"_PrunePyIR\")[1].split(\"_Number_\")[0] + \"_Number_*.csv.gz | \"\n }\n var outputFileName = \"\\\"gzip > \" + inputs.theCalls[0].nameroot.split(\"_Number_\")[0] + \"_constant_region_called_pruned.csv.gz\" + \"\\\"\"\n var awkCommand = \"awk \\'NR==1{F=$1;print | \" + outputFileName + \" } $1!=F { print | \" + outputFileName + \" }\\' \"\n var outputCommand = inputFiles + awkCommand\n return (outputCommand)\n}" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "*_constant_region_called_pruned.csv.gz", + "outputEval": "${\n if (self.size == 0) {\n throw(\"No outputs from PyIR detected in VDJ_GatherCalls!\");\n } else {\n return(self);\n }\n}" + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls/run/gather_PyIR/gatheredCalls" + } + ] + }, + "id": "#VDJ_GatherCalls.cwl/VDJ_GatherCalls" + } + ], + "id": "#VDJ_GatherCalls.cwl" + }, + { + "class": "Workflow", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads" + }, + { + "type": [ + "null", + "File" + ], + "id": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq" + }, + { + "type": "int", + "id": "#VDJ_Preprocess_Reads.cwl/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/vdj_type" + } + ], + "outputs": [ + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq", + "type": { + "type": "array", + "items": "File" + }, + "id": "#VDJ_Preprocess_Reads.cwl/RSEC_Reads_Fastq" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_cores" + }, + { + "outputSource": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/num_splits" + } + ], + "requirements": [ + { + "envDef": [ + { + "envValue": "8", + "envName": "CORES_ALLOCATED_PER_CWL_PROCESS" + } + ], + "class": "EnvVarRequirement" + }, + { + "class": "InlineJavascriptRequirement" + }, + { + "class": "SubworkflowFeatureRequirement" + } + ], + "steps": [ + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads", + "requirements": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_RSEC_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/Valid_Reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_splits" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/num_valid_reads" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_RSEC_Reads/RSEC_Reads_Fastq" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads", + "hints": [ + { + "class": "ResourceRequirement", + "coresMin": 8 + } + ], + "run": "#VDJ_Trim_Reads.cwl", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Valid_Reads_Fastq", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads_Fastq" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Valid_Reads", + "#VDJ_Preprocess_Reads.cwl/VDJ_Trim_Reads/Trim_Report" + ] + }, + { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits", + "in": [ + { + "source": "#VDJ_Preprocess_Reads.cwl/Maximum_Threads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/Maximum_Threads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/num_valid_reads", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_valid_reads" + }, + { + "source": "#VDJ_Preprocess_Reads.cwl/vdj_type", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/vdj_type" + } + ], + "out": [ + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_splits", + "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/num_cores" + ], + "run": { + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits", + "cwlVersion": "v1.2", + "class": "ExpressionTool", + "inputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/Maximum_Threads" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_valid_reads" + }, + { + "type": "string", + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/vdj_type" + } + ], + "outputs": [ + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_cores" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_Preprocess_Reads.cwl/VDJ_num_splits/run/determine_num_splits/num_splits" + } + ], + "expression": "${\n var num_splits = 64;\n var max_threads = parseInt(inputs.Maximum_Threads);\n if (!isNaN(max_threads)) {\n num_splits = parseInt(Math.max(max_threads, 8) * 0.7);\n }\n return ({\"num_splits\": num_splits, \"num_cores\": num_splits});\n}" + } + } + ], + "id": "#VDJ_Preprocess_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "mist_vdj_rsec_reads.py", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_RSEC_Reads.cwl/VDJ_Version" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "inputBinding": { + "prefix": "--vdj-valid-reads", + "itemSeparator": "," + }, + "id": "#VDJ_RSEC_Reads.cwl/Valid_Reads" + }, + { + "type": [ + "null", + "int" + ], + "inputBinding": { + "prefix": "--num-splits" + }, + "id": "#VDJ_RSEC_Reads.cwl/num_splits" + }, + { + "type": [ + "null", + "int" + ], + "id": "#VDJ_RSEC_Reads.cwl/num_valid_reads" + } + ], + "outputs": [ + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*RSEC_Reads_Fastq_*.tar.gz" + }, + "id": "#VDJ_RSEC_Reads.cwl/RSEC_Reads_Fastq" + } + ], + "id": "#VDJ_RSEC_Reads.cwl" + }, + { + "class": "ExpressionTool", + "requirements": [ + { + "class": "InlineJavascriptRequirement" + } + ], + "inputs": [ + { + "type": [ + "null", + "Any" + ], + "id": "#VDJ_Settings.cwl/_VDJ_Version" + } + ], + "outputs": [ + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_JGene_Evalue" + }, + { + "type": [ + "null", + "float" + ], + "id": "#VDJ_Settings.cwl/VDJ_VGene_Evalue" + }, + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Settings.cwl/VDJ_Version" + } + ], + "expression": "${\n var vdjVersion = null;\n if (!inputs._VDJ_Version) {\n vdjVersion = null;}\n else {\n var _VDJ_Version = inputs._VDJ_Version.toLowerCase();\n if (_VDJ_Version === \"human\" || _VDJ_Version === \"hs\" || _VDJ_Version === \"human vdj - bcr and tcr\") {\n vdjVersion = \"human\";\n } else if (_VDJ_Version === \"humanbcr\" || _VDJ_Version === \"human vdj - bcr only\") {\n vdjVersion = \"humanBCR\";\n } else if (_VDJ_Version === \"humantcr\" || _VDJ_Version === \"human vdj - tcr only\") {\n vdjVersion = \"humanTCR\";\n } else if (_VDJ_Version === \"mouse\" || _VDJ_Version === \"mm\" || _VDJ_Version === \"mouse vdj - bcr and tcr\") {\n vdjVersion = \"mouse\";\n } else if (_VDJ_Version === \"mousebcr\" || _VDJ_Version === \"mouse vdj - bcr only\") {\n vdjVersion = \"mouseBCR\";\n } else if (_VDJ_Version === \"mousetcr\" || _VDJ_Version === \"mouse vdj - tcr only\") {\n vdjVersion = \"mouseTCR\";\n } else {\n vdjVersion = inputs._VDJ_Version;\n }\n }\n\n return ({\n VDJ_Version: vdjVersion,\n })\n}", + "id": "#VDJ_Settings.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": "VDJ_Trim_Reads.sh", + "inputs": [ + { + "type": [ + "null", + "string" + ], + "id": "#VDJ_Trim_Reads.cwl/VDJ_Version" + }, + { + "type": [ + "null", + "File" + ], + "inputBinding": { + "position": 1 + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads_Fastq" + } + ], + "outputs": [ + { + "type": [ + "null", + "File" + ], + "outputBinding": { + "glob": "cutadapt.log" + }, + "id": "#VDJ_Trim_Reads.cwl/Trim_Report" + }, + { + "type": { + "type": "array", + "items": "File" + }, + "outputBinding": { + "glob": "*vdjtxt.gz" + }, + "id": "#VDJ_Trim_Reads.cwl/Valid_Reads" + } + ], + "id": "#VDJ_Trim_Reads.cwl" + }, + { + "hints": [], + "class": "CommandLineTool", + "baseCommand": [ + "mist_print_version.py" + ], + "stdout": "output.txt", + "inputs": [], + "outputs": [ + { + "type": "string", + "outputBinding": { + "glob": "output.txt", + "loadContents": true, + "outputEval": "$(self[0].contents)" + }, + "id": "#Version.cwl/version" + } + ], + "id": "#Version.cwl" + } + ], + "cwlVersion": "v1.2" +} diff --git a/target/nextflow/mapping/cellranger_atac_count/.config.vsh.yaml b/target/nextflow/mapping/cellranger_atac_count/.config.vsh.yaml new file mode 100644 index 00000000..bf2428a0 --- /dev/null +++ b/target/nextflow/mapping/cellranger_atac_count/.config.vsh.yaml @@ -0,0 +1,302 @@ +name: "cellranger_atac_count" +namespace: "mapping" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The fastq.gz files to align. Can also be a single directory containing\ + \ fastq.gz files." + info: null + example: + - "sample_S1_L001_R1_001.fastq.gz" + - "sample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The path to Cell Ranger reference tar.gz file. Can also be a directory." + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--description" + description: "Sample description to embed in output files" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cells" + description: "Define the top N barcodes with the most fragments overlapping peaks\ + \ as cells and override the cell calling algorithm. N must be a positive integer\ + \ <= 20,000. Use this option if the number of cells estimated by Cell Ranger\ + \ ATAC is not consistent with the barcode rank plot" + info: null + required: false + max: 20000 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--peaks" + description: "Override peak caller: specify peaks to use in downstream analyses\ + \ from supplied 3-column BED file. The supplied peaks file must be sorted by\ + \ position and not contain overlapping peaks; comment lines beginning with #\ + \ are allowed" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--dim_reduce" + description: "Dimensionality reduction mode for clustering" + info: null + default: + - "lsa" + required: false + choices: + - "lsa" + - "pca" + - "plsa" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--subsample_rate" + description: "Downsample to preserve this fraction of reads" + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--lanes" + description: "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex.\ + \ Use this if you have a sample sheet for an entire flow cell but only want\ + \ to generate a few lanes for further 10x Genomics analysis." + info: null + example: + - "1,3" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger ATAC count." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_atac_tiny_bcl" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_atac:2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update \\\n&& apt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_atac_count/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/cellranger_atac_count" + executable: "target/nextflow/mapping/cellranger_atac_count/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/cellranger_atac_count/main.nf b/target/nextflow/mapping/cellranger_atac_count/main.nf new file mode 100644 index 00000000..d61ca9c7 --- /dev/null +++ b/target/nextflow/mapping/cellranger_atac_count/main.nf @@ -0,0 +1,4020 @@ +// cellranger_atac_count main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_atac_count", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The fastq.gz files to align. Can also be a single directory containing fastq.gz files.", + "example" : [ + "sample_S1_L001_R1_001.fastq.gz", + "sample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "description" : "The path to Cell Ranger reference tar.gz file. Can also be a directory.", + "example" : [ + "reference.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The folder to store the alignment results.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--description", + "description" : "Sample description to embed in output files", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--force_cells", + "description" : "Define the top N barcodes with the most fragments overlapping peaks as cells and override the cell calling algorithm. N must be a positive integer <= 20,000. Use this option if the number of cells estimated by Cell Ranger ATAC is not consistent with the barcode rank plot", + "required" : false, + "max" : 20000, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--peaks", + "description" : "Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file. The supplied peaks file must be sorted by position and not contain overlapping peaks; comment lines beginning with # are allowed", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--dim_reduce", + "description" : "Dimensionality reduction mode for clustering", + "default" : [ + "lsa" + ], + "required" : false, + "choices" : [ + "lsa", + "pca", + "plsa" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--subsample_rate", + "description" : "Downsample to preserve this fraction of reads", + "example" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--lanes", + "description" : "bcl2fastq option. Semicolon-delimited series of lanes to demultiplex. Use this if you have a sample sheet for an entire flow cell but only want to generate a few lanes for further 10x Genomics analysis.", + "example" : [ + "1,3" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using Cell Ranger ATAC count.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_atac_tiny_bcl" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger_atac:2.1", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update \\\\\n&& apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/cellranger_atac_count/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/cellranger_atac_count", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "${VIASH_PAR_REFERENCE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reference='&'#" ; else echo "# par_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_DESCRIPTION+x} ]; then echo "${VIASH_PAR_DESCRIPTION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_description='&'#" ; else echo "# par_description="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_CELLS+x} ]; then echo "${VIASH_PAR_FORCE_CELLS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_force_cells='&'#" ; else echo "# par_force_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_PEAKS+x} ]; then echo "${VIASH_PAR_PEAKS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_peaks='&'#" ; else echo "# par_peaks="; fi ) +$( if [ ! -z ${VIASH_PAR_DIM_REDUCE+x} ]; then echo "${VIASH_PAR_DIM_REDUCE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_dim_reduce='&'#" ; else echo "# par_dim_reduce="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSAMPLE_RATE+x} ]; then echo "${VIASH_PAR_SUBSAMPLE_RATE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_subsample_rate='&'#" ; else echo "# par_subsample_rate="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# just to make sure paths are absolute +par_reference=\\`realpath \\$par_reference\\` +par_output=\\`realpath \\$par_output\\` + +echo "Creating temporary directory" +tmpdir=\\$(mktemp -d "\\$meta_temp_dir/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +echo "Locating fastqs" +fastq_dir="\\$tmpdir/fastqs" +mkdir -p "\\$fastq_dir" +IFS=";" +for var in \\$par_input; do + unset IFS + abs_path=\\`realpath \\$var\\` + if [ -d "\\$abs_path" ]; then + find "\\$abs_path" -name *.fastq.gz -exec ln -s {} "\\$fastq_dir" \\\\; + else + ln -s "\\$abs_path" "\\$fastq_dir" + fi +done + +echo "fastq_dir content: \\$(ls \\$fastq_dir)" + +echo "Processing reference" +# process reference +if file \\$par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="\\$tmpdir/fastqs" + mkdir -p "\\$reference_dir" + tar -xvf "\\$par_reference" -C "\\$reference_dir" --strip-components=1 + par_reference="\\$reference_dir" +fi + +# cd into tempdir +cd "\\$tmpdir" + +if [ ! -z "\\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\\`python -c "print(int('\\$meta_memory_gb') - 2)"\\` +fi + +echo "Running cellranger-atac count" + +id=myoutput +cellranger-atac count \\\\ + --id "\\$id" \\\\ + --fastqs "\\$fastq_dir" \\\\ + --reference "\\$par_reference" \\\\ + --dim-reduce "\\$par_dim_reduce" \\\\ + --description "\\$par_description" \\\\ + \\${par_lanes:+--lanes=\\${par_lanes[*]}} \\\\ + \\${par_force_cells:+--force-cells=\\$par_force_cells} \\\\ + \\${par_subsample_rate:+--subsample-rate=\\$par_subsample_rate} \\\\ + \\${memory_gb:+--localmem=\\$memory_gb} \\\\ + \\${meta_cpus:+--localcores=\\$meta_cpus} \\\\ + \\${par_lanes:+--lanes=\\${par_lanes[*]}} + +echo "Copying output" +if [ -d "\\$id/outs/" ]; then + if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" + fi + mv "\\$id/outs/"* "\\$par_output" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/cellranger_atac_count", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/cellranger_atac_count/nextflow.config b/target/nextflow/mapping/cellranger_atac_count/nextflow.config new file mode 100644 index 00000000..c4415480 --- /dev/null +++ b/target/nextflow/mapping/cellranger_atac_count/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/cellranger_atac_count' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using Cell Ranger ATAC count.' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/cellranger_atac_count/nextflow_labels.config b/target/nextflow/mapping/cellranger_atac_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_atac_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/cellranger_atac_count/nextflow_schema.json b/target/nextflow/mapping/cellranger_atac_count/nextflow_schema.json new file mode 100644 index 00000000..7bf25c42 --- /dev/null +++ b/target/nextflow/mapping/cellranger_atac_count/nextflow_schema.json @@ -0,0 +1,120 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_atac_count", + "description": "Align fastq files using Cell Ranger ATAC count.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The fastq.gz files to align", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_S1_L001_R1_001.fastq.gz\";\"sample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "The path to Cell Ranger reference tar.gz file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.tar.gz\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The folder to store the alignment results.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "$id.$key.output" + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "description": { + "type": "string", + "description": "Sample description to embed in output files", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "force_cells": { + "type": "integer", + "description": "Define the top N barcodes with the most fragments overlapping peaks as cells and override the cell calling algorithm", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "peaks": { + "type": "string", + "format": "path", + "description": "Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "dim_reduce": { + "type": "string", + "description": "Dimensionality reduction mode for clustering", + "help_text": "Type: `string`, multiple: `False`, default: `\"lsa\"`, choices: ``lsa`, `pca`, `plsa``. ", + "enum": [ + "lsa", + "pca", + "plsa" + ], + "default": "lsa" + }, + "subsample_rate": { + "type": "number", + "description": "Downsample to preserve this fraction of reads", + "help_text": "Type: `double`, multiple: `False`, example: `0.1`. " + }, + "lanes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "bcl2fastq option", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1,3\"]`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/cellranger_count/.config.vsh.yaml b/target/nextflow/mapping/cellranger_count/.config.vsh.yaml new file mode 100644 index 00000000..305ffc6d --- /dev/null +++ b/target/nextflow/mapping/cellranger_count/.config.vsh.yaml @@ -0,0 +1,446 @@ +name: "cellranger_count" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The fastq.gz files to align. Can also be a single directory containing\ + \ fastq.gz files." + info: null + example: + - "sample_S1_L001_R1_001.fastq.gz" + - "sample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The path to Cell Ranger reference tar.gz file. Can also be a directory." + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--feature_reference" + description: "Feature reference CSV file, declaring Feature Barcode constructs\ + \ and associated barcodes\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "integer" + name: "--expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm." + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--force_cells" + description: "Force pipeline to use this number of cells, bypassing cell calling\ + \ algorithm." + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chemistry" + description: "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single\ + \ Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1 \n NOTE:\ + \ this mode cannot be auto-detected. It must be set explicitly with this option.\n\ + - SC3Pv2: Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv4: Single Cell\ + \ 3' v4\n- SC3Pv3LT: Single Cell 3' v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n\ + - SC5P-R2-v3: Single Cell 5', paired-end/R2-only\n- SC5P-PE-v3: Single Cell\ + \ 5' paired-end v3 (GEM-X)\n- SC5P-PE: Single Cell 5' paired-end\n- SC5P-R2:\ + \ Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2 or 5'\n-\ + \ ARC-v1: for analyzing the Gene Expression portion of Multiome data. \n NOTE:\ + \ when the pipeline auto-detects ARC-v1 chemistry, an error is triggered.\n\ + See https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv4" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC5P-PE-v3" + - "SC5P-PE" + - "SC5P-R2" + - "SC5P-R2-v3" + - "SC-FB" + - "ARC-v1" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--tenx_cloud_token_path" + description: "The 10x Cloud Analysis user token used to enable cell annotation." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_annotation_model" + description: "\"Cell annotation model to use. If auto, uses the default model\ + \ for the species.\nIf not given, does not run cell annotation.\"\n" + info: null + required: false + choices: + - "auto" + - "human_pca_v1_beta" + - "mouse_pca_v1_beta" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--include_introns" + description: "Include intronic reads in count." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--r1_length" + description: "Hard trim the input Read 1 to this length before analysis" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--r2_length" + description: "Hard trim the input Read 2 to this length before analysis" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--lanes" + description: "Only use FASTQs from selected lanes." + info: null + example: + - 1 + - 2 + - 3 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--library_compatibility_check" + description: "Whether to check for barcode compatibility between libraries.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_crispr_umi" + description: "Set the minimum number of CRISPR guide RNA UMIs required for protospacer\ + \ detection.\nIf a lower or higher sensitivity is desired for detection, this\ + \ value can be customized\naccording to specific experimental needs. Applicable\ + \ only to datasets that include a\nCRISPR Guide Capture library.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger count." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +- type: "file" + path: "setup_logger.py" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_count/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/cellranger_count" + executable: "target/nextflow/mapping/cellranger_count/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/cellranger_count/main.nf b/target/nextflow/mapping/cellranger_count/main.nf new file mode 100644 index 00000000..a2ffcbfd --- /dev/null +++ b/target/nextflow/mapping/cellranger_count/main.nf @@ -0,0 +1,4207 @@ +// cellranger_count main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Samuel D'Souza (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_count", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Samuel D'Souza", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "srdsam", + "linkedin" : "samuel-d-souza-887023150/" + }, + "organizations" : [ + { + "name" : "Chan Zuckerberg Biohub", + "href" : "https://www.czbiohub.org", + "role" : "Data Engineer" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The fastq.gz files to align. Can also be a single directory containing fastq.gz files.", + "example" : [ + "sample_S1_L001_R1_001.fastq.gz", + "sample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "description" : "The path to Cell Ranger reference tar.gz file. Can also be a directory.", + "example" : [ + "reference.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--feature_reference", + "description" : "Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The folder to store the alignment results.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--force_cells", + "description" : "Force pipeline to use this number of cells, bypassing cell calling algorithm.", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--chemistry", + "description" : "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1 \n NOTE: this mode cannot be auto-detected. It must be set explicitly with this option.\n- SC3Pv2: Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv4: Single Cell 3' v4\n- SC3Pv3LT: Single Cell 3' v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n- SC5P-R2-v3: Single Cell 5', paired-end/R2-only\n- SC5P-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n- SC5P-PE: Single Cell 5' paired-end\n- SC5P-R2: Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2 or 5'\n- ARC-v1: for analyzing the Gene Expression portion of Multiome data. \n NOTE: when the pipeline auto-detects ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information.\n", + "default" : [ + "auto" + ], + "required" : false, + "choices" : [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv4", + "SC3Pv3LT", + "SC3Pv3HT", + "SC5P-PE-v3", + "SC5P-PE", + "SC5P-R2", + "SC5P-R2-v3", + "SC-FB", + "ARC-v1" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--tenx_cloud_token_path", + "description" : "The 10x Cloud Analysis user token used to enable cell annotation.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_annotation_model", + "description" : "\\"Cell annotation model to use. If auto, uses the default model for the species.\nIf not given, does not run cell annotation.\\"\n", + "required" : false, + "choices" : [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--secondary_analysis", + "description" : "Whether or not to run the secondary analysis e.g. clustering.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--generate_bam", + "description" : "Whether to generate a BAM file.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--include_introns", + "description" : "Include intronic reads in count.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--r1_length", + "description" : "Hard trim the input Read 1 to this length before analysis", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--r2_length", + "description" : "Hard trim the input Read 2 to this length before analysis", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--lanes", + "description" : "Only use FASTQs from selected lanes.", + "example" : [ + 1, + 2, + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--library_compatibility_check", + "description" : "Whether to check for barcode compatibility between libraries.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_crispr_umi", + "description" : "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs. Applicable only to datasets that include a\nCRISPR Guide Capture library.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using Cell Ranger count.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger:9.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/cellranger_count/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/cellranger_count", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "${VIASH_PAR_REFERENCE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reference='&'#" ; else echo "# par_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_FEATURE_REFERENCE+x} ]; then echo "${VIASH_PAR_FEATURE_REFERENCE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_feature_reference='&'#" ; else echo "# par_feature_reference="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_EXPECT_CELLS+x} ]; then echo "${VIASH_PAR_EXPECT_CELLS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_expect_cells='&'#" ; else echo "# par_expect_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_FORCE_CELLS+x} ]; then echo "${VIASH_PAR_FORCE_CELLS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_force_cells='&'#" ; else echo "# par_force_cells="; fi ) +$( if [ ! -z ${VIASH_PAR_CHEMISTRY+x} ]; then echo "${VIASH_PAR_CHEMISTRY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_chemistry='&'#" ; else echo "# par_chemistry="; fi ) +$( if [ ! -z ${VIASH_PAR_TENX_CLOUD_TOKEN_PATH+x} ]; then echo "${VIASH_PAR_TENX_CLOUD_TOKEN_PATH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_tenx_cloud_token_path='&'#" ; else echo "# par_tenx_cloud_token_path="; fi ) +$( if [ ! -z ${VIASH_PAR_CELL_ANNOTATION_MODEL+x} ]; then echo "${VIASH_PAR_CELL_ANNOTATION_MODEL}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cell_annotation_model='&'#" ; else echo "# par_cell_annotation_model="; fi ) +$( if [ ! -z ${VIASH_PAR_SECONDARY_ANALYSIS+x} ]; then echo "${VIASH_PAR_SECONDARY_ANALYSIS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_secondary_analysis='&'#" ; else echo "# par_secondary_analysis="; fi ) +$( if [ ! -z ${VIASH_PAR_GENERATE_BAM+x} ]; then echo "${VIASH_PAR_GENERATE_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_generate_bam='&'#" ; else echo "# par_generate_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_INCLUDE_INTRONS+x} ]; then echo "${VIASH_PAR_INCLUDE_INTRONS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_include_introns='&'#" ; else echo "# par_include_introns="; fi ) +$( if [ ! -z ${VIASH_PAR_R1_LENGTH+x} ]; then echo "${VIASH_PAR_R1_LENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_r1_length='&'#" ; else echo "# par_r1_length="; fi ) +$( if [ ! -z ${VIASH_PAR_R2_LENGTH+x} ]; then echo "${VIASH_PAR_R2_LENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_r2_length='&'#" ; else echo "# par_r2_length="; fi ) +$( if [ ! -z ${VIASH_PAR_LANES+x} ]; then echo "${VIASH_PAR_LANES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_lanes='&'#" ; else echo "# par_lanes="; fi ) +$( if [ ! -z ${VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK+x} ]; then echo "${VIASH_PAR_LIBRARY_COMPATIBILITY_CHECK}" | sed "s#'#'\\"'\\"'#g;s#.*#par_library_compatibility_check='&'#" ; else echo "# par_library_compatibility_check="; fi ) +$( if [ ! -z ${VIASH_PAR_MIN_CRISPR_UMI+x} ]; then echo "${VIASH_PAR_MIN_CRISPR_UMI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_min_crispr_umi='&'#" ; else echo "# par_min_crispr_umi="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# just to make sure paths are absolute +par_reference=\\`realpath \\$par_reference\\` +par_output=\\`realpath \\$par_output\\` + +# create temporary directory +tmpdir=\\$(mktemp -d "\\$meta_temp_dir/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# process inputs +# for every fastq file found, make a symlink into the tempdir +fastq_dir="\\$tmpdir/fastqs" +mkdir -p "\\$fastq_dir" +IFS=";" +for var in \\$par_input; do + unset IFS + abs_path=\\`realpath \\$var\\` + if [ -d "\\$abs_path" ]; then + find "\\$abs_path" -name *.fastq.gz -exec ln -s {} "\\$fastq_dir" \\\\; + else + ln -s "\\$abs_path" "\\$fastq_dir" + fi +done + +# process reference +if file \\$par_reference | grep -q 'gzip compressed data'; then + echo "Untarring genome" + reference_dir="\\$tmpdir/fastqs" + mkdir -p "\\$reference_dir" + tar -xvf "\\$par_reference" -C "\\$reference_dir" --strip-components=1 + par_reference="\\$reference_dir" +fi + +# cd into tempdir +cd "\\$tmpdir" + +no_secondary_analysis="" +if [ "\\$par_secondary_analysis" == "false" ]; then + no_secondary_analysis="true" +fi + +IFS="," +id=myoutput +cellranger count \\\\ + --id="\\$id" \\\\ + --fastqs="\\$fastq_dir" \\\\ + --transcriptome="\\$par_reference" \\\\ + --include-introns="\\$par_include_introns" \\\\ + \\${par_feature_reference:+--feature-ref=\\$par_feature_reference} \\\\ + \\${par_cell_annotation_model:+--cell-annotation-model=\\$par_cell_annotation_model} \\\\ + \\${par_tenx_cloud_token_path:+--tenx-cloud-token-path=\\$par_tenx_cloud_token_path} \\\\ + \\${meta_cpus:+--localcores=\\$meta_cpus} \\\\ + \\${meta_memory_gb:+--localmem=\\$((meta_memory_gb-2))} \\\\ + \\${par_expect_cells:+--expect-cells=\\$par_expect_cells} \\\\ + \\${par_force_cells:+--force-cells=\\$par_force_cells} \\\\ + \\${par_chemistry:+--chemistry="\\$par_chemistry"} \\\\ + \\${par_generate_bam:+--create-bam=\\$par_generate_bam} \\\\ + \\${no_secondary_analysis:+--nosecondary} \\\\ + \\${par_r1_length:+--r1-length=\\$par_r1_length} \\\\ + \\${par_r2_length:+--r2-length=\\$par_r2_length} \\\\ + \\${par_lanes:+--lanes=\\${par_lanes[*]}} \\\\ + \\${par_library_compatibility_check:+--check-library-compatibility=\\$par_library_compatibility_check}\\\\ + --disable-ui +unset IFS + +echo "Copying output" +if [ -d "\\$id/outs/" ]; then + if [ ! -d "\\$par_output" ]; then + mkdir -p "\\$par_output" + fi + mv "\\$id/outs/"* "\\$par_output" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/cellranger_count", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/cellranger_count/nextflow.config b/target/nextflow/mapping/cellranger_count/nextflow.config new file mode 100644 index 00000000..b1a9b6fd --- /dev/null +++ b/target/nextflow/mapping/cellranger_count/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/cellranger_count' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using Cell Ranger count.' + author = 'Angela Oliveira Pisco, Samuel D\'Souza, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/cellranger_count/nextflow_labels.config b/target/nextflow/mapping/cellranger_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/cellranger_count/nextflow_schema.json b/target/nextflow/mapping/cellranger_count/nextflow_schema.json new file mode 100644 index 00000000..18665c58 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count/nextflow_schema.json @@ -0,0 +1,181 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_count", + "description": "Align fastq files using Cell Ranger count.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The fastq.gz files to align", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_S1_L001_R1_001.fastq.gz\";\"sample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "The path to Cell Ranger reference tar.gz file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.tar.gz\"`. " + }, + "feature_reference": { + "type": "string", + "format": "path", + "description": "Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The folder to store the alignment results.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "$id.$key.output" + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "expect_cells": { + "type": "integer", + "description": "Expected number of recovered cells, used as input to cell calling algorithm.", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "force_cells": { + "type": "integer", + "description": "Force pipeline to use this number of cells, bypassing cell calling algorithm.", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "chemistry": { + "type": "string", + "description": "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1 \n NOTE: this mode cannot be auto-detected", + "help_text": "Type: `string`, multiple: `False`, default: `\"auto\"`, choices: ``auto`, `threeprime`, `fiveprime`, `SC3Pv1`, `SC3Pv2`, `SC3Pv3`, `SC3Pv4`, `SC3Pv3LT`, `SC3Pv3HT`, `SC5P-PE-v3`, `SC5P-PE`, `SC5P-R2`, `SC5P-R2-v3`, `SC-FB`, `ARC-v1``. ", + "enum": [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv4", + "SC3Pv3LT", + "SC3Pv3HT", + "SC5P-PE-v3", + "SC5P-PE", + "SC5P-R2", + "SC5P-R2-v3", + "SC-FB", + "ARC-v1" + ], + "default": "auto" + }, + "tenx_cloud_token_path": { + "type": "string", + "format": "path", + "description": "The 10x Cloud Analysis user token used to enable cell annotation.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "cell_annotation_model": { + "type": "string", + "description": "\"Cell annotation model to use", + "help_text": "Type: `string`, multiple: `False`, choices: ``auto`, `human_pca_v1_beta`, `mouse_pca_v1_beta``. ", + "enum": [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ] + }, + "secondary_analysis": { + "type": "boolean", + "description": "Whether or not to run the secondary analysis e.g", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "generate_bam": { + "type": "boolean", + "description": "Whether to generate a BAM file.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "include_introns": { + "type": "boolean", + "description": "Include intronic reads in count.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "r1_length": { + "type": "integer", + "description": "Hard trim the input Read 1 to this length before analysis", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "r2_length": { + "type": "integer", + "description": "Hard trim the input Read 2 to this length before analysis", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "lanes": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Only use FASTQs from selected lanes.", + "help_text": "Type: `integer`, multiple: `True`, example: `[1;2;3]`. " + }, + "library_compatibility_check": { + "type": "boolean", + "description": "Whether to check for barcode compatibility between libraries.\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "min_crispr_umi": { + "type": "integer", + "description": "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/cellranger_count_split/.config.vsh.yaml b/target/nextflow/mapping/cellranger_count_split/.config.vsh.yaml new file mode 100644 index 00000000..76c37418 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count_split/.config.vsh.yaml @@ -0,0 +1,290 @@ +name: "cellranger_count_split" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Output directory from a Cell Ranger count run." + info: null + example: + - "input_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--filtered_h5" + description: "Output path for the h5 file storing the filtered counts.\n" + info: null + example: + - "filtered_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--metrics_summary" + description: "Where to store the 'metrics_summary' CSV file.\n" + info: null + example: + - "metrics_summary.csv" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--molecule_info" + description: "Where to store Cell Ranger's 'molecule_info.h5' file.\n" + info: null + example: + - "molecule_info.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bam" + description: "Location of output BAM files.\n" + info: null + example: + - "possorted_genome_bam.bam" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--bai" + description: "Where to store the BAM index files.\n" + info: null + example: + - "possorted_genome_bam.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--raw_h5" + description: "Output path for the h5 file storing the raw counts.\n" + info: null + example: + - "raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Split 10x Cell Ranger output directory into separate output fields." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:jammy" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "apt update && apt upgrade -y" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_count_split/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/cellranger_count_split" + executable: "target/nextflow/mapping/cellranger_count_split/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/cellranger_count_split/main.nf b/target/nextflow/mapping/cellranger_count_split/main.nf new file mode 100644 index 00000000..462c1476 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count_split/main.nf @@ -0,0 +1,3979 @@ +// cellranger_count_split main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Samuel D'Souza (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_count_split", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Samuel D'Souza", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "srdsam", + "linkedin" : "samuel-d-souza-887023150/" + }, + "organizations" : [ + { + "name" : "Chan Zuckerberg Biohub", + "href" : "https://www.czbiohub.org", + "role" : "Data Engineer" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Output directory from a Cell Ranger count run.", + "example" : [ + "input_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--filtered_h5", + "description" : "Output path for the h5 file storing the filtered counts.\n", + "example" : [ + "filtered_feature_bc_matrix.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--metrics_summary", + "description" : "Where to store the 'metrics_summary' CSV file.\n", + "example" : [ + "metrics_summary.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--molecule_info", + "description" : "Where to store Cell Ranger's 'molecule_info.h5' file.\n", + "example" : [ + "molecule_info.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bam", + "description" : "Location of output BAM files.\n", + "example" : [ + "possorted_genome_bam.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--bai", + "description" : "Where to store the BAM index files.\n", + "example" : [ + "possorted_genome_bam.bam.bai" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--raw_h5", + "description" : "Output path for the h5 file storing the raw counts.\n", + "example" : [ + "raw_feature_bc_matrix.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Split 10x Cell Ranger output directory into separate output fields.", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:jammy", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "apt update && apt upgrade -y" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/cellranger_count_split/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/cellranger_count_split", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_FILTERED_H5+x} ]; then echo "${VIASH_PAR_FILTERED_H5}" | sed "s#'#'\\"'\\"'#g;s#.*#par_filtered_h5='&'#" ; else echo "# par_filtered_h5="; fi ) +$( if [ ! -z ${VIASH_PAR_METRICS_SUMMARY+x} ]; then echo "${VIASH_PAR_METRICS_SUMMARY}" | sed "s#'#'\\"'\\"'#g;s#.*#par_metrics_summary='&'#" ; else echo "# par_metrics_summary="; fi ) +$( if [ ! -z ${VIASH_PAR_MOLECULE_INFO+x} ]; then echo "${VIASH_PAR_MOLECULE_INFO}" | sed "s#'#'\\"'\\"'#g;s#.*#par_molecule_info='&'#" ; else echo "# par_molecule_info="; fi ) +$( if [ ! -z ${VIASH_PAR_BAM+x} ]; then echo "${VIASH_PAR_BAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bam='&'#" ; else echo "# par_bam="; fi ) +$( if [ ! -z ${VIASH_PAR_BAI+x} ]; then echo "${VIASH_PAR_BAI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_bai='&'#" ; else echo "# par_bai="; fi ) +$( if [ ! -z ${VIASH_PAR_RAW_H5+x} ]; then echo "${VIASH_PAR_RAW_H5}" | sed "s#'#'\\"'\\"'#g;s#.*#par_raw_h5='&'#" ; else echo "# par_raw_h5="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +filtered_h5="\\$par_input/filtered_feature_bc_matrix.h5" +if [ -f "\\$filtered_h5" ] && [ ! -z "\\$par_filtered_h5" ]; then + echo "+ cp \\$filtered_h5 \\$par_filtered_h5" + cp "\\$filtered_h5" "\\$par_filtered_h5" +fi + +metrics_summary="\\$par_input/metrics_summary.csv" +if [ -f "\\$metrics_summary" ] && [ ! -z "\\$par_metrics_summary" ]; then + echo "+ cp \\$metrics_summary \\$par_metrics_summary" + cp "\\$metrics_summary" "\\$par_metrics_summary" +fi + +molecule_info="\\$par_input/molecule_info.h5" +if [ -f "\\$molecule_info" ] && [ ! -z "\\$par_molecule_info" ]; then + echo "+ cp \\$molecule_info \\$par_molecule_info" + cp "\\$molecule_info" "\\$par_molecule_info" +fi + +bam="\\$par_input/possorted_genome_bam.bam" +if [ -f "\\$bam" ] && [ ! -z "\\$par_bam" ]; then + echo "cp \\$bam \\$par_bam" + cp "\\$bam" "\\$par_bam" +fi + +raw_h5="\\$par_input/raw_feature_bc_matrix.h5" +if [ -f "\\$raw_h5" ] && [ ! -z "\\$par_raw_h5" ]; then + echo "+ cp \\$raw_h5 \\$par_raw_h5" + cp "\\$raw_h5" "\\$par_raw_h5" +fi + +bai="\\$par_input/possorted_genome_bam.bam.bai" +if [ -f "\\$bai" ] && [ ! -z "\\$par_bai" ]; then + echo "+ cp \\$bai \\$par_bai" + cp "\\$bai" "\\$par_bai" +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/cellranger_count_split", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/cellranger_count_split/nextflow.config b/target/nextflow/mapping/cellranger_count_split/nextflow.config new file mode 100644 index 00000000..0bd357ac --- /dev/null +++ b/target/nextflow/mapping/cellranger_count_split/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/cellranger_count_split' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Split 10x Cell Ranger output directory into separate output fields.' + author = 'Angela Oliveira Pisco, Samuel D\'Souza, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/cellranger_count_split/nextflow_labels.config b/target/nextflow/mapping/cellranger_count_split/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count_split/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/cellranger_count_split/nextflow_schema.json b/target/nextflow/mapping/cellranger_count_split/nextflow_schema.json new file mode 100644 index 00000000..eec61b34 --- /dev/null +++ b/target/nextflow/mapping/cellranger_count_split/nextflow_schema.json @@ -0,0 +1,84 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_count_split", + "description": "Split 10x Cell Ranger output directory into separate output fields.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Output directory from a Cell Ranger count run.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input_dir\"`. " + }, + "filtered_h5": { + "type": "string", + "format": "path", + "description": "Output path for the h5 file storing the filtered counts.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.filtered_h5.h5\"`, direction: `output`, example: `\"filtered_feature_bc_matrix.h5\"`. ", + "default": "$id.$key.filtered_h5.h5" + }, + "metrics_summary": { + "type": "string", + "format": "path", + "description": "Where to store the 'metrics_summary' CSV file.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.metrics_summary.csv\"`, direction: `output`, example: `\"metrics_summary.csv\"`. ", + "default": "$id.$key.metrics_summary.csv" + }, + "molecule_info": { + "type": "string", + "format": "path", + "description": "Where to store Cell Ranger's 'molecule_info.h5' file.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.molecule_info.h5\"`, direction: `output`, example: `\"molecule_info.h5\"`. ", + "default": "$id.$key.molecule_info.h5" + }, + "bam": { + "type": "string", + "format": "path", + "description": "Location of output BAM files.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.bam.bam\"`, direction: `output`, example: `\"possorted_genome_bam.bam\"`. ", + "default": "$id.$key.bam.bam" + }, + "bai": { + "type": "string", + "format": "path", + "description": "Where to store the BAM index files.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.bai.bai\"`, direction: `output`, example: `\"possorted_genome_bam.bam.bai\"`. ", + "default": "$id.$key.bai.bai" + }, + "raw_h5": { + "type": "string", + "format": "path", + "description": "Output path for the h5 file storing the raw counts.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.raw_h5.h5\"`, direction: `output`, example: `\"raw_feature_bc_matrix.h5\"`. ", + "default": "$id.$key.raw_h5.h5" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/cellranger_multi/.config.vsh.yaml b/target/nextflow/mapping/cellranger_multi/.config.vsh.yaml new file mode 100644 index 00000000..8715e1d6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/.config.vsh.yaml @@ -0,0 +1,1028 @@ +name: "cellranger_multi" +namespace: "mapping" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input files" + arguments: + - type: "file" + name: "--input" + description: "The FASTQ files to be analyzed. FASTQ files should conform to the\ + \ naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane\ + \ Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature type-specific input files" + description: "Helper functionality to allow feature type-specific input files, without\ + \ the need to specify\nlibrary_type or library_id. The library_id will be inferred\ + \ from the input paths.\n" + arguments: + - type: "file" + name: "--gex_input" + description: "The FASTQ files to be analyzed for Gene Expression. FASTQ files\ + \ should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abc_input" + description: "The FASTQ files to be analyzed for Antibody Capture. FASTQ files\ + \ should conform to \nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--cgc_input" + description: "The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--mux_input" + description: "The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_input" + description: "The FASTQ files to be analyzed for VDJ. FASTQ files should conform\ + \ to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_input" + description: "The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform\ + \ to the naming\nconventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_gd_input" + description: "The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should\ + \ conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_b_input" + description: "The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform\ + \ to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--agc_input" + description: "The FASTQ files to be analyzed for Antigen Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Library arguments" + arguments: + - type: "string" + name: "--library_id" + description: "The Illumina sample name to analyze. This must exactly match the\ + \ 'Sample Name'part\nof the FASTQ files specified in the `--input` argument.\n" + info: null + example: + - "mysample1" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_type" + description: "The underlying feature type of the library.\n" + info: null + example: + - "Gene Expression" + required: false + choices: + - "Gene Expression" + - "VDJ" + - "VDJ-T" + - "VDJ-B" + - "VDJ-T-GD" + - "Antibody Capture" + - "CRISPR Guide Capture" + - "Multiplexing Capture" + - "Antigen Capture" + - "Custom" + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_subsample" + description: "The rate at which reads from the provided FASTQ files are sampled.\n\ + Must be strictly greater than 0 and less than or equal to 1.\n" + info: null + example: + - "0.5" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_lanes" + description: "Lanes associated with this sample. Defaults to using all lanes." + info: null + example: + - "1-4" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_chemistry" + description: "Only applicable to FRP. Library-specific assay configuration. By\ + \ default,\nthe assay configuration is detected automatically. Typically, users\ + \ will\nnot need to specify a chemistry.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Sample parameters" + arguments: + - type: "string" + name: "--sample_ids" + alternatives: + - "--cell_multiplex_sample_id" + description: "A name to identify a multiplexed sample. Must be alphanumeric with\ + \ hyphens and/or underscores,\nand less than 64 characters. Required for Cell\ + \ Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sample_description" + alternatives: + - "--cell_multiplex_description" + description: "A description for the sample." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature Barcode library specific arguments" + arguments: + - type: "file" + name: "--feature_reference" + description: "Path to the Feature reference CSV file, declaring Feature Barcode\ + \ constructs and associated barcodes.\nRequired only for Antibody Capture or\ + \ CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref\ + \ for more information.\"\n" + info: null + example: + - "feature_reference.csv" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_crispr_umi" + description: "Set the minimum number of CRISPR guide RNA UMIs required for protospacer\ + \ detection.\nIf a lower or higher sensitivity is desired for detection, this\ + \ value can be customized\naccording to specific experimental needs. Applicable\ + \ only to datasets that include a\nCRISPR Guide Capture library.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Gene expression arguments" + description: "Arguments relevant to the analysis of gene expression data." + arguments: + - type: "file" + name: "--gex_reference" + description: "Genome refence index built by Cell Ranger mkref." + info: null + example: + - "reference_genome.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--tenx_cloud_token_path" + description: "The 10x Cloud Analysis user token used to enable cell annotation." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_annotation_model" + description: "\"Cell annotation model to use. If auto, uses the default model\ + \ for the species.\nIf not given, does not run cell annotation.\"\n" + info: null + required: false + choices: + - "auto" + - "human_pca_v1_beta" + - "mouse_pca_v1_beta" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_include_introns" + description: "Whether or not to include intronic reads in counts.\nThis option\ + \ does not apply to Fixed RNA Profiling analysis.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gex_chemistry" + description: "Assay configuration. Either specify a single value which will be\ + \ applied to all libraries,\nor a number of values that is equal to the number\ + \ of libararies. The latter is only applicable\nto only applicable to Fixed\ + \ RNA Profiling.\n - auto: Chemistry autodetection (default)\n - threeprime:\ + \ Single Cell 3'\n - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single\ + \ Cell 3' v1, v2, v3, or v4\n - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT\n\ + \ - SC-FB: Single Cell Antibody-only 3' v2 or 5'\n - fiveprime: Single Cell\ + \ 5'\n - SC5P-PE: Paired-end Single Cell 5'\n - SC5P-PE-v3: Paired-end Single\ + \ Cell 5' v3\n - SC5P-R2: R2-only Single Cell 5'\n - SC5P-R2-v3: R2-only Single\ + \ Cell 5' v3\n - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n - SC5PHT\ + \ : Single Cell 5' v2 HT\n - SFRP: Fixed RNA Profiling (Singleplex)\n - MFRP:\ + \ Fixed RNA Profiling (Multiplex, Probe Barcode on R2)\n - MFRP-R1: Fixed RNA\ + \ Profiling (Multiplex, Probe Barcode on R1)\n - MFRP-RNA: Fixed RNA Profiling\ + \ (Multiplex, RNA, Probe Barcode on R2)\n - MFRP-Ab: Fixed RNA Profiling (Multiplex,\ + \ Antibody, Probe Barcode at R2:69)\n - MFRP-Ab-R2pos50: Fixed RNA Profiling\ + \ (Multiplex, Antibody, Probe Barcode at R2:50)\n - MFRP-RNA-R1: Fixed RNA\ + \ Profiling (Multiplex, RNA, Probe Barcode on R1)\n - MFRP-Ab-R1: Fixed RNA\ + \ Profiling (Multiplex, Antibody, Probe Barcode on R1)\n - ARC-v1 for analyzing\ + \ the Gene Expression portion of Multiome data. If Cell Ranger auto-detects\ + \ ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv3-polyA" + - "SC3Pv4" + - "SC3Pv4-polyA" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC3Pv3HT-polyA" + - "SC5P-PE" + - "SC5P-PE-v3" + - "SC5P-R2" + - "SC-FB" + - "SC5P-R2-v3" + - "SCP5-PE-v3" + - "SC5PHT" + - "MFRP" + - "MFRP-R1" + - "MFRP-RNA" + - "MFRP-Ab" + - "SFRP" + - "MFRP-Ab-R2pos50" + - "MFRP-RNA-R1" + - "MFRP-Ab-R1" + - "ARC-v1" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "VDJ related parameters" + arguments: + - type: "file" + name: "--vdj_reference" + description: "VDJ refence index built by Cell Ranger mkref." + info: null + example: + - "reference_vdj.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_inner_enrichment_primers" + description: "V(D)J Immune Profiling libraries: if inner enrichment primers other\ + \ than those provided \nin the 10x Genomics kits are used, they need to be specified\ + \ here as a\ntext file with one primer per line.\n" + info: null + example: + - "enrichment_primers.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases, where N is the user-supplied value.\nNote that the length\ + \ includes the Barcode and UMI sequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases, where N is a user-supplied value. \nTrimming occurs\ + \ before sequencing metrics are computed and therefore, limiting the length\ + \ of Read 2 may affect Q30 scores\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "3' Cell multiplexing parameters (CellPlex Multiplexing)" + arguments: + - type: "string" + name: "--cell_multiplex_oligo_ids" + alternatives: + - "--cmo_ids" + description: "The Cell Multiplexing oligo IDs used to multiplex this sample. If\ + \ multiple CMOs were used for a sample,\nseparate IDs with a pipe (e.g., CMO301|CMO302).\ + \ Required for Cell Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--min_assignment_confidence" + description: "The minimum estimated likelihood to call a sample as tagged with\ + \ a Cell Multiplexing Oligo (CMO) instead of \"Unassigned\".\nUsers may wish\ + \ to tolerate a higher rate of mis-assignment in order to obtain more singlets\ + \ to include in their analysis,\nor a lower rate of mis-assignment at the cost\ + \ of obtaining fewer singlets.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cmo_set" + description: "Path to a custom CMO set CSV file, declaring CMO constructs and\ + \ associated barcodes. If the default CMO reference IDs that are built into\n\ + the Cell Ranger software are required, this option does not need to be used.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode_sample_assignment" + description: "Path to a barcode-sample assignment CSV file that specifies the\ + \ barcodes that belong to each sample.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Hashtag multiplexing parameters" + arguments: + - type: "string" + name: "--hashtag_ids" + description: "The hashtag IDs used to multiplex this sample. If multiple antibody\ + \ hashtags were used for the same sample,\nyou can separate IDs with a pipe.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "On-chip multiplexing parameters" + arguments: + - type: "string" + name: "--ocm_barcode_ids" + description: "The OCM barcode IDs used to multiplex this sample. Must be one of\ + \ OB1, OB2, OB3, OB4.\nIf multiple OCM Barcodes were used for the same sample,\ + \ you can separate IDs\nwith a pipe (e.g., OB1|OB2).\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Flex multiplexing paramaters" + arguments: + - type: "file" + name: "--probe_set" + description: "A probe set reference CSV file. It specifies the sequences used\ + \ as a reference for probe alignment and the gene ID associated with each probe.\n\ + It must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region\ + \ and an optional 5th column (probe file format 1.0.1).\n- gene_id: The Ensembl\ + \ gene identifier targeted by the probe.\n- probe_seq: The nucleotide sequence\ + \ of the probe, which is complementary to the transcript sequence.\n- probe_id:\ + \ The probe identifier, whose format is described in Probe identifiers.\n- included:\ + \ A TRUE or FALSE flag specifying whether the probe is included in the filtered\ + \ counts matrix output or excluded by the probe filter. \n See filter-probes\ + \ option of cellranger multi. All probes of a gene must be marked TRUE in the\ + \ included column for that gene to be included.\n- region: Present only in v1.0.1\ + \ probe set reference CSV. The gene boundary targeted by the probe. Accepted\ + \ values are spliced or unspliced.\n\nThe file also contains a number of required\ + \ metadata fields in the header in the format #key=value:\n- panel_name: The\ + \ name of the probe set.\n- panel_type: Always predesigned for predesigned probe\ + \ sets.\n- reference_genome: The reference genome build used for probe design.\n\ + - reference_version: The version of the Cell Ranger reference transcriptome\ + \ used for probe design.\n- probe_set_file_format: The version of the probe\ + \ set file format specification that this file conforms to.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--filter_probes" + description: "If 'false', include all non-deprecated probes listed in the probe\ + \ set reference CSV file.\nIf 'true' or not set, probes that are predicted to\ + \ have off-target activity to homologous genes are excluded from analysis.\n\ + Not filtering will result in UMI counts from all non-deprecated probes,\nincluding\ + \ those with predicted off-target activity, to be used in the analysis.\nProbes\ + \ whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--probe_barcode_ids" + description: "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex\ + \ GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing\ + \ Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001)\n\ + when an Antibody Capture library is present. The barcode pair order is BC+AB\ + \ and they\nare separated with a \"+\" (no spaces). Alternatively, you can specify\ + \ the Probe Barcode ID alone and\nCell Ranger's barcode pairing auto-detection\ + \ algorithm will automatically match to the corresponding Antibody\nMultiplexing\ + \ Barcode.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--emptydrops_minimum_umis" + description: "For singleplex Flex experiments, use this option to adjust the UMI\ + \ cutoff during the second step of cell calling.\nCell Ranger will still perform\ + \ the full cell calling process but will only evaluate barcodes with UMIs above\n\ + the threshold you specify.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Antigen Capture (BEAM) libary arguments" + description: "These arguments are recommended if an Antigen Capture (BEAM) library\ + \ is present. \nIt is needed to calculate the antigen specificity score.\n" + arguments: + - type: "string" + name: "--control_id" + description: "A user-defined ID for any negative controls used in the T/BCR Antigen\ + \ Capture assay. Must match id specified in the feature reference CSV.\nMay\ + \ only include ASCII characters and must not use whitespace, slash, quote, or\ + \ comma characters. \nEach ID must be unique and must not collide with a gene\ + \ identifier from the transcriptome.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--mhc_allele" + description: "The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele\ + \ name specified in the Feature Reference CSV.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "General arguments" + description: "These arguments are applicable to all library types.\n" + arguments: + - type: "boolean" + name: "--check_library_compatibility" + description: "Optional. This option allows users to disable the check that evaluates\ + \ 10x Barcode overlap between\nibraries when multiple libraries are specified\ + \ (e.g., Gene Expression + Antibody Capture). Setting\nthis option to false\ + \ will disable the check across all library combinations. We recommend running\n\ + this check (default), however if the pipeline errors out, users can bypass the\ + \ check to generate\noutputs for troubleshooting.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The folder to store the alignment results." + info: null + example: + - "/path/to/output" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Executor arguments" + arguments: + - type: "boolean_true" + name: "--dryrun" + description: "If true, the output directory will only contain the CWL input files,\ + \ but the pipeline itself will not be executed." + info: null + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using Cell Ranger multi." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "10x_5k_anticmv/raw/" + dest: "10x_5k_anticmv/raw/" +- type: "file" + path: "10x_5k_lung_crispr/raw/" + dest: "10x_5k_lung_crispr/raw/" +- type: "file" + path: "10x_5k_beam/raw/" + dest: "10x_5k_beam/raw/" +- type: "file" + path: "10x_5k_fixed/raw" + dest: "10x_5k_fixed/raw" +- type: "file" + path: "raw" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "veryhighmem" + - "highcpu" + - "highdisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps && rm -rf /var/lib/apt/lists/*\n" + - type: "python" + user: false + packages: + - "pandas" + - "pyyaml" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/cellranger_multi/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/cellranger_multi" + executable: "target/nextflow/mapping/cellranger_multi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/cellranger_multi/main.nf b/target/nextflow/mapping/cellranger_multi/main.nf new file mode 100644 index 00000000..de82f13e --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/main.nf @@ -0,0 +1,5251 @@ +// cellranger_multi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) +// * Dries De Maeyer (author) +// * Weiwei Schultz (contributor) +// * Dorien Roosen (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_multi", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input files", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The FASTQ files to be analyzed. FASTQ files should conform to the naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Feature type-specific input files", + "description" : "Helper functionality to allow feature type-specific input files, without the need to specify\nlibrary_type or library_id. The library_id will be inferred from the input paths.\n", + "arguments" : [ + { + "type" : "file", + "name" : "--gex_input", + "description" : "The FASTQ files to be analyzed for Gene Expression. FASTQ files should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--abc_input", + "description" : "The FASTQ files to be analyzed for Antibody Capture. FASTQ files should conform to \nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cgc_input", + "description" : "The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--mux_input", + "description" : "The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_input", + "description" : "The FASTQ files to be analyzed for VDJ. FASTQ files should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_t_input", + "description" : "The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform to the naming\nconventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_t_gd_input", + "description" : "The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_b_input", + "description" : "The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--agc_input", + "description" : "The FASTQ files to be analyzed for Antigen Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Library arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--library_id", + "description" : "The Illumina sample name to analyze. This must exactly match the 'Sample Name'part\nof the FASTQ files specified in the `--input` argument.\n", + "example" : [ + "mysample1" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_type", + "description" : "The underlying feature type of the library.\n", + "example" : [ + "Gene Expression" + ], + "required" : false, + "choices" : [ + "Gene Expression", + "VDJ", + "VDJ-T", + "VDJ-B", + "VDJ-T-GD", + "Antibody Capture", + "CRISPR Guide Capture", + "Multiplexing Capture", + "Antigen Capture", + "Custom" + ], + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_subsample", + "description" : "The rate at which reads from the provided FASTQ files are sampled.\nMust be strictly greater than 0 and less than or equal to 1.\n", + "example" : [ + "0.5" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_lanes", + "description" : "Lanes associated with this sample. Defaults to using all lanes.", + "example" : [ + "1-4" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_chemistry", + "description" : "Only applicable to FRP. Library-specific assay configuration. By default,\nthe assay configuration is detected automatically. Typically, users will\nnot need to specify a chemistry.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Sample parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--sample_ids", + "alternatives" : [ + "--cell_multiplex_sample_id" + ], + "description" : "A name to identify a multiplexed sample. Must be alphanumeric with hyphens and/or underscores,\nand less than 64 characters. Required for Cell Multiplexing libraries.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_description", + "alternatives" : [ + "--cell_multiplex_description" + ], + "description" : "A description for the sample.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sample_expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sample_force_cells", + "description" : "Force pipeline to use this number of cells, bypassing cell detection.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Feature Barcode library specific arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--feature_reference", + "description" : "Path to the Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes.\nRequired only for Antibody Capture or CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref for more information.\\"\n", + "example" : [ + "feature_reference.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--feature_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value. Note that the length includes the Barcode and UMI\nsequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--feature_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before sequencing metrics are computed\nand therefore, limiting the length of Read 2 may affect Q30 scores.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_crispr_umi", + "description" : "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs. Applicable only to datasets that include a\nCRISPR Guide Capture library.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Gene expression arguments", + "description" : "Arguments relevant to the analysis of gene expression data.", + "arguments" : [ + { + "type" : "file", + "name" : "--gex_reference", + "description" : "Genome refence index built by Cell Ranger mkref.", + "example" : [ + "reference_genome.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_secondary_analysis", + "description" : "Whether or not to run the secondary analysis e.g. clustering.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_generate_bam", + "description" : "Whether to generate a BAM file.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--tenx_cloud_token_path", + "description" : "The 10x Cloud Analysis user token used to enable cell annotation.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_annotation_model", + "description" : "\\"Cell annotation model to use. If auto, uses the default model for the species.\nIf not given, does not run cell annotation.\\"\n", + "required" : false, + "choices" : [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_force_cells", + "description" : "Force pipeline to use this number of cells, bypassing cell detection.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_include_introns", + "description" : "Whether or not to include intronic reads in counts.\nThis option does not apply to Fixed RNA Profiling analysis.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value. Note that the length includes the Barcode and UMI\nsequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before sequencing metrics are computed\nand therefore, limiting the length of Read 2 may affect Q30 scores.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gex_chemistry", + "description" : "Assay configuration. Either specify a single value which will be applied to all libraries,\nor a number of values that is equal to the number of libararies. The latter is only applicable\nto only applicable to Fixed RNA Profiling.\n - auto: Chemistry autodetection (default)\n - threeprime: Single Cell 3'\n - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single Cell 3' v1, v2, v3, or v4\n - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT\n - SC-FB: Single Cell Antibody-only 3' v2 or 5'\n - fiveprime: Single Cell 5'\n - SC5P-PE: Paired-end Single Cell 5'\n - SC5P-PE-v3: Paired-end Single Cell 5' v3\n - SC5P-R2: R2-only Single Cell 5'\n - SC5P-R2-v3: R2-only Single Cell 5' v3\n - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n - SC5PHT : Single Cell 5' v2 HT\n - SFRP: Fixed RNA Profiling (Singleplex)\n - MFRP: Fixed RNA Profiling (Multiplex, Probe Barcode on R2)\n - MFRP-R1: Fixed RNA Profiling (Multiplex, Probe Barcode on R1)\n - MFRP-RNA: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R2)\n - MFRP-Ab: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:69)\n - MFRP-Ab-R2pos50: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:50)\n - MFRP-RNA-R1: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R1)\n - MFRP-Ab-R1: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode on R1)\n - ARC-v1 for analyzing the Gene Expression portion of Multiome data. If Cell Ranger auto-detects ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information.\n", + "default" : [ + "auto" + ], + "required" : false, + "choices" : [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3-polyA", + "SC3Pv4", + "SC3Pv4-polyA", + "SC3Pv3LT", + "SC3Pv3HT", + "SC3Pv3HT-polyA", + "SC5P-PE", + "SC5P-PE-v3", + "SC5P-R2", + "SC-FB", + "SC5P-R2-v3", + "SCP5-PE-v3", + "SC5PHT", + "MFRP", + "MFRP-R1", + "MFRP-RNA", + "MFRP-Ab", + "SFRP", + "MFRP-Ab-R2pos50", + "MFRP-RNA-R1", + "MFRP-Ab-R1", + "ARC-v1" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "VDJ related parameters", + "arguments" : [ + { + "type" : "file", + "name" : "--vdj_reference", + "description" : "VDJ refence index built by Cell Ranger mkref.", + "example" : [ + "reference_vdj.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_inner_enrichment_primers", + "description" : "V(D)J Immune Profiling libraries: if inner enrichment primers other than those provided \nin the 10x Genomics kits are used, they need to be specified here as a\ntext file with one primer per line.\n", + "example" : [ + "enrichment_primers.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vdj_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, where N is the user-supplied value.\nNote that the length includes the Barcode and UMI sequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vdj_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, where N is a user-supplied value. \nTrimming occurs before sequencing metrics are computed and therefore, limiting the length of Read 2 may affect Q30 scores\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "3' Cell multiplexing parameters (CellPlex Multiplexing)", + "arguments" : [ + { + "type" : "string", + "name" : "--cell_multiplex_oligo_ids", + "alternatives" : [ + "--cmo_ids" + ], + "description" : "The Cell Multiplexing oligo IDs used to multiplex this sample. If multiple CMOs were used for a sample,\nseparate IDs with a pipe (e.g., CMO301|CMO302). Required for Cell Multiplexing libraries.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_assignment_confidence", + "description" : "The minimum estimated likelihood to call a sample as tagged with a Cell Multiplexing Oligo (CMO) instead of \\"Unassigned\\".\nUsers may wish to tolerate a higher rate of mis-assignment in order to obtain more singlets to include in their analysis,\nor a lower rate of mis-assignment at the cost of obtaining fewer singlets.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cmo_set", + "description" : "Path to a custom CMO set CSV file, declaring CMO constructs and associated barcodes. If the default CMO reference IDs that are built into\nthe Cell Ranger software are required, this option does not need to be used.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcode_sample_assignment", + "description" : "Path to a barcode-sample assignment CSV file that specifies the barcodes that belong to each sample.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Hashtag multiplexing parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--hashtag_ids", + "description" : "The hashtag IDs used to multiplex this sample. If multiple antibody hashtags were used for the same sample,\nyou can separate IDs with a pipe.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "On-chip multiplexing parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--ocm_barcode_ids", + "description" : "The OCM barcode IDs used to multiplex this sample. Must be one of OB1, OB2, OB3, OB4.\nIf multiple OCM Barcodes were used for the same sample, you can separate IDs\nwith a pipe (e.g., OB1|OB2).\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Flex multiplexing paramaters", + "arguments" : [ + { + "type" : "file", + "name" : "--probe_set", + "description" : "A probe set reference CSV file. It specifies the sequences used as a reference for probe alignment and the gene ID associated with each probe.\nIt must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region and an optional 5th column (probe file format 1.0.1).\n- gene_id: The Ensembl gene identifier targeted by the probe.\n- probe_seq: The nucleotide sequence of the probe, which is complementary to the transcript sequence.\n- probe_id: The probe identifier, whose format is described in Probe identifiers.\n- included: A TRUE or FALSE flag specifying whether the probe is included in the filtered counts matrix output or excluded by the probe filter. \n See filter-probes option of cellranger multi. All probes of a gene must be marked TRUE in the included column for that gene to be included.\n- region: Present only in v1.0.1 probe set reference CSV. The gene boundary targeted by the probe. Accepted values are spliced or unspliced.\n\nThe file also contains a number of required metadata fields in the header in the format #key=value:\n- panel_name: The name of the probe set.\n- panel_type: Always predesigned for predesigned probe sets.\n- reference_genome: The reference genome build used for probe design.\n- reference_version: The version of the Cell Ranger reference transcriptome used for probe design.\n- probe_set_file_format: The version of the probe set file format specification that this file conforms to.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--filter_probes", + "description" : "If 'false', include all non-deprecated probes listed in the probe set reference CSV file.\nIf 'true' or not set, probes that are predicted to have off-target activity to homologous genes are excluded from analysis.\nNot filtering will result in UMI counts from all non-deprecated probes,\nincluding those with predicted off-target activity, to be used in the analysis.\nProbes whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--probe_barcode_ids", + "description" : "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001)\nwhen an Antibody Capture library is present. The barcode pair order is BC+AB and they\nare separated with a \\"+\\" (no spaces). Alternatively, you can specify the Probe Barcode ID alone and\nCell Ranger's barcode pairing auto-detection algorithm will automatically match to the corresponding Antibody\nMultiplexing Barcode.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--emptydrops_minimum_umis", + "description" : "For singleplex Flex experiments, use this option to adjust the UMI cutoff during the second step of cell calling.\nCell Ranger will still perform the full cell calling process but will only evaluate barcodes with UMIs above\nthe threshold you specify.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Antigen Capture (BEAM) libary arguments", + "description" : "These arguments are recommended if an Antigen Capture (BEAM) library is present. \nIt is needed to calculate the antigen specificity score.\n", + "arguments" : [ + { + "type" : "string", + "name" : "--control_id", + "description" : "A user-defined ID for any negative controls used in the T/BCR Antigen Capture assay. Must match id specified in the feature reference CSV.\nMay only include ASCII characters and must not use whitespace, slash, quote, or comma characters. \nEach ID must be unique and must not collide with a gene identifier from the transcriptome.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--mhc_allele", + "description" : "The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele name specified in the Feature Reference CSV.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "General arguments", + "description" : "These arguments are applicable to all library types.\n", + "arguments" : [ + { + "type" : "boolean", + "name" : "--check_library_compatibility", + "description" : "Optional. This option allows users to disable the check that evaluates 10x Barcode overlap between\nibraries when multiple libraries are specified (e.g., Gene Expression + Antibody Capture). Setting\nthis option to false will disable the check across all library combinations. We recommend running\nthis check (default), however if the pipeline errors out, users can bypass the check to generate\noutputs for troubleshooting.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The folder to store the alignment results.", + "example" : [ + "/path/to/output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Executor arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--dryrun", + "description" : "If true, the output directory will only contain the CWL input files, but the pipeline itself will not be executed.", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using Cell Ranger multi.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv/raw/", + "dest" : "10x_5k_anticmv/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_lung_crispr/raw/", + "dest" : "10x_5k_lung_crispr/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_beam/raw/", + "dest" : "10x_5k_beam/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_fixed/raw/", + "dest" : "10x_5k_fixed/raw" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_beam/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "veryhighmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger:9.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*\n" + ] + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pandas", + "pyyaml" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/cellranger_multi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/cellranger_multi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from __future__ import annotations + +import sys +import re +import subprocess +from multiprocessing import cpu_count +import tempfile +import pandas as pd +import yaml +from typing import Optional, Any, Union +import tarfile +from pathlib import Path +import shutil +from itertools import chain + +CELL_RANGER_EXEC = shutil.which("cellranger") + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'gex_input': $( if [ ! -z ${VIASH_PAR_GEX_INPUT+x} ]; then echo "r'${VIASH_PAR_GEX_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'abc_input': $( if [ ! -z ${VIASH_PAR_ABC_INPUT+x} ]; then echo "r'${VIASH_PAR_ABC_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'cgc_input': $( if [ ! -z ${VIASH_PAR_CGC_INPUT+x} ]; then echo "r'${VIASH_PAR_CGC_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'mux_input': $( if [ ! -z ${VIASH_PAR_MUX_INPUT+x} ]; then echo "r'${VIASH_PAR_MUX_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'vdj_input': $( if [ ! -z ${VIASH_PAR_VDJ_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'vdj_t_input': $( if [ ! -z ${VIASH_PAR_VDJ_T_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_T_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'vdj_t_gd_input': $( if [ ! -z ${VIASH_PAR_VDJ_T_GD_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_T_GD_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'vdj_b_input': $( if [ ! -z ${VIASH_PAR_VDJ_B_INPUT+x} ]; then echo "r'${VIASH_PAR_VDJ_B_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'agc_input': $( if [ ! -z ${VIASH_PAR_AGC_INPUT+x} ]; then echo "r'${VIASH_PAR_AGC_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'library_id': $( if [ ! -z ${VIASH_PAR_LIBRARY_ID+x} ]; then echo "r'${VIASH_PAR_LIBRARY_ID//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'library_type': $( if [ ! -z ${VIASH_PAR_LIBRARY_TYPE+x} ]; then echo "r'${VIASH_PAR_LIBRARY_TYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'library_subsample': $( if [ ! -z ${VIASH_PAR_LIBRARY_SUBSAMPLE+x} ]; then echo "r'${VIASH_PAR_LIBRARY_SUBSAMPLE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'library_lanes': $( if [ ! -z ${VIASH_PAR_LIBRARY_LANES+x} ]; then echo "r'${VIASH_PAR_LIBRARY_LANES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'library_chemistry': $( if [ ! -z ${VIASH_PAR_LIBRARY_CHEMISTRY+x} ]; then echo "r'${VIASH_PAR_LIBRARY_CHEMISTRY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sample_ids': $( if [ ! -z ${VIASH_PAR_SAMPLE_IDS+x} ]; then echo "r'${VIASH_PAR_SAMPLE_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sample_description': $( if [ ! -z ${VIASH_PAR_SAMPLE_DESCRIPTION+x} ]; then echo "r'${VIASH_PAR_SAMPLE_DESCRIPTION//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sample_expect_cells': $( if [ ! -z ${VIASH_PAR_SAMPLE_EXPECT_CELLS+x} ]; then echo "list(map(int, r'${VIASH_PAR_SAMPLE_EXPECT_CELLS//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'sample_force_cells': $( if [ ! -z ${VIASH_PAR_SAMPLE_FORCE_CELLS+x} ]; then echo "list(map(int, r'${VIASH_PAR_SAMPLE_FORCE_CELLS//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'feature_reference': $( if [ ! -z ${VIASH_PAR_FEATURE_REFERENCE+x} ]; then echo "r'${VIASH_PAR_FEATURE_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'feature_r1_length': $( if [ ! -z ${VIASH_PAR_FEATURE_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_FEATURE_R1_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'feature_r2_length': $( if [ ! -z ${VIASH_PAR_FEATURE_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_FEATURE_R2_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_crispr_umi': $( if [ ! -z ${VIASH_PAR_MIN_CRISPR_UMI+x} ]; then echo "int(r'${VIASH_PAR_MIN_CRISPR_UMI//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gex_reference': $( if [ ! -z ${VIASH_PAR_GEX_REFERENCE+x} ]; then echo "r'${VIASH_PAR_GEX_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'gex_secondary_analysis': $( if [ ! -z ${VIASH_PAR_GEX_SECONDARY_ANALYSIS+x} ]; then echo "r'${VIASH_PAR_GEX_SECONDARY_ANALYSIS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'gex_generate_bam': $( if [ ! -z ${VIASH_PAR_GEX_GENERATE_BAM+x} ]; then echo "r'${VIASH_PAR_GEX_GENERATE_BAM//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'tenx_cloud_token_path': $( if [ ! -z ${VIASH_PAR_TENX_CLOUD_TOKEN_PATH+x} ]; then echo "r'${VIASH_PAR_TENX_CLOUD_TOKEN_PATH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cell_annotation_model': $( if [ ! -z ${VIASH_PAR_CELL_ANNOTATION_MODEL+x} ]; then echo "r'${VIASH_PAR_CELL_ANNOTATION_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'gex_expect_cells': $( if [ ! -z ${VIASH_PAR_GEX_EXPECT_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GEX_EXPECT_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gex_force_cells': $( if [ ! -z ${VIASH_PAR_GEX_FORCE_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GEX_FORCE_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gex_include_introns': $( if [ ! -z ${VIASH_PAR_GEX_INCLUDE_INTRONS+x} ]; then echo "r'${VIASH_PAR_GEX_INCLUDE_INTRONS//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'gex_r1_length': $( if [ ! -z ${VIASH_PAR_GEX_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_GEX_R1_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gex_r2_length': $( if [ ! -z ${VIASH_PAR_GEX_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_GEX_R2_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gex_chemistry': $( if [ ! -z ${VIASH_PAR_GEX_CHEMISTRY+x} ]; then echo "r'${VIASH_PAR_GEX_CHEMISTRY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_reference': $( if [ ! -z ${VIASH_PAR_VDJ_REFERENCE+x} ]; then echo "r'${VIASH_PAR_VDJ_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_inner_enrichment_primers': $( if [ ! -z ${VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS+x} ]; then echo "r'${VIASH_PAR_VDJ_INNER_ENRICHMENT_PRIMERS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vdj_r1_length': $( if [ ! -z ${VIASH_PAR_VDJ_R1_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_VDJ_R1_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'vdj_r2_length': $( if [ ! -z ${VIASH_PAR_VDJ_R2_LENGTH+x} ]; then echo "int(r'${VIASH_PAR_VDJ_R2_LENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cell_multiplex_oligo_ids': $( if [ ! -z ${VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS+x} ]; then echo "r'${VIASH_PAR_CELL_MULTIPLEX_OLIGO_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'min_assignment_confidence': $( if [ ! -z ${VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE+x} ]; then echo "float(r'${VIASH_PAR_MIN_ASSIGNMENT_CONFIDENCE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cmo_set': $( if [ ! -z ${VIASH_PAR_CMO_SET+x} ]; then echo "r'${VIASH_PAR_CMO_SET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'barcode_sample_assignment': $( if [ ! -z ${VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT+x} ]; then echo "r'${VIASH_PAR_BARCODE_SAMPLE_ASSIGNMENT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'hashtag_ids': $( if [ ! -z ${VIASH_PAR_HASHTAG_IDS+x} ]; then echo "r'${VIASH_PAR_HASHTAG_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'ocm_barcode_ids': $( if [ ! -z ${VIASH_PAR_OCM_BARCODE_IDS+x} ]; then echo "r'${VIASH_PAR_OCM_BARCODE_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'probe_set': $( if [ ! -z ${VIASH_PAR_PROBE_SET+x} ]; then echo "r'${VIASH_PAR_PROBE_SET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'filter_probes': $( if [ ! -z ${VIASH_PAR_FILTER_PROBES+x} ]; then echo "r'${VIASH_PAR_FILTER_PROBES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'probe_barcode_ids': $( if [ ! -z ${VIASH_PAR_PROBE_BARCODE_IDS+x} ]; then echo "r'${VIASH_PAR_PROBE_BARCODE_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'emptydrops_minimum_umis': $( if [ ! -z ${VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS+x} ]; then echo "int(r'${VIASH_PAR_EMPTYDROPS_MINIMUM_UMIS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'control_id': $( if [ ! -z ${VIASH_PAR_CONTROL_ID+x} ]; then echo "r'${VIASH_PAR_CONTROL_ID//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'mhc_allele': $( if [ ! -z ${VIASH_PAR_MHC_ALLELE+x} ]; then echo "r'${VIASH_PAR_MHC_ALLELE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'check_library_compatibility': $( if [ ! -z ${VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY+x} ]; then echo "r'${VIASH_PAR_CHECK_LIBRARY_COMPATIBILITY//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'dryrun': $( if [ ! -z ${VIASH_PAR_DRYRUN+x} ]; then echo "r'${VIASH_PAR_DRYRUN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +if not CELL_RANGER_EXEC: + raise RuntimeError( + "Could not find Cell Ranger executable. Please make sure it is installed and available in your PATH." + ) + +# Tested with cellranger 7.0: +# - omitting the lane number is allowed (e.g. \\`_L001\\`) +# - lane number should be omitted across all files if omitted in one +# - replacing \\`.fastq.\\` for \\`.fq.\\` is NOT allowed +# - omitting \\`.gz\\` is allowed + +fastq_regex = r"^([A-Za-z0-9\\\\-_\\\\.]+)_S(\\\\d+)_(L(\\\\d+)_)?[RI](\\\\d+)_(\\\\d+)\\\\.fastq(\\\\.gz)?\\$" +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_L001_R1_001.fastq.gz") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq") is not None +# assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq.gz.txt") is None + +# Invert some parameters. Keep the original ones in the config for compatibility +inverted_params = { + "gex_no_secondary_analysis": "gex_secondary_analysis", +} +for inverted_param, param in inverted_params.items(): + par[inverted_param] = not par[param] if par[param] is not None else None + del par[param] + +GEX_CONFIG_KEYS = { + "gex_reference": "reference", + "gex_expect_cells": "expect-cells", + "gex_force_cells": "force-cells", + "gex_chemistry": "chemistry", + "gex_no_secondary_analysis": "no-secondary", + "gex_generate_bam": "create-bam", + "gex_include_introns": "include-introns", + "min_assignment_confidence": "min-assignment-confidence", + "check_library_compatibility": "check-library-compatibility", + "barcode_sample_assignment": "barcode-sample-assignment", + "cmo_set": "cmo-set", + "probe_set": "probe-set", + "filter_probes": "filter-probes", + "gex_r1_length": "r1-length", + "gex_r2_length": "r2-length", + "tenx_cloud_token_path": "tenx-cloud-token-path", + "cell_annotation_model": "cell-annotation-model", + "emptydrops_minimum_umis": "emptydrops_minimum_umis", +} + +FEATURE_CONFIG_KEYS = { + "feature_reference": "reference", + "feature_r1_length": "r1-length", + "feature_r2_length": "r2-length", + "min_crispr_umi": "min-crispr-umi", +} + +VDJ_CONFIG_KEYS = { + "vdj_reference": "reference", + "vdj_inner_enrichment_primers": "inner-enrichment-primers", + "vdj_r1_length": "r1-length", + "vdj_r2_length": "r2-length", +} + + +ANTIGEN_SPECIFICITY_CONFIG_KEYS = { + "control_id": "control_id", + "mhc_allele": "mhc_allele", +} + + +REFERENCE_SECTIONS = { + "gene-expression": (GEX_CONFIG_KEYS, "index"), + "feature": (FEATURE_CONFIG_KEYS, "index"), + "vdj": (VDJ_CONFIG_KEYS, "index"), + "antigen-specificity": (ANTIGEN_SPECIFICITY_CONFIG_KEYS, "columns"), +} + +LIBRARY_CONFIG_KEYS = { + "library_id": "fastq_id", + "library_type": "feature_types", + "library_subsample": "subsample_rate", + "library_lanes": "lanes", + "library_chemistry": "chemistry", +} + + +SAMPLE_PARAMS_CONFIG_KEYS = { + "sample_ids": "sample_id", + "cell_multiplex_oligo_ids": "cmo_ids", + "sample_description": "description", + "probe_barcode_ids": "probe_barcode_ids", + "sample_expect_cells": "expect_cells", + "sample_force_cells": "force_cells", + "hashtag_ids": "hashtag_ids", + "ocm_barcode_ids": "ocm_barcode_ids", +} + + +# These are derived from the dictionaries above +REFERENCES = tuple( + reference_param + for reference_param, cellranger_param in chain( + GEX_CONFIG_KEYS.items(), FEATURE_CONFIG_KEYS.items(), VDJ_CONFIG_KEYS.items() + ) + if cellranger_param == "reference" +) +LIBRARY_PARAMS = tuple(LIBRARY_CONFIG_KEYS.keys()) +SAMPLE_PARAMS = tuple(SAMPLE_PARAMS_CONFIG_KEYS.keys()) +HELPER_INPUT = { + "gex_input": "Gene Expression", + "abc_input": "Antibody Capture", + "cgc_input": "CRISPR Guide Capture", + "mux_input": "Multiplexing Capture", + "vdj_input": "VDJ", + "vdj_t_input": "VDJ-T", + "vdj_t_gd_input": "VDJ-T-GD", + "vdj_b_input": "VDJ-B", + "agc_input": "Antigen Capture", +} + + +def infer_library_id_from_path(input_path: str) -> str: + match = re.match(fastq_regex, input_path) + assert match is not None, ( + f"File name of '{input_path}' should match regex {fastq_regex}." + ) + return match.group(1) + + +def transform_helper_inputs(par: dict[str, Any]) -> dict[str, Any]: + helper_input = {"input": [], "library_id": [], "library_type": []} + for input_type, library_type in HELPER_INPUT.items(): + if par[input_type]: + par[input_type] = resolve_input_directories_to_fastq_paths(par[input_type]) + + library_ids = [ + infer_library_id_from_path(path.name) for path in par[input_type] + ] + + library_id_dict = {} + for fastq, library_id in zip(par[input_type], library_ids): + library_id_dict.setdefault(library_id, []).append(fastq) + + for library_id, input in library_id_dict.items(): + helper_input["input"] += input + helper_input["library_id"].append(library_id) + helper_input["library_type"].append(library_type) + + assert len(helper_input["library_id"]) == len(set(helper_input["library_id"])), ( + "File names passed to feature type-specific inputs must be unique" + ) + + return helper_input + + +def lengths_gt1(dic: dict[str, Optional[list[Any]]]) -> dict[str, int]: + return { + key: len(li) + for key, li in dic.items() + if li is not None and isinstance(li, (list, tuple, set)) + } + + +def strip_margin(text: str) -> str: + return re.sub("(\\\\n?)[ \\\\t]*\\\\|", "\\\\\\\\1", text) + + +def subset_dict( + dictionary: dict[str, str], keys: Union[dict[str, str], list[str]] +) -> dict[str, str]: + if isinstance(keys, (list, tuple)): + keys = {key: key for key in keys} + return { + dest_key: dictionary[orig_key] + for orig_key, dest_key in keys.items() + if dictionary[orig_key] is not None + } + + +def check_subset_dict_equal_length( + group_name: str, dictionary: dict[str, list[str]] +) -> None: + lens = lengths_gt1(dictionary) + assert len(set(lens.values())) <= 1, ( + f"The number of values passed to {group_name} " + f"arguments must be 0, 1 or all the same. Offenders: {lens}" + ) + + +def resolve_input_directories_to_fastq_paths(input_paths: list[str]) -> list[Path]: + input_paths = [Path(fastq) for fastq in input_paths] + if len(input_paths) == 1 and input_paths[0].is_dir(): + logger.info( + "Detected a directory in input paths, " + "traversing to see if we can detect any FASTQ files." + ) + input_paths = [ + input_path + for input_path in input_paths[0].rglob("*") + if re.match(fastq_regex, input_path.name) + ] + + # check input fastq files + for input_path in input_paths: + assert re.match(fastq_regex, input_path.name) is not None, ( + f"File name of --input '{input_path}' should match regex {fastq_regex}." + ) + + return input_paths + + +def make_paths_absolute(par: dict[str, Any], config: Path | str): + with open(config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "file": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + def make_path_absolute(file: str | Path) -> Path: + logger.info("Making path %s absolute", file) + return Path(file).resolve() + + new_arg = ( + [make_path_absolute(file) for file in par_value] + if is_multiple + else make_path_absolute(par_value) + ) + par[arg_name] = new_arg + return par + + +def handle_integers_not_set(par: dict[str, Any], viash_config: Path | str) -> str: + """ + Allow to use \\`-1\\` to define a 'not set' value for arguments of \\`type: integer\\` with \\`multiple: true\\`. + """ + with open(viash_config, "r", encoding="utf-8") as open_viash_config: + config = yaml.safe_load(open_viash_config) + + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + for arg_name, arg in arguments.items(): + if not par.get(arg_name) or arg["type"] != "integer": + continue + par_value, is_multiple = par[arg_name], arg["multiple"] + assert is_multiple in (True, False) + + if not is_multiple: + continue + + def replace_notset_values(integer_value: int) -> int | None: + return None if integer_value == -1 else integer_value + + # Use an extension array to handle "None" values, otherwise int + NA + # values would be converted to a "float" dtype + new_arg = pd.array( + [replace_notset_values(value) for value in par_value], dtype="Int64" + ) + par[arg_name] = new_arg + return par + + +def process_params(par: dict[str, Any], viash_config: Path | str) -> str: + if par["input"]: + assert len(par["library_type"]) > 0, ( + "--library_type must be defined when passing input to --input" + ) + assert len(par["library_id"]) > 0, ( + "--library_id must be defined when passing input to --input" + ) + + # if par["input"] is a directory, look for fastq files + par["input"] = resolve_input_directories_to_fastq_paths(par["input"]) + + # add helper input + helper_input = transform_helper_inputs(par) + for key in ["input", "library_id", "library_type"]: + par[key] = (par[key] if par[key] else []) + helper_input[key] + + assert len(par[key]) > 0, ( + f"Either --{key} or feature type-specific input (e.g. --gex_input, --abc_input, ...) must be defined" + ) + + # check lengths of libraries metadata + library_dict = subset_dict(par, LIBRARY_PARAMS) + check_subset_dict_equal_length("Library", library_dict) + + samples_dict = subset_dict(par, SAMPLE_PARAMS) + check_subset_dict_equal_length("Samples", samples_dict) + + # Allow using -1 to indicate unset integers for arguments + # that accept multiple integers. + par = handle_integers_not_set(par, viash_config) + + # use absolute paths + return make_paths_absolute(par, viash_config) + + +def generate_csv_category(name: str, args: dict[str, str], orient: str) -> list[str]: + assert orient in ("index", "columns") + if not args: + return [] + title = [f"[{name}]"] + # Which index to include in csv section is based on orientation + to_csv_args = {"index": (orient == "index"), "header": (orient == "columns")} + values = [pd.DataFrame.from_dict(args, orient=orient).to_csv(**to_csv_args).strip()] + return title + values + [""] + + +def generate_config(par: dict[str, Any], fastq_dir: str) -> str: + content_list = [] + par["fastqs"] = fastq_dir + libraries = dict(LIBRARY_CONFIG_KEYS, **{"fastqs": "fastqs"}) + # TODO: use the union (|) operator when python is updated to 3.9 + all_sections = REFERENCE_SECTIONS | { + "libraries": (libraries, "columns"), + "samples": (SAMPLE_PARAMS_CONFIG_KEYS, "columns"), + } + for section_name, (section_params, orientation) in all_sections.items(): + reference_pars = subset_dict(par, section_params) + content_list += generate_csv_category( + section_name, reference_pars, orient=orientation + ) + + return "\\\\n".join(content_list) + + +def untar(tar_file, output_directory): + logger.info("Looking at %s to check if it needs decompressing", tar_file) + if tar_file and Path(tar_file).is_file() and tarfile.is_tarfile(tar_file): + extaction_dir_name = Path( + tar_file.stem + ).stem # Remove two extensions (if they exist) + unpacked_directory = output_directory / extaction_dir_name + logger.info("Extracting %s to %s", tar_file, unpacked_directory) + + with tarfile.open(tar_file, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_directory, members=members_to_move) + return unpacked_directory + return tar_file + + +def main(par: dict[str, Any], meta: dict[str, Any]): + logger.info("Processing params") + par = process_params(par, meta["config"]) + logger.info(par) + + with tempfile.TemporaryDirectory( + prefix="cellranger_multi-", dir=meta["temp_dir"] + ) as temp_dir: + logger.info("Using temporary directory %s", temp_dir) + temp_dir_path = Path(temp_dir) + for reference_par_name in REFERENCES: + par[reference_par_name] = untar(par[reference_par_name], temp_dir_path) + + logger.info("Creating symbolic links to temporary directory") + # Creating symlinks of fastq files to tempdir + input_symlinks_dir = temp_dir_path / "input_symlinks" + input_symlinks_dir.mkdir() + for fastq in par["input"]: + destination = input_symlinks_dir / fastq.name + destination.symlink_to(fastq) + + logger.info("Creating config file") + config_content = generate_config(par, input_symlinks_dir) + logger.info("Using config: \\\\n\\\\t%s", config_content.replace("\\\\n", "\\\\n\\\\t")) + par["output"].mkdir(parents=True, exist_ok=True) + config_file = par["output"] / "config.csv" + with open(config_file, "w") as f: + f.write(config_content) + + logger.info("Creating Cell Ranger argument list") + temp_id = "run" + cmd = [CELL_RANGER_EXEC, "multi", "--disable-ui", "--id", temp_id] + + command_line_parameters = { + "--localcores": int(meta["cpus"]) if meta["cpus"] else cpu_count() - 1, + "--csv": config_file, + } + if meta["memory_gb"]: + command_line_parameters["--localmem"] = ( + int(meta["memory_gb"]) - 2 + if meta["memory_gb"] > 2 + else meta["memory_gb"] + ) + cmd += [ + f"{param}={param_value}" + for param, param_value in command_line_parameters.items() + ] + + logger.info("> " + " ".join(cmd)) + + if par["dryrun"]: + logger.info("Not executing command. Dry run enabled.") + return + + logger.info("Starting Cell Ranger") + with ( + subprocess.Popen( + cmd, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=1, + cwd=temp_dir, + ) as process, + (par["output"] / "cellranger_multi.log").open("w") as open_log, + ): + for line in process.stdout: + print(line.strip(), flush=True) + open_log.write(line) + logger.info("Checking exit code.") + # Popen objects do not have a check_exitcode method. + if process.returncode != 0: + raise RuntimeError("Cell Ranger returned a nonzero exitcode!") + + logger.info("Checking if expected output files are present.") + # look for output dir file + tmp_output_dir = temp_dir_path / temp_id / "outs" + expected_files = { + Path("multi"): Path.is_dir, + Path("per_sample_outs"): Path.is_dir, + Path("config.csv"): Path.is_file, + } + for file_path, type_func in expected_files.items(): + output_path = tmp_output_dir / file_path + if not type_func(output_path): + raise ValueError(f"Could not find expected '{output_path}'") + logger.info("Copying output files to %s", par["output"]) + for output_path in tmp_output_dir.rglob("*"): + if output_path.name != "config.csv": # Already created + shutil.move(str(output_path), par["output"]) + + +if __name__ == "__main__": + logger.info("Component '%s' started.", meta["name"]) + main(par, meta) + logger.info("Finished!") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/cellranger_multi", + "tag" : "main" + }, + "label" : [ + "veryhighmem", + "highcpu", + "highdisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/cellranger_multi/nextflow.config b/target/nextflow/mapping/cellranger_multi/nextflow.config new file mode 100644 index 00000000..bce007da --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/cellranger_multi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using Cell Ranger multi.' + author = 'Dries Schaumont, Angela Oliveira Pisco, Robrecht Cannoodt, Dries De Maeyer, Weiwei Schultz, Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/cellranger_multi/nextflow_labels.config b/target/nextflow/mapping/cellranger_multi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/cellranger_multi/nextflow_schema.json b/target/nextflow/mapping/cellranger_multi/nextflow_schema.json new file mode 100644 index 00000000..3d1de3bc --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/nextflow_schema.json @@ -0,0 +1,574 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_multi", + "description": "Align fastq files using Cell Ranger multi.", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The folder to store the alignment results.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ", + "default": "$id.$key.output" + } + } + }, + "input files": { + "title": "Input files", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + } + } + }, + "feature type-specific input files": { + "title": "Feature type-specific input files", + "type": "object", + "description": "Helper functionality to allow feature type-specific input files, without the need to specify\nlibrary_type or library_id. The library_id will be inferred from the input paths.\n", + "properties": { + "gex_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Gene Expression", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "abc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Antibody Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "cgc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for CRISPR Guide Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "mux_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Multiplexing Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_t_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-T", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_t_gd_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-T-GD", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_b_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-B", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "agc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Antigen Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + } + } + }, + "library arguments": { + "title": "Library arguments", + "type": "object", + "description": "No description", + "properties": { + "library_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Illumina sample name to analyze", + "help_text": "Type: `string`, multiple: `True`, example: `[\"mysample1\"]`. " + }, + "library_type": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The underlying feature type of the library.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene Expression\"]`, choices: ``Gene Expression`, `VDJ`, `VDJ-T`, `VDJ-B`, `VDJ-T-GD`, `Antibody Capture`, `CRISPR Guide Capture`, `Multiplexing Capture`, `Antigen Capture`, `Custom``. " + }, + "library_subsample": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The rate at which reads from the provided FASTQ files are sampled.\nMust be strictly greater than 0 and less than or equal to 1.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"0.5\"]`. " + }, + "library_lanes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Lanes associated with this sample", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1-4\"]`. " + }, + "library_chemistry": { + "type": "string", + "description": "Only applicable to FRP", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "sample parameters": { + "title": "Sample parameters", + "type": "object", + "description": "No description", + "properties": { + "sample_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A name to identify a multiplexed sample", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sample_description": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A description for the sample.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sample_expect_cells": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "help_text": "Type: `integer`, multiple: `True`, example: `[3000]`. " + }, + "sample_force_cells": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Force pipeline to use this number of cells, bypassing cell detection.\n", + "help_text": "Type: `integer`, multiple: `True`, example: `[3000]`. " + } + } + }, + "feature barcode library specific arguments": { + "title": "Feature Barcode library specific arguments", + "type": "object", + "description": "No description", + "properties": { + "feature_reference": { + "type": "string", + "format": "path", + "description": "Path to the Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes.\nRequired only for Antibody Capture or CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref for more information.\"\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"feature_reference.csv\"`. " + }, + "feature_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "feature_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_crispr_umi": { + "type": "integer", + "description": "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "gene expression arguments": { + "title": "Gene expression arguments", + "type": "object", + "description": "Arguments relevant to the analysis of gene expression data.", + "properties": { + "gex_reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Genome refence index built by Cell Ranger mkref.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference_genome.tar.gz\"`. " + }, + "gex_secondary_analysis": { + "type": "boolean", + "description": "Whether or not to run the secondary analysis e.g", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "gex_generate_bam": { + "type": "boolean", + "description": "Whether to generate a BAM file.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "tenx_cloud_token_path": { + "type": "string", + "format": "path", + "description": "The 10x Cloud Analysis user token used to enable cell annotation.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "cell_annotation_model": { + "type": "string", + "description": "\"Cell annotation model to use", + "help_text": "Type: `string`, multiple: `False`, choices: ``auto`, `human_pca_v1_beta`, `mouse_pca_v1_beta``. ", + "enum": [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ] + }, + "gex_expect_cells": { + "type": "integer", + "description": "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "gex_force_cells": { + "type": "integer", + "description": "Force pipeline to use this number of cells, bypassing cell detection.\n", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "gex_include_introns": { + "type": "boolean", + "description": "Whether or not to include intronic reads in counts.\nThis option does not apply to Fixed RNA Profiling analysis.\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "gex_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "gex_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "gex_chemistry": { + "type": "string", + "description": "Assay configuration", + "help_text": "Type: `string`, multiple: `False`, default: `\"auto\"`, choices: ``auto`, `threeprime`, `fiveprime`, `SC3Pv1`, `SC3Pv2`, `SC3Pv3`, `SC3Pv3-polyA`, `SC3Pv4`, `SC3Pv4-polyA`, `SC3Pv3LT`, `SC3Pv3HT`, `SC3Pv3HT-polyA`, `SC5P-PE`, `SC5P-PE-v3`, `SC5P-R2`, `SC-FB`, `SC5P-R2-v3`, `SCP5-PE-v3`, `SC5PHT`, `MFRP`, `MFRP-R1`, `MFRP-RNA`, `MFRP-Ab`, `SFRP`, `MFRP-Ab-R2pos50`, `MFRP-RNA-R1`, `MFRP-Ab-R1`, `ARC-v1``. ", + "enum": [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3-polyA", + "SC3Pv4", + "SC3Pv4-polyA", + "SC3Pv3LT", + "SC3Pv3HT", + "SC3Pv3HT-polyA", + "SC5P-PE", + "SC5P-PE-v3", + "SC5P-R2", + "SC-FB", + "SC5P-R2-v3", + "SCP5-PE-v3", + "SC5PHT", + "MFRP", + "MFRP-R1", + "MFRP-RNA", + "MFRP-Ab", + "SFRP", + "MFRP-Ab-R2pos50", + "MFRP-RNA-R1", + "MFRP-Ab-R1", + "ARC-v1" + ], + "default": "auto" + } + } + }, + "vdj related parameters": { + "title": "VDJ related parameters", + "type": "object", + "description": "No description", + "properties": { + "vdj_reference": { + "type": "string", + "format": "path", + "description": "VDJ refence index built by Cell Ranger mkref.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference_vdj.tar.gz\"`. " + }, + "vdj_inner_enrichment_primers": { + "type": "string", + "format": "path", + "description": "V(D)J Immune Profiling libraries: if inner enrichment primers other than those provided \nin the 10x Genomics kits are used, they need to be specified here as a\ntext file with one primer per line.\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"enrichment_primers.txt\"`. " + }, + "vdj_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, where N is the user-supplied value.\nNote that the length includes the Barcode and UMI sequences so do not set this below 26.\n", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "vdj_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, where N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "3' cell multiplexing parameters (cellplex multiplexing)": { + "title": "3' Cell multiplexing parameters (CellPlex Multiplexing)", + "type": "object", + "description": "No description", + "properties": { + "cell_multiplex_oligo_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Cell Multiplexing oligo IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + }, + "min_assignment_confidence": { + "type": "number", + "description": "The minimum estimated likelihood to call a sample as tagged with a Cell Multiplexing Oligo (CMO) instead of \"Unassigned\".\nUsers may wish to tolerate a higher rate of mis-assignment in order to obtain more singlets to include in their analysis,\nor a lower rate of mis-assignment at the cost of obtaining fewer singlets.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "cmo_set": { + "type": "string", + "format": "path", + "description": "Path to a custom CMO set CSV file, declaring CMO constructs and associated barcodes", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "barcode_sample_assignment": { + "type": "string", + "format": "path", + "description": "Path to a barcode-sample assignment CSV file that specifies the barcodes that belong to each sample.\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + } + } + }, + "hashtag multiplexing parameters": { + "title": "Hashtag multiplexing parameters", + "type": "object", + "description": "No description", + "properties": { + "hashtag_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The hashtag IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "on-chip multiplexing parameters": { + "title": "On-chip multiplexing parameters", + "type": "object", + "description": "No description", + "properties": { + "ocm_barcode_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The OCM barcode IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "flex multiplexing paramaters": { + "title": "Flex multiplexing paramaters", + "type": "object", + "description": "No description", + "properties": { + "probe_set": { + "type": "string", + "format": "path", + "description": "A probe set reference CSV file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "filter_probes": { + "type": "boolean", + "description": "If 'false', include all non-deprecated probes listed in the probe set reference CSV file.\nIf 'true' or not set, probes that are predicted to have off-target activity to homologous genes are excluded from analysis.\nNot filtering will result in UMI counts from all non-deprecated probes,\nincluding those with predicted off-target activity, to be used in the analysis.\nProbes whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "probe_barcode_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing Barcode IDs", + "help_text": "Type: `string`, multiple: `True`. " + }, + "emptydrops_minimum_umis": { + "type": "integer", + "description": "For singleplex Flex experiments, use this option to adjust the UMI cutoff during the second step of cell calling.\nCell Ranger will still perform the full cell calling process but will only evaluate barcodes with UMIs above\nthe threshold you specify.\n", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "antigen capture (beam) libary arguments": { + "title": "Antigen Capture (BEAM) libary arguments", + "type": "object", + "description": "These arguments are recommended if an Antigen Capture (BEAM) library is present. \nIt is needed to calculate the antigen specificity score.\n", + "properties": { + "control_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A user-defined ID for any negative controls used in the T/BCR Antigen Capture assay", + "help_text": "Type: `string`, multiple: `True`. " + }, + "mhc_allele": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The MHC allele for TCR Antigen Capture libraries", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "general arguments": { + "title": "General arguments", + "type": "object", + "description": "These arguments are applicable to all library types.\n", + "properties": { + "check_library_compatibility": { + "type": "boolean", + "description": "Optional", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "executor arguments": { + "title": "Executor arguments", + "type": "object", + "description": "No description", + "properties": { + "dryrun": { + "type": "boolean", + "description": "If true, the output directory will only contain the CWL input files, but the pipeline itself will not be executed.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/input files" + }, + { + "$ref": "#/$defs/feature type-specific input files" + }, + { + "$ref": "#/$defs/library arguments" + }, + { + "$ref": "#/$defs/sample parameters" + }, + { + "$ref": "#/$defs/feature barcode library specific arguments" + }, + { + "$ref": "#/$defs/gene expression arguments" + }, + { + "$ref": "#/$defs/vdj related parameters" + }, + { + "$ref": "#/$defs/3' cell multiplexing parameters (cellplex multiplexing)" + }, + { + "$ref": "#/$defs/hashtag multiplexing parameters" + }, + { + "$ref": "#/$defs/on-chip multiplexing parameters" + }, + { + "$ref": "#/$defs/flex multiplexing paramaters" + }, + { + "$ref": "#/$defs/antigen capture (beam) libary arguments" + }, + { + "$ref": "#/$defs/general arguments" + }, + { + "$ref": "#/$defs/executor arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/cellranger_multi/setup_logger.py b/target/nextflow/mapping/cellranger_multi/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/mapping/cellranger_multi/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/mapping/htseq_count/.config.vsh.yaml b/target/nextflow/mapping/htseq_count/.config.vsh.yaml new file mode 100644 index 00000000..dc7167ff --- /dev/null +++ b/target/nextflow/mapping/htseq_count/.config.vsh.yaml @@ -0,0 +1,472 @@ +name: "htseq_count" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Path to the SAM/BAM files containing the mapped reads." + info: + orig_arg: "samfilenames" + example: + - "mysample1.BAM" + - "mysample2.BAM" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "Path to the GTF file containing the features." + info: + orig_arg: "featurefilename" + example: + - "reference.gtf" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Filename to output the counts to." + info: + orig_arg: "--counts_output" + example: + - "htseq-count.tsv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_delimiter" + description: "Column delimiter in output." + info: + orig_arg: "--delimiter" + example: + - "\t" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_sam" + description: "Write out all SAM alignment records into SAM/BAM files (one per\ + \ input file needed), \nannotating each line with its feature assignment (as\ + \ an optional field with tag 'XF'). \nSee the -p option to use BAM instead of\ + \ SAM.\n" + info: + orig_arg: "--samout" + example: + - "mysample1_out.BAM" + - "mysample2_out.BAM" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_sam_format" + description: "Format to use with the --output_sam argument." + info: + orig_arg: "--samout-format" + required: false + choices: + - "sam" + - "bam" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--order" + alternatives: + - "-r" + description: "Sorting order of . Paired-end sequencing data must\ + \ be sorted either by position or\nby read name, and the sorting order must\ + \ be specified. Ignored for single-end data.\n" + info: + orig_arg: "--order" + default: + - "name" + required: false + choices: + - "pos" + - "name" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--stranded" + alternatives: + - "-s" + description: "Whether the data is from a strand-specific assay. 'reverse' means\ + \ 'yes' with reversed strand interpretation." + info: + orig_arg: "--stranded" + default: + - "yes" + required: false + choices: + - "yes" + - "no" + - "reverse" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--minimum_alignment_quality" + alternatives: + - "-a" + - "--minaqual" + description: "Skip all reads with MAPQ alignment quality lower than the given\ + \ minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends\ + \ on the software \nused to map the reads.\n" + info: + orig_arg: "--minaqual" + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--type" + alternatives: + - "-t" + description: "Feature type (3rd column in GTF file) to be used, all features of\ + \ other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + orig_arg: "--type" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_attribute" + alternatives: + - "-i" + description: "GTF attribute to be used as feature ID (default, suitable for Ensembl\ + \ GTF files: gene_id).\nAll feature of the right type (see -t option) within\ + \ the same GTF attribute will be added\ntogether. The typical way of using this\ + \ option is to count all exonic reads from each gene\nand add the exons but\ + \ other uses are possible as well. You can call this option multiple\ntimes:\ + \ in that case, the combination of all attributes separated by colons (:) will\ + \ be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i\ + \ exon_number.\n" + info: + orig_arg: "--idattr" + example: + - "gene_id" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--additional_attributes" + description: "Additional feature attributes (suitable for Ensembl GTF files: gene_name).\ + \ Use multiple times\nfor more than one additional attribute. These attributes\ + \ are only used as annotations in the\noutput, while the determination of how\ + \ the counts are added together is done based on option -i.\n" + info: + orig_arg: "--additional-attr" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--add_chromosome_info" + description: "Store information about the chromosome of each feature as an additional\ + \ attribute\n(e.g. colunm in the TSV output file).\n" + info: + orig_arg: "--add-chromosome-info" + direction: "input" + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "Mode to handle reads overlapping more than one feature." + info: + orig_arg: "--mode" + default: + - "union" + required: false + choices: + - "union" + - "intersection-strict" + - "intersection-nonempty" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_unique" + description: "Whether and how to score reads that are not uniquely aligned or\ + \ ambiguously assigned to features." + info: + orig_arg: "--nonunique" + default: + - "none" + required: false + choices: + - "none" + - "all" + - "fraction" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--secondary_alignments" + description: "Whether to score secondary alignments (0x100 flag)." + info: + orig_arg: "--secondary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--supplementary_alignments" + description: "Whether to score supplementary alignments (0x800 flag)." + info: + orig_arg: "--supplementary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--counts_output_sparse" + description: "Store the counts as a sparse matrix (mtx, h5ad, loom)." + info: + orig_arg: "--counts-output-sparse" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Quantify gene expression for subsequent testing for differential expression.\n\ + \nThis script takes one or more alignment files in SAM/BAM format and a feature\ + \ file in GFF format and calculates for each feature the number of reads mapping\ + \ to it. \n\nSee http://htseq.readthedocs.io/en/master/count.html for details.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "HTSeq" + - "pyyaml" + - "scipy" + - "pandas" + - "numpy<2" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/htseq_count/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/htseq_count" + executable: "target/nextflow/mapping/htseq_count/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/htseq_count/main.nf b/target/nextflow/mapping/htseq_count/main.nf new file mode 100644 index 00000000..8a947484 --- /dev/null +++ b/target/nextflow/mapping/htseq_count/main.nf @@ -0,0 +1,4310 @@ +// htseq_count main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Angela Oliveira Pisco (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "htseq_count", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the SAM/BAM files containing the mapped reads.", + "info" : { + "orig_arg" : "samfilenames" + }, + "example" : [ + "mysample1.BAM", + "mysample2.BAM" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "description" : "Path to the GTF file containing the features.", + "info" : { + "orig_arg" : "featurefilename" + }, + "example" : [ + "reference.gtf" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Filename to output the counts to.", + "info" : { + "orig_arg" : "--counts_output" + }, + "example" : [ + "htseq-count.tsv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_delimiter", + "description" : "Column delimiter in output.", + "info" : { + "orig_arg" : "--delimiter" + }, + "example" : [ + "\t" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_sam", + "description" : "Write out all SAM alignment records into SAM/BAM files (one per input file needed), \nannotating each line with its feature assignment (as an optional field with tag 'XF'). \nSee the -p option to use BAM instead of SAM.\n", + "info" : { + "orig_arg" : "--samout" + }, + "example" : [ + "mysample1_out.BAM", + "mysample2_out.BAM" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_sam_format", + "description" : "Format to use with the --output_sam argument.", + "info" : { + "orig_arg" : "--samout-format" + }, + "required" : false, + "choices" : [ + "sam", + "bam" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--order", + "alternatives" : [ + "-r" + ], + "description" : "Sorting order of . Paired-end sequencing data must be sorted either by position or\nby read name, and the sorting order must be specified. Ignored for single-end data.\n", + "info" : { + "orig_arg" : "--order" + }, + "default" : [ + "name" + ], + "required" : false, + "choices" : [ + "pos", + "name" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--stranded", + "alternatives" : [ + "-s" + ], + "description" : "Whether the data is from a strand-specific assay. 'reverse' means 'yes' with reversed strand interpretation.", + "info" : { + "orig_arg" : "--stranded" + }, + "default" : [ + "yes" + ], + "required" : false, + "choices" : [ + "yes", + "no", + "reverse" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--minimum_alignment_quality", + "alternatives" : [ + "-a", + "--minaqual" + ], + "description" : "Skip all reads with MAPQ alignment quality lower than the given minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends on the software \nused to map the reads.\n", + "info" : { + "orig_arg" : "--minaqual" + }, + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--type", + "alternatives" : [ + "-t" + ], + "description" : "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)", + "info" : { + "orig_arg" : "--type" + }, + "example" : [ + "exon" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--id_attribute", + "alternatives" : [ + "-i" + ], + "description" : "GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id).\nAll feature of the right type (see -t option) within the same GTF attribute will be added\ntogether. The typical way of using this option is to count all exonic reads from each gene\nand add the exons but other uses are possible as well. You can call this option multiple\ntimes: in that case, the combination of all attributes separated by colons (:) will be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i exon_number.\n", + "info" : { + "orig_arg" : "--idattr" + }, + "example" : [ + "gene_id" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--additional_attributes", + "description" : "Additional feature attributes (suitable for Ensembl GTF files: gene_name). Use multiple times\nfor more than one additional attribute. These attributes are only used as annotations in the\noutput, while the determination of how the counts are added together is done based on option -i.\n", + "info" : { + "orig_arg" : "--additional-attr" + }, + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--add_chromosome_info", + "description" : "Store information about the chromosome of each feature as an additional attribute\n(e.g. colunm in the TSV output file).\n", + "info" : { + "orig_arg" : "--add-chromosome-info" + }, + "direction" : "input" + }, + { + "type" : "string", + "name" : "--mode", + "alternatives" : [ + "-m" + ], + "description" : "Mode to handle reads overlapping more than one feature.", + "info" : { + "orig_arg" : "--mode" + }, + "default" : [ + "union" + ], + "required" : false, + "choices" : [ + "union", + "intersection-strict", + "intersection-nonempty" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--non_unique", + "description" : "Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features.", + "info" : { + "orig_arg" : "--nonunique" + }, + "default" : [ + "none" + ], + "required" : false, + "choices" : [ + "none", + "all", + "fraction", + "random" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--secondary_alignments", + "description" : "Whether to score secondary alignments (0x100 flag).", + "info" : { + "orig_arg" : "--secondary-alignments" + }, + "required" : false, + "choices" : [ + "score", + "ignore" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--supplementary_alignments", + "description" : "Whether to score supplementary alignments (0x800 flag).", + "info" : { + "orig_arg" : "--supplementary-alignments" + }, + "required" : false, + "choices" : [ + "score", + "ignore" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--counts_output_sparse", + "description" : "Store the counts as a sparse matrix (mtx, h5ad, loom).", + "info" : { + "orig_arg" : "--counts-output-sparse" + }, + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Quantify gene expression for subsequent testing for differential expression.\n\nThis script takes one or more alignment files in SAM/BAM format and a feature file in GFF format and calculates for each feature the number of reads mapping to it. \n\nSee http://htseq.readthedocs.io/en/master/count.html for details.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "HTSeq", + "pyyaml", + "scipy", + "pandas", + "numpy<2" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/htseq_count/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/htseq_count", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil +import yaml + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_delimiter': $( if [ ! -z ${VIASH_PAR_OUTPUT_DELIMITER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_DELIMITER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_sam': $( if [ ! -z ${VIASH_PAR_OUTPUT_SAM+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SAM//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output_sam_format': $( if [ ! -z ${VIASH_PAR_OUTPUT_SAM_FORMAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_SAM_FORMAT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'order': $( if [ ! -z ${VIASH_PAR_ORDER+x} ]; then echo "r'${VIASH_PAR_ORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'stranded': $( if [ ! -z ${VIASH_PAR_STRANDED+x} ]; then echo "r'${VIASH_PAR_STRANDED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'minimum_alignment_quality': $( if [ ! -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then echo "int(r'${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'type': $( if [ ! -z ${VIASH_PAR_TYPE+x} ]; then echo "r'${VIASH_PAR_TYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'id_attribute': $( if [ ! -z ${VIASH_PAR_ID_ATTRIBUTE+x} ]; then echo "r'${VIASH_PAR_ID_ATTRIBUTE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'additional_attributes': $( if [ ! -z ${VIASH_PAR_ADDITIONAL_ATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_ADDITIONAL_ATTRIBUTES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'add_chromosome_info': $( if [ ! -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then echo "r'${VIASH_PAR_ADD_CHROMOSOME_INFO//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'mode': $( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "r'${VIASH_PAR_MODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'non_unique': $( if [ ! -z ${VIASH_PAR_NON_UNIQUE+x} ]; then echo "r'${VIASH_PAR_NON_UNIQUE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'secondary_alignments': $( if [ ! -z ${VIASH_PAR_SECONDARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SECONDARY_ALIGNMENTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'supplementary_alignments': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'counts_output_sparse': $( if [ ! -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then echo "r'${VIASH_PAR_COUNTS_OUTPUT_SPARSE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print( + f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print( + f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True + ) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + orig_arg = arg.get("info", {}).get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +######################## +### Main code ### +######################## + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + + +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + print(">> Constructing command", flush=True) + cmd_args = ["htseq-count"] + generate_args(par, config) + + # manually process cpus parameter + if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--nprocesses", str(meta["cpus"])]) + + print(">> Running htseq-count with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/htseq_count", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/htseq_count/nextflow.config b/target/nextflow/mapping/htseq_count/nextflow.config new file mode 100644 index 00000000..899a0b18 --- /dev/null +++ b/target/nextflow/mapping/htseq_count/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/htseq_count' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Quantify gene expression for subsequent testing for differential expression.\n\nThis script takes one or more alignment files in SAM/BAM format and a feature file in GFF format and calculates for each feature the number of reads mapping to it. \n\nSee http://htseq.readthedocs.io/en/master/count.html for details.\n' + author = 'Robrecht Cannoodt, Angela Oliveira Pisco' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/htseq_count/nextflow_labels.config b/target/nextflow/mapping/htseq_count/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/htseq_count/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/htseq_count/nextflow_schema.json b/target/nextflow/mapping/htseq_count/nextflow_schema.json new file mode 100644 index 00000000..b3ff3579 --- /dev/null +++ b/target/nextflow/mapping/htseq_count/nextflow_schema.json @@ -0,0 +1,204 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "htseq_count", + "description": "Quantify gene expression for subsequent testing for differential expression.\n\nThis script takes one or more alignment files in SAM/BAM format and a feature file in GFF format and calculates for each feature the number of reads mapping to it. \n\nSee http://htseq.readthedocs.io/en/master/count.html for details.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "order": { + "type": "string", + "description": "Sorting order of ", + "help_text": "Type: `string`, multiple: `False`, default: `\"name\"`, choices: ``pos`, `name``. ", + "enum": [ + "pos", + "name" + ], + "default": "name" + }, + "stranded": { + "type": "string", + "description": "Whether the data is from a strand-specific assay", + "help_text": "Type: `string`, multiple: `False`, default: `\"yes\"`, choices: ``yes`, `no`, `reverse``. ", + "enum": [ + "yes", + "no", + "reverse" + ], + "default": "yes" + }, + "minimum_alignment_quality": { + "type": "integer", + "description": "Skip all reads with MAPQ alignment quality lower than the given minimum value", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "type": { + "type": "string", + "description": "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)", + "help_text": "Type: `string`, multiple: `False`, example: `\"exon\"`. " + }, + "id_attribute": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id).\nAll feature of the right type (see -t option) within the same GTF attribute will be added\ntogether", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_id\"]`. " + }, + "additional_attributes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Additional feature attributes (suitable for Ensembl GTF files: gene_name)", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_name\"]`. " + }, + "add_chromosome_info": { + "type": "boolean", + "description": "Store information about the chromosome of each feature as an additional attribute\n(e.g", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "mode": { + "type": "string", + "description": "Mode to handle reads overlapping more than one feature.", + "help_text": "Type: `string`, multiple: `False`, default: `\"union\"`, choices: ``union`, `intersection-strict`, `intersection-nonempty``. ", + "enum": [ + "union", + "intersection-strict", + "intersection-nonempty" + ], + "default": "union" + }, + "non_unique": { + "type": "string", + "description": "Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features.", + "help_text": "Type: `string`, multiple: `False`, default: `\"none\"`, choices: ``none`, `all`, `fraction`, `random``. ", + "enum": [ + "none", + "all", + "fraction", + "random" + ], + "default": "none" + }, + "secondary_alignments": { + "type": "string", + "description": "Whether to score secondary alignments (0x100 flag).", + "help_text": "Type: `string`, multiple: `False`, choices: ``score`, `ignore``. ", + "enum": [ + "score", + "ignore" + ] + }, + "supplementary_alignments": { + "type": "string", + "description": "Whether to score supplementary alignments (0x800 flag).", + "help_text": "Type: `string`, multiple: `False`, choices: ``score`, `ignore``. ", + "enum": [ + "score", + "ignore" + ] + }, + "counts_output_sparse": { + "type": "boolean", + "description": "Store the counts as a sparse matrix (mtx, h5ad, loom).", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Path to the SAM/BAM files containing the mapped reads.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"mysample1.BAM\";\"mysample2.BAM\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the GTF file containing the features.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.gtf\"`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Filename to output the counts to.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.tsv\"`, direction: `output`, example: `\"htseq-count.tsv\"`. ", + "default": "$id.$key.output.tsv" + }, + "output_delimiter": { + "type": "string", + "description": "Column delimiter in output.", + "help_text": "Type: `string`, multiple: `False`, example: `\"\t\"`. " + }, + "output_sam": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Write out all SAM alignment records into SAM/BAM files (one per input file needed), \nannotating each line with its feature assignment (as an optional field with tag 'XF')", + "help_text": "Type: `file`, multiple: `True`, default: `\"$id.$key.output_sam_*.BAM\"`, direction: `output`, example: `[\"mysample1_out.BAM\";\"mysample2_out.BAM\"]`. ", + "default": "$id.$key.output_sam_*.BAM" + }, + "output_sam_format": { + "type": "string", + "description": "Format to use with the --output_sam argument.", + "help_text": "Type: `string`, multiple: `False`, choices: ``sam`, `bam``. ", + "enum": [ + "sam", + "bam" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/htseq_count_to_h5mu/.config.vsh.yaml b/target/nextflow/mapping/htseq_count_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..bf90cb07 --- /dev/null +++ b/target/nextflow/mapping/htseq_count_to_h5mu/.config.vsh.yaml @@ -0,0 +1,283 @@ +name: "htseq_count_to_h5mu" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--input_id" + description: "The obs index for the counts" + info: null + example: + - "foo" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_counts" + description: "The counts as a TSV file as output by HTSeq." + info: null + example: + - "counts.tsv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The GTF file." + info: null + example: + - "gencode_v41_star" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the htseq table to a h5mu.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "gtfparse" + upgrade: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "pyarrow~=18.0.0" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/htseq_count_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/htseq_count_to_h5mu" + executable: "target/nextflow/mapping/htseq_count_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/htseq_count_to_h5mu/main.nf b/target/nextflow/mapping/htseq_count_to_h5mu/main.nf new file mode 100644 index 00000000..a7b069f1 --- /dev/null +++ b/target/nextflow/mapping/htseq_count_to_h5mu/main.nf @@ -0,0 +1,4062 @@ +// htseq_count_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Angela Oliveira Pisco (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "htseq_count_to_h5mu", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "string", + "name" : "--input_id", + "description" : "The obs index for the counts", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_counts", + "description" : "The counts as a TSV file as output by HTSeq.", + "example" : [ + "counts.tsv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "description" : "The GTF file.", + "example" : [ + "gencode_v41_star" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert the htseq table to a h5mu.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "gtfparse" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "pyarrow~=18.0.0" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/htseq_count_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/htseq_count_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import tempfile +from pathlib import Path +import tarfile +import gzip +import shutil +import pandas as pd +import mudata as md +import anndata as ad +import polars as pl +import numpy as np +import gtfparse + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'input_counts': $( if [ ! -z ${VIASH_PAR_INPUT_COUNTS+x} ]; then echo "r'${VIASH_PAR_INPUT_COUNTS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +print("> combine counts data", flush=True) +counts_data = [] + +for input_id, input_counts in zip(par["input_id"], par["input_counts"]): + data = pd.read_table( + input_counts, + index_col=0, + names=["gene_ids", input_id], + dtype={"gene_ids": "U", input_id: "i"}, + ).transpose() + counts_data.append(data) + +# combine all counts +counts_and_qc = pd.concat(counts_data, axis=0) + +print("> split qc", flush=True) +idx = counts_and_qc.columns.str.startswith("_") +qc = counts_and_qc.loc[:, idx] +qc.columns = qc.columns.str.replace("^__", "", regex=True) +counts = counts_and_qc.loc[:, ~idx] + +print("> construct var", flush=True) +with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + reference = Path(par["reference"]) + + print(f">> Check compression of --reference with value: {reference}", flush=True) + par["reference"] = extract_if_need_be(reference, temp_dir_path) + + # read_gtf only works on str object, not pathlib.Path + reference = gtfparse.read_gtf(str(par["reference"])) + + +# This is a polars dataframe, not pandas +reference_genes = reference.filter( + (pl.col("feature") == "gene") & (pl.col("gene_id").is_in(list(counts.columns))) +).sort("gene_id") + +var = pd.DataFrame( + data={ + "gene_ids": pd.Index(reference_genes.get_column("gene_id")), + "feature_types": "Gene Expression", + "gene_symbol": reference_genes.get_column("gene_name").to_pandas(), + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData(X=counts, obsm={"qc_htseq": qc}, var=var, dtype=np.int32) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/htseq_count_to_h5mu", + "tag" : "main" + }, + "label" : [ + "highmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/htseq_count_to_h5mu/nextflow.config b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow.config new file mode 100644 index 00000000..50f50e98 --- /dev/null +++ b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/htseq_count_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert the htseq table to a h5mu.\n' + author = 'Robrecht Cannoodt, Angela Oliveira Pisco' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_labels.config b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_schema.json b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..bd77242c --- /dev/null +++ b/target/nextflow/mapping/htseq_count_to_h5mu/nextflow_schema.json @@ -0,0 +1,86 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "htseq_count_to_h5mu", + "description": "Convert the htseq table to a h5mu.\n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "input_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The obs index for the counts", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"foo\"]`. " + }, + "input_counts": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The counts as a TSV file as output by HTSeq.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"counts.tsv\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "The GTF file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"gencode_v41_star\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/multi_star/.config.vsh.yaml b/target/nextflow/mapping/multi_star/.config.vsh.yaml new file mode 100644 index 00000000..ae2e50be --- /dev/null +++ b/target/nextflow/mapping/multi_star/.config.vsh.yaml @@ -0,0 +1,2953 @@ +name: "multi_star" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "string" + name: "--input_id" + description: "The ID of the sample being processed. This vector should have the\ + \ same length as the `--input_r1` argument." + info: null + example: + - "mysample" + - "mysample" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r1" + description: "Paths to the sequences to be mapped. If using Illumina paired-end\ + \ reads, only the R1 files should be passed." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L002_R1_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Paths to the sequences to be mapped. If using Illumina paired-end\ + \ reads, only the R2 files should be passed." + info: null + example: + - "mysample_S1_L001_R2_001.fastq.gz" + - "mysample_S1_L002_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference_index" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir argument in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference_gtf" + description: "Path to the gtf reference file." + info: null + example: + - "genes.gtf" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ argument in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Processing arguments" + arguments: + - type: "boolean" + name: "--run_htseq_count" + description: "Whether or not to also run htseq-count after STAR." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--run_multiqc" + description: "Whether or not to also run MultiQC at the end." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_success_rate" + description: "Fail when the success rate is below this threshold." + info: null + default: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: + step: "star" + orig_arg: "--runRNGseed" + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: + step: "star" + orig_arg: "--genomeFastaFiles" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: + step: "star" + orig_arg: "--sjdbFileChrStartEnd" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: + step: "star" + orig_arg: "--sjdbGTFfile" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: + step: "star" + orig_arg: "--sjdbGTFchrPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: + step: "star" + orig_arg: "--sjdbGTFfeatureExon" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentTranscript" + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGene" + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGeneName" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: + step: "star" + orig_arg: "--sjdbGTFtagExonParentGeneType" + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: + step: "star" + orig_arg: "--sjdbOverhang" + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: + step: "star" + orig_arg: "--sjdbScore" + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: + step: "star" + orig_arg: "--sjdbInsertSave" + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: + step: "star" + orig_arg: "--varVCFfile" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: + step: "star" + orig_arg: "--readFilesType" + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: + step: "star" + orig_arg: "--readFilesSAMattrKeep" + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: + step: "star" + orig_arg: "--readFilesManifest" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: + step: "star" + orig_arg: "--readFilesPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: + step: "star" + orig_arg: "--readFilesCommand" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: + step: "star" + orig_arg: "--readMapNumber" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: + step: "star" + orig_arg: "--readMatesLengthsIn" + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: + step: "star" + orig_arg: "--readNameSeparator" + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: + step: "star" + orig_arg: "--readQualityScoreBase" + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: + step: "star" + orig_arg: "--clipAdapterType" + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: + step: "star" + orig_arg: "--clip3pAdapterSeq" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pAdapterMMp" + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip3pAfterAdapterNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: + step: "star" + orig_arg: "--clip5pNbases" + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: + step: "star" + orig_arg: "--limitGenomeGenerateRAM" + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: + step: "star" + orig_arg: "--limitIObufferSize" + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: + step: "star" + orig_arg: "--limitOutSAMoneReadBytes" + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: + step: "star" + orig_arg: "--limitOutSJoneRead" + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: + step: "star" + orig_arg: "--limitOutSJcollapsed" + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: + step: "star" + orig_arg: "--limitBAMsortRAM" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: + step: "star" + orig_arg: "--limitSjdbInsertNsj" + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: + step: "star" + orig_arg: "--limitNreadsSoft" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: + step: "star" + orig_arg: "--outTmpKeep" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: + step: "star" + orig_arg: "--outStd" + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: + step: "star" + orig_arg: "--outReadsUnmapped" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: + step: "star" + orig_arg: "--outQSconversionAdd" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: + step: "star" + orig_arg: "--outMultimapperOrder" + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: + step: "star" + orig_arg: "--outSAMmode" + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: + step: "star" + orig_arg: "--outSAMstrandField" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: + step: "star" + orig_arg: "--outSAMattributes" + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: + step: "star" + orig_arg: "--outSAMattrIHstart" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: + step: "star" + orig_arg: "--outSAMunmapped" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: + step: "star" + orig_arg: "--outSAMorder" + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: + step: "star" + orig_arg: "--outSAMprimaryFlag" + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: + step: "star" + orig_arg: "--outSAMreadID" + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: + step: "star" + orig_arg: "--outSAMmapqUnique" + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: + step: "star" + orig_arg: "--outSAMflagOR" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: + step: "star" + orig_arg: "--outSAMflagAND" + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: + step: "star" + orig_arg: "--outSAMattrRGline" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: + step: "star" + orig_arg: "--outSAMheaderHD" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: + step: "star" + orig_arg: "--outSAMheaderPG" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: + step: "star" + orig_arg: "--outSAMheaderCommentFile" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: + step: "star" + orig_arg: "--outSAMfilter" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: + step: "star" + orig_arg: "--outSAMmultNmax" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: + step: "star" + orig_arg: "--outSAMtlen" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: + step: "star" + orig_arg: "--outBAMcompression" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: + step: "star" + orig_arg: "--outBAMsortingThreadN" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: + step: "star" + orig_arg: "--outBAMsortingBinsN" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: + step: "star" + orig_arg: "--bamRemoveDuplicatesType" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: + step: "star" + orig_arg: "--bamRemoveDuplicatesMate2basesN" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: + step: "star" + orig_arg: "--outWigType" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: + step: "star" + orig_arg: "--outWigStrand" + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: + step: "star" + orig_arg: "--outWigReferencesPrefix" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: + step: "star" + orig_arg: "--outWigNorm" + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: + step: "star" + orig_arg: "--outFilterType" + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: + step: "star" + orig_arg: "--outFilterMultimapScoreRange" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: + step: "star" + orig_arg: "--outFilterMultimapNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNoverLmax" + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMismatchNoverReadLmax" + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: + step: "star" + orig_arg: "--outFilterScoreMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: + step: "star" + orig_arg: "--outFilterScoreMinOverLread" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: + step: "star" + orig_arg: "--outFilterMatchNmin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: + step: "star" + orig_arg: "--outFilterMatchNminOverLread" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: + step: "star" + orig_arg: "--outFilterIntronMotifs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: + step: "star" + orig_arg: "--outFilterIntronStrands" + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: + step: "star" + orig_arg: "--outSJtype" + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: + step: "star" + orig_arg: "--outSJfilterReads" + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterOverhangMin" + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterCountUniqueMin" + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterCountTotalMin" + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterDistToOtherSJmin" + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: + step: "star" + orig_arg: "--outSJfilterIntronMaxVsReadN" + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: + step: "star" + orig_arg: "--scoreGap" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapNoncan" + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapGCAG" + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: + step: "star" + orig_arg: "--scoreGapATAC" + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: + step: "star" + orig_arg: "--scoreGenomicLengthLog2scale" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: + step: "star" + orig_arg: "--scoreDelOpen" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: + step: "star" + orig_arg: "--scoreDelBase" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: + step: "star" + orig_arg: "--scoreInsOpen" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: + step: "star" + orig_arg: "--scoreInsBase" + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: + step: "star" + orig_arg: "--scoreStitchSJshift" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: + step: "star" + orig_arg: "--seedSearchStartLmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: + step: "star" + orig_arg: "--seedSearchStartLmaxOverLread" + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: + step: "star" + orig_arg: "--seedSearchLmax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: + step: "star" + orig_arg: "--seedMultimapNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: + step: "star" + orig_arg: "--seedPerReadNmax" + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: + step: "star" + orig_arg: "--seedPerWindowNmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: + step: "star" + orig_arg: "--seedNoneLociPerWindow" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: + step: "star" + orig_arg: "--seedSplitMin" + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: + step: "star" + orig_arg: "--seedMapMin" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: + step: "star" + orig_arg: "--alignIntronMin" + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: + step: "star" + orig_arg: "--alignIntronMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: + step: "star" + orig_arg: "--alignMatesGapMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: + step: "star" + orig_arg: "--alignSJoverhangMin" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: + step: "star" + orig_arg: "--alignSJstitchMismatchNmax" + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: + step: "star" + orig_arg: "--alignSJDBoverhangMin" + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: + step: "star" + orig_arg: "--alignSplicedMateMapLmin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: + step: "star" + orig_arg: "--alignSplicedMateMapLminOverLmate" + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: + step: "star" + orig_arg: "--alignWindowsPerReadNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: + step: "star" + orig_arg: "--alignTranscriptsPerWindowNmax" + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: + step: "star" + orig_arg: "--alignTranscriptsPerReadNmax" + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: + step: "star" + orig_arg: "--alignEndsType" + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: + step: "star" + orig_arg: "--alignEndsProtrude" + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: + step: "star" + orig_arg: "--alignSoftClipAtReferenceEnds" + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: + step: "star" + orig_arg: "--alignInsertionFlush" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: + step: "star" + orig_arg: "--peOverlapNbasesMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: + step: "star" + orig_arg: "--peOverlapMMp" + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: + step: "star" + orig_arg: "--winAnchorMultimapNmax" + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: + step: "star" + orig_arg: "--winBinNbits" + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: + step: "star" + orig_arg: "--winAnchorDistNbins" + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: + step: "star" + orig_arg: "--winFlankNbins" + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: + step: "star" + orig_arg: "--winReadCoverageRelativeMin" + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: + step: "star" + orig_arg: "--winReadCoverageBasesMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: + step: "star" + orig_arg: "--chimOutType" + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: + step: "star" + orig_arg: "--chimSegmentMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: + step: "star" + orig_arg: "--chimScoreMin" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: + step: "star" + orig_arg: "--chimScoreDropMax" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: + step: "star" + orig_arg: "--chimScoreSeparation" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: + step: "star" + orig_arg: "--chimScoreJunctionNonGTAG" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: + step: "star" + orig_arg: "--chimJunctionOverhangMin" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: + step: "star" + orig_arg: "--chimSegmentReadGapMax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: + step: "star" + orig_arg: "--chimFilter" + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: + step: "star" + orig_arg: "--chimMainSegmentMultNmax" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: + step: "star" + orig_arg: "--chimMultimapNmax" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: + step: "star" + orig_arg: "--chimMultimapScoreRange" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: + step: "star" + orig_arg: "--chimNonchimScoreDropMin" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: + step: "star" + orig_arg: "--chimOutJunctionFormat" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: + step: "star" + orig_arg: "--quantMode" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: + step: "star" + orig_arg: "--quantTranscriptomeBAMcompression" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: + step: "star" + orig_arg: "--quantTranscriptomeBan" + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: + step: "star" + orig_arg: "--twopassMode" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: + step: "star" + orig_arg: "--twopass1readsN" + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: + step: "star" + orig_arg: "--waspOutputMode" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: + step: "star" + orig_arg: "--soloType" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: + step: "star" + orig_arg: "--soloCBwhitelist" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: + step: "star" + orig_arg: "--soloCBstart" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: + step: "star" + orig_arg: "--soloCBlen" + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: + step: "star" + orig_arg: "--soloUMIstart" + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: + step: "star" + orig_arg: "--soloUMIlen" + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: + step: "star" + orig_arg: "--soloBarcodeReadLength" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: + step: "star" + orig_arg: "--soloBarcodeMate" + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: + step: "star" + orig_arg: "--soloCBposition" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: + step: "star" + orig_arg: "--soloUMIposition" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: + step: "star" + orig_arg: "--soloAdapterSequence" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: + step: "star" + orig_arg: "--soloAdapterMismatchesNmax" + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: + step: "star" + orig_arg: "--soloCBmatchWLtype" + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: + step: "star" + orig_arg: "--soloInputSAMattrBarcodeSeq" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: + step: "star" + orig_arg: "--soloInputSAMattrBarcodeQual" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: + step: "star" + orig_arg: "--soloStrand" + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: + step: "star" + orig_arg: "--soloFeatures" + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: + step: "star" + orig_arg: "--soloMultiMappers" + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: + step: "star" + orig_arg: "--soloUMIdedup" + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: + step: "star" + orig_arg: "--soloUMIfiltering" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: + step: "star" + orig_arg: "--soloOutFileNames" + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: + step: "star" + orig_arg: "--soloCellFilter" + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: + step: "star" + orig_arg: "--soloOutFormatFeaturesGeneField3" + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: + step: "star" + orig_arg: "--soloCellReadStats" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HTSeq arguments" + arguments: + - type: "string" + name: "--stranded" + alternatives: + - "-s" + description: "Whether the data is from a strand-specific assay. 'reverse' means\ + \ 'yes' with reversed strand interpretation." + info: + step: "htseq" + orig_arg: "--stranded" + default: + - "yes" + required: false + choices: + - "yes" + - "no" + - "reverse" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--minimum_alignment_quality" + alternatives: + - "-a" + - "--minaqual" + description: "Skip all reads with MAPQ alignment quality lower than the given\ + \ minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends\ + \ on the software \nused to map the reads.\n" + info: + step: "htseq" + orig_arg: "--minaqual" + default: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--type" + alternatives: + - "-t" + description: "Feature type (3rd column in GTF file) to be used, all features of\ + \ other type are ignored (default, suitable for Ensembl GTF files: exon)" + info: + step: "htseq" + orig_arg: "--type" + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--id_attribute" + alternatives: + - "-i" + description: "GTF attribute to be used as feature ID (default, suitable for Ensembl\ + \ GTF files: gene_id).\nAll feature of the right type (see -t option) within\ + \ the same GTF attribute will be added\ntogether. The typical way of using this\ + \ option is to count all exonic reads from each gene\nand add the exons but\ + \ other uses are possible as well. You can call this option multiple\ntimes:\ + \ in that case, the combination of all attributes separated by colons (:) will\ + \ be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i\ + \ exon_number.\n" + info: + step: "htseq" + orig_arg: "--idattr" + example: + - "gene_id" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--additional_attributes" + description: "Additional feature attributes (suitable for Ensembl GTF files: gene_name).\ + \ Use multiple times\nfor more than one additional attribute. These attributes\ + \ are only used as annotations in the\noutput, while the determination of how\ + \ the counts are added together is done based on option -i.\n" + info: + step: "htseq" + orig_arg: "--additional-attr" + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--add_chromosome_info" + description: "Store information about the chromosome of each feature as an additional\ + \ attribute\n(e.g. colunm in the TSV output file).\n" + info: + step: "htseq" + orig_arg: "--add-chromosome-info" + direction: "input" + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "Mode to handle reads overlapping more than one feature." + info: + step: "htseq" + orig_arg: "--mode" + default: + - "union" + required: false + choices: + - "union" + - "intersection-strict" + - "intersection-nonempty" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_unique" + description: "Whether and how to score reads that are not uniquely aligned or\ + \ ambiguously assigned to features." + info: + step: "htseq" + orig_arg: "--nonunique" + default: + - "none" + required: false + choices: + - "none" + - "all" + - "fraction" + - "random" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--secondary_alignments" + description: "Whether to score secondary alignments (0x100 flag)." + info: + step: "htseq" + orig_arg: "--secondary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--supplementary_alignments" + description: "Whether to score supplementary alignments (0x800 flag)." + info: + step: "htseq" + orig_arg: "--supplementary-alignments" + required: false + choices: + - "score" + - "ignore" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--counts_output_sparse" + description: "Store the counts as a sparse matrix (mtx, h5ad, loom)." + info: + step: "htseq" + orig_arg: "--counts-output-sparse" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + - type: "apt" + packages: + - "samtools" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + - "HTSeq" + - "multiprocess" + - "gtfparse" + - "pandas" + - "numpy<2" + - "multiqc~=1.15.0" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "pytest" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/multi_star/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/multi_star" + executable: "target/nextflow/mapping/multi_star/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/multi_star/main.nf b/target/nextflow/mapping/multi_star/main.nf new file mode 100644 index 00000000..edac824d --- /dev/null +++ b/target/nextflow/mapping/multi_star/main.nf @@ -0,0 +1,7676 @@ +// multi_star main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "multi_star", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input/Output", + "arguments" : [ + { + "type" : "string", + "name" : "--input_id", + "description" : "The ID of the sample being processed. This vector should have the same length as the `--input_r1` argument.", + "example" : [ + "mysample", + "mysample" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r1", + "description" : "Paths to the sequences to be mapped. If using Illumina paired-end reads, only the R1 files should be passed.", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L002_R1_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "Paths to the sequences to be mapped. If using Illumina paired-end reads, only the R2 files should be passed.", + "example" : [ + "mysample_S1_L001_R2_001.fastq.gz", + "mysample_S1_L002_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference_index", + "alternatives" : [ + "--genomeDir" + ], + "description" : "Path to the reference built by star_build_reference. Corresponds to the --genomeDir argument in the STAR command.", + "example" : [ + "/path/to/reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference_gtf", + "description" : "Path to the gtf reference file.", + "example" : [ + "genes.gtf" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "--outFileNamePrefix" + ], + "description" : "Path to output directory. Corresponds to the --outFileNamePrefix argument in the STAR command.", + "example" : [ + "/path/to/foo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Processing arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--run_htseq_count", + "description" : "Whether or not to also run htseq-count after STAR.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--run_multiqc", + "description" : "Whether or not to also run MultiQC at the end.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_success_rate", + "description" : "Fail when the success rate is below this threshold.", + "default" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Run Parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--runRNGseed", + "description" : "random number generator seed.", + "info" : { + "step" : "star", + "orig_arg" : "--runRNGseed" + }, + "example" : [ + 777 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Genome Parameters", + "arguments" : [ + { + "type" : "file", + "name" : "--genomeFastaFiles", + "description" : "path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.\n\nRequired for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).", + "info" : { + "step" : "star", + "orig_arg" : "--genomeFastaFiles" + }, + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Splice Junctions Database", + "arguments" : [ + { + "type" : "string", + "name" : "--sjdbFileChrStartEnd", + "description" : "path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated.", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbFileChrStartEnd" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sjdbGTFfile", + "description" : "path to the GTF file with annotations", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFfile" + }, + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFchrPrefix", + "description" : "prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes)", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFchrPrefix" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFfeatureExon", + "description" : "feature type in GTF file to be used as exons for building transcripts", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFfeatureExon" + }, + "example" : [ + "exon" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentTranscript", + "description" : "GTF attribute name for parent transcript ID (default \\"transcript_id\\" works for GTF files)", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFtagExonParentTranscript" + }, + "example" : [ + "transcript_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGene", + "description" : "GTF attribute name for parent gene ID (default \\"gene_id\\" works for GTF files)", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFtagExonParentGene" + }, + "example" : [ + "gene_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneName", + "description" : "GTF attribute name for parent gene name", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFtagExonParentGeneName" + }, + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneType", + "description" : "GTF attribute name for parent gene type", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbGTFtagExonParentGeneType" + }, + "example" : [ + "gene_type", + "gene_biotype" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbOverhang", + "description" : "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbOverhang" + }, + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbScore", + "description" : "extra alignment score for alignments that cross database junctions", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbScore" + }, + "example" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbInsertSave", + "description" : "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ... only small junction / transcript files\n- All ... all files including big Genome, SA and SAindex - this will create a complete genome directory", + "info" : { + "step" : "star", + "orig_arg" : "--sjdbInsertSave" + }, + "example" : [ + "Basic" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Variation parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--varVCFfile", + "description" : "path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1", + "info" : { + "step" : "star", + "orig_arg" : "--varVCFfile" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--readFilesType", + "description" : "format of input read files\n\n- Fastx ... FASTA or FASTQ\n- SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view", + "info" : { + "step" : "star", + "orig_arg" : "--readFilesType" + }, + "example" : [ + "Fastx" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesSAMattrKeep", + "description" : "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n- None ... do not keep any tags", + "info" : { + "step" : "star", + "orig_arg" : "--readFilesSAMattrKeep" + }, + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--readFilesManifest", + "description" : "path to the \\"manifest\\" file with the names of read files. The manifest file should contain 3 tab-separated columns:\n\npaired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads: read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but not tabs are allowed in file names.\nIf read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.", + "info" : { + "step" : "star", + "orig_arg" : "--readFilesManifest" + }, + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesPrefix", + "description" : "prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn", + "info" : { + "step" : "star", + "orig_arg" : "--readFilesPrefix" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesCommand", + "description" : "command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout\n\nFor example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.", + "info" : { + "step" : "star", + "orig_arg" : "--readFilesCommand" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readMapNumber", + "description" : "number of reads to map from the beginning of the file\n\n-1: map all reads", + "info" : { + "step" : "star", + "orig_arg" : "--readMapNumber" + }, + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readMatesLengthsIn", + "description" : "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations.", + "info" : { + "step" : "star", + "orig_arg" : "--readMatesLengthsIn" + }, + "example" : [ + "NotEqual" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readNameSeparator", + "description" : "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "info" : { + "step" : "star", + "orig_arg" : "--readNameSeparator" + }, + "example" : [ + "/" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readQualityScoreBase", + "description" : "number to be subtracted from the ASCII code to get Phred quality score", + "info" : { + "step" : "star", + "orig_arg" : "--readQualityScoreBase" + }, + "example" : [ + 33 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Clipping", + "arguments" : [ + { + "type" : "string", + "name" : "--clipAdapterType", + "description" : "adapter clipping type\n\n- Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n- CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ... no adapter clipping, all other clip* parameters are disregarded", + "info" : { + "step" : "star", + "orig_arg" : "--clipAdapterType" + }, + "example" : [ + "Hamming" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pNbases", + "description" : "number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.", + "info" : { + "step" : "star", + "orig_arg" : "--clip3pNbases" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--clip3pAdapterSeq", + "description" : "adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence with the length equal to read length", + "info" : { + "step" : "star", + "orig_arg" : "--clip3pAdapterSeq" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--clip3pAdapterMMp", + "description" : "max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates.", + "info" : { + "step" : "star", + "orig_arg" : "--clip3pAdapterMMp" + }, + "example" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pAfterAdapterNbases", + "description" : "number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.", + "info" : { + "step" : "star", + "orig_arg" : "--clip3pAfterAdapterNbases" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip5pNbases", + "description" : "number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.", + "info" : { + "step" : "star", + "orig_arg" : "--clip5pNbases" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Limits", + "arguments" : [ + { + "type" : "long", + "name" : "--limitGenomeGenerateRAM", + "description" : "maximum available RAM (bytes) for genome generation", + "info" : { + "step" : "star", + "orig_arg" : "--limitGenomeGenerateRAM" + }, + "example" : [ + 31000000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitIObufferSize", + "description" : "max available buffers size (bytes) for input/output, per thread", + "info" : { + "step" : "star", + "orig_arg" : "--limitIObufferSize" + }, + "example" : [ + 30000000, + 50000000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitOutSAMoneReadBytes", + "description" : "max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax", + "info" : { + "step" : "star", + "orig_arg" : "--limitOutSAMoneReadBytes" + }, + "example" : [ + 100000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJoneRead", + "description" : "max number of junctions for one read (including all multi-mappers)", + "info" : { + "step" : "star", + "orig_arg" : "--limitOutSJoneRead" + }, + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJcollapsed", + "description" : "max number of collapsed junctions", + "info" : { + "step" : "star", + "orig_arg" : "--limitOutSJcollapsed" + }, + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitBAMsortRAM", + "description" : "maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option.", + "info" : { + "step" : "star", + "orig_arg" : "--limitBAMsortRAM" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitSjdbInsertNsj", + "description" : "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "info" : { + "step" : "star", + "orig_arg" : "--limitSjdbInsertNsj" + }, + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitNreadsSoft", + "description" : "soft limit on the number of reads", + "info" : { + "step" : "star", + "orig_arg" : "--limitNreadsSoft" + }, + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: general", + "arguments" : [ + { + "type" : "string", + "name" : "--outTmpKeep", + "description" : "whether to keep the temporary files after STAR runs is finished\n\n- None ... remove all temporary files\n- All ... keep all files", + "info" : { + "step" : "star", + "orig_arg" : "--outTmpKeep" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outStd", + "description" : "which output will be directed to stdout (standard out)\n\n- Log ... log messages\n- SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out\n- BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM", + "info" : { + "step" : "star", + "orig_arg" : "--outStd" + }, + "example" : [ + "Log" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outReadsUnmapped", + "description" : "output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).\n\n- None ... no output\n- Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2", + "info" : { + "step" : "star", + "orig_arg" : "--outReadsUnmapped" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outQSconversionAdd", + "description" : "add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)", + "info" : { + "step" : "star", + "orig_arg" : "--outQSconversionAdd" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outMultimapperOrder", + "description" : "order of multimapping alignments in the output files\n\n- Old_2.4 ... quasi-random order used before 2.5.0\n- Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.", + "info" : { + "step" : "star", + "orig_arg" : "--outMultimapperOrder" + }, + "example" : [ + "Old_2.4" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: SAM and BAM", + "arguments" : [ + { + "type" : "string", + "name" : "--outSAMmode", + "description" : "mode of SAM output\n\n- None ... no SAM output\n- Full ... full SAM output\n- NoQS ... full SAM but without quality scores", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMmode" + }, + "example" : [ + "Full" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMstrandField", + "description" : "Cufflinks-like strand field flag\n\n- None ... not used\n- intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMstrandField" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattributes", + "description" : "a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n- None ... no attributes\n- Standard ... NH HI AS nM\n- All ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.\n- AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.\n- nM ... number of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.\n- MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.\n- jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.\n- jI ... start and end of introns for all junctions (1-based).\n- XS ... alignment strand according to --outSAMstrandField.\n- MC ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.\n- cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n- vA ... variant allele\n- vG ... genomic coordinate of the variant overlapped by the read.\n- vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .\n- rB ... alignment block read/genomic coordinates.\n- vR ... read coordinate of the variant.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMattributes" + }, + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMattrIHstart", + "description" : "start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMattrIHstart" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMunmapped", + "description" : "output of unmapped reads in the SAM format\n\n1st word:\n- None ... no output\n- Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMunmapped" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMorder", + "description" : "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMorder" + }, + "example" : [ + "Paired" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMprimaryFlag", + "description" : "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the best score is primary\n- AllBestScore ... all alignments with the best score are primary", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMprimaryFlag" + }, + "example" : [ + "OneBestScore" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMreadID", + "description" : "read ID record type\n\n- Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number (index) in the FASTx file", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMreadID" + }, + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmapqUnique", + "description" : "0 to 255: the MAPQ value for unique mappers", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMmapqUnique" + }, + "example" : [ + 255 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagOR", + "description" : "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMflagOR" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagAND", + "description" : "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMflagAND" + }, + "example" : [ + 65535 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattrRGline", + "description" : "SAM/BAM read group line. The first word contains the read group identifier and must start with \\"ID:\\", e.g. --outSAMattrRGline ID:xxx CN:yy \\"DS:z z z\\".\n\nxxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.\nComma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz \\"DS:z z\\" , ID:yyy DS:yyyy", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMattrRGline" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderHD", + "description" : "@HD (header) line of the SAM header", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMheaderHD" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderPG", + "description" : "extra @PG (software) line of the SAM header (in addition to STAR)", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMheaderPG" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderCommentFile", + "description" : "path to the file with @CO (comment) lines of the SAM header", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMheaderCommentFile" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMfilter", + "description" : "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMfilter" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmultNmax", + "description" : "max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax) will be output", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMmultNmax" + }, + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMtlen", + "description" : "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends", + "info" : { + "step" : "star", + "orig_arg" : "--outSAMtlen" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMcompression", + "description" : "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "info" : { + "step" : "star", + "orig_arg" : "--outBAMcompression" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingThreadN", + "description" : ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).", + "info" : { + "step" : "star", + "orig_arg" : "--outBAMsortingThreadN" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingBinsN", + "description" : ">0: number of genome bins for coordinate-sorting", + "info" : { + "step" : "star", + "orig_arg" : "--outBAMsortingBinsN" + }, + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "BAM processing", + "arguments" : [ + { + "type" : "string", + "name" : "--bamRemoveDuplicatesType", + "description" : "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ... no duplicate removal/marking\n- UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers.", + "info" : { + "step" : "star", + "orig_arg" : "--bamRemoveDuplicatesType" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--bamRemoveDuplicatesMate2basesN", + "description" : "number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)", + "info" : { + "step" : "star", + "orig_arg" : "--bamRemoveDuplicatesMate2basesN" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Wiggle", + "arguments" : [ + { + "type" : "string", + "name" : "--outWigType", + "description" : "type of signal output, e.g. \\"bedGraph\\" OR \\"bedGraph read1_5p\\". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n- None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only 2nd read", + "info" : { + "step" : "star", + "orig_arg" : "--outWigType" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigStrand", + "description" : "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate strands, str1 and str2\n- Unstranded ... collapsed strands", + "info" : { + "step" : "star", + "orig_arg" : "--outWigStrand" + }, + "example" : [ + "Stranded" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigReferencesPrefix", + "description" : "prefix matching reference names to include in the output wiggle file, e.g. \\"chr\\", default \\"-\\" - include all references", + "info" : { + "step" : "star", + "orig_arg" : "--outWigReferencesPrefix" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigNorm", + "description" : "type of normalization for the signal\n\n- RPM ... reads per million of mapped reads\n- None ... no normalization, \\"raw\\" counts", + "info" : { + "step" : "star", + "orig_arg" : "--outWigNorm" + }, + "example" : [ + "RPM" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering", + "arguments" : [ + { + "type" : "string", + "name" : "--outFilterType", + "description" : "type of filtering\n\n- Normal ... standard filtering using only current alignment\n- BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterType" + }, + "example" : [ + "Normal" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapScoreRange", + "description" : "the score range below the maximum score for multimapping alignments", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMultimapScoreRange" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapNmax", + "description" : "maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.\n\nOtherwise no alignments will be output, and the read will be counted as \\"mapped to too many loci\\" in the Log.final.out .", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMultimapNmax" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMismatchNmax", + "description" : "alignment will be output only if it has no more mismatches than this value.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMismatchNmax" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverLmax", + "description" : "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMismatchNoverLmax" + }, + "example" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverReadLmax", + "description" : "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMismatchNoverReadLmax" + }, + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterScoreMin", + "description" : "alignment will be output only if its score is higher than or equal to this value.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterScoreMin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterScoreMinOverLread", + "description" : "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterScoreMinOverLread" + }, + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMatchNmin", + "description" : "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMatchNmin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMatchNminOverLread", + "description" : "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterMatchNminOverLread" + }, + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronMotifs", + "description" : "filter alignment using their motifs\n\n- None ... no filtering\n- RemoveNoncanonical ... filter out alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterIntronMotifs" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronStrands", + "description" : "filter alignments\n\n- RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands\n- None ... no filtering", + "info" : { + "step" : "star", + "orig_arg" : "--outFilterIntronStrands" + }, + "example" : [ + "RemoveInconsistentStrands" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output splice junctions (SJ.out.tab)", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJtype", + "description" : "type of splice junction output\n\n- Standard ... standard SJ.out.tab output\n- None ... no splice junction output", + "info" : { + "step" : "star", + "orig_arg" : "--outSJtype" + }, + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering: Splice Junctions", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJfilterReads", + "description" : "which reads to consider for collapsed splice junctions output\n\n- All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping reads only", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterReads" + }, + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterOverhangMin", + "description" : "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply to annotated junctions", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterOverhangMin" + }, + "example" : [ + 30, + 12, + 12, + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountUniqueMin", + "description" : "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterCountUniqueMin" + }, + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountTotalMin", + "description" : "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterCountTotalMin" + }, + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterDistToOtherSJmin", + "description" : "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterDistToOtherSJmin" + }, + "example" : [ + 10, + 0, + 5, + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterIntronMaxVsReadN", + "description" : "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\ndoes not apply to annotated junctions", + "info" : { + "step" : "star", + "orig_arg" : "--outSJfilterIntronMaxVsReadN" + }, + "example" : [ + 50000, + 100000, + 200000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Scoring", + "arguments" : [ + { + "type" : "integer", + "name" : "--scoreGap", + "description" : "splice junction penalty (independent on intron motif)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreGap" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapNoncan", + "description" : "non-canonical junction penalty (in addition to scoreGap)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreGapNoncan" + }, + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapGCAG", + "description" : "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreGapGCAG" + }, + "example" : [ + -4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapATAC", + "description" : "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreGapATAC" + }, + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGenomicLengthLog2scale", + "description" : "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreGenomicLengthLog2scale" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelOpen", + "description" : "deletion open penalty", + "info" : { + "step" : "star", + "orig_arg" : "--scoreDelOpen" + }, + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelBase", + "description" : "deletion extension penalty per base (in addition to scoreDelOpen)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreDelBase" + }, + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsOpen", + "description" : "insertion open penalty", + "info" : { + "step" : "star", + "orig_arg" : "--scoreInsOpen" + }, + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsBase", + "description" : "insertion extension penalty per base (in addition to scoreInsOpen)", + "info" : { + "step" : "star", + "orig_arg" : "--scoreInsBase" + }, + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreStitchSJshift", + "description" : "maximum score reduction while searching for SJ boundaries in the stitching step", + "info" : { + "step" : "star", + "orig_arg" : "--scoreStitchSJshift" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Alignments and Seeding", + "arguments" : [ + { + "type" : "integer", + "name" : "--seedSearchStartLmax", + "description" : "defines the search start point through the read - the read is split into pieces no longer than this value", + "info" : { + "step" : "star", + "orig_arg" : "--seedSearchStartLmax" + }, + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + ''' + ''' }, + { + "type" : "double", + "name" : "--seedSearchStartLmaxOverLread", + "description" : "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "info" : { + "step" : "star", + "orig_arg" : "--seedSearchStartLmaxOverLread" + }, + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSearchLmax", + "description" : "defines the maximum length of the seeds, if =0 seed length is not limited", + "info" : { + "step" : "star", + "orig_arg" : "--seedSearchLmax" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMultimapNmax", + "description" : "only pieces that map fewer than this value are utilized in the stitching procedure", + "info" : { + "step" : "star", + "orig_arg" : "--seedMultimapNmax" + }, + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerReadNmax", + "description" : "max number of seeds per read", + "info" : { + "step" : "star", + "orig_arg" : "--seedPerReadNmax" + }, + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerWindowNmax", + "description" : "max number of seeds per window", + "info" : { + "step" : "star", + "orig_arg" : "--seedPerWindowNmax" + }, + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedNoneLociPerWindow", + "description" : "max number of one seed loci per window", + "info" : { + "step" : "star", + "orig_arg" : "--seedNoneLociPerWindow" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSplitMin", + "description" : "min length of the seed sequences split by Ns or mate gap", + "info" : { + "step" : "star", + "orig_arg" : "--seedSplitMin" + }, + "example" : [ + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMapMin", + "description" : "min length of seeds to be mapped", + "info" : { + "step" : "star", + "orig_arg" : "--seedMapMin" + }, + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMin", + "description" : "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "info" : { + "step" : "star", + "orig_arg" : "--alignIntronMin" + }, + "example" : [ + 21 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMax", + "description" : "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "info" : { + "step" : "star", + "orig_arg" : "--alignIntronMax" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignMatesGapMax", + "description" : "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "info" : { + "step" : "star", + "orig_arg" : "--alignMatesGapMax" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJoverhangMin", + "description" : "minimum overhang (i.e. block size) for spliced alignments", + "info" : { + "step" : "star", + "orig_arg" : "--alignSJoverhangMin" + }, + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJstitchMismatchNmax", + "description" : "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "info" : { + "step" : "star", + "orig_arg" : "--alignSJstitchMismatchNmax" + }, + "example" : [ + 0, + -1, + 0, + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJDBoverhangMin", + "description" : "minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments", + "info" : { + "step" : "star", + "orig_arg" : "--alignSJDBoverhangMin" + }, + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSplicedMateMapLmin", + "description" : "minimum mapped length for a read mate that is spliced", + "info" : { + "step" : "star", + "orig_arg" : "--alignSplicedMateMapLmin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alignSplicedMateMapLminOverLmate", + "description" : "alignSplicedMateMapLmin normalized to mate length", + "info" : { + "step" : "star", + "orig_arg" : "--alignSplicedMateMapLminOverLmate" + }, + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignWindowsPerReadNmax", + "description" : "max number of windows per read", + "info" : { + "step" : "star", + "orig_arg" : "--alignWindowsPerReadNmax" + }, + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerWindowNmax", + "description" : "max number of transcripts per window", + "info" : { + "step" : "star", + "orig_arg" : "--alignTranscriptsPerWindowNmax" + }, + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerReadNmax", + "description" : "max number of different alignments per read to consider", + "info" : { + "step" : "star", + "orig_arg" : "--alignTranscriptsPerReadNmax" + }, + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsType", + "description" : "type of read ends alignment\n\n- Local ... standard local alignment with soft-clipping allowed\n- EndToEnd ... force end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment", + "info" : { + "step" : "star", + "orig_arg" : "--alignEndsType" + }, + "example" : [ + "Local" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsProtrude", + "description" : "allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum number of protrusion bases allowed\n2nd word: string:\n- ConcordantPair ... report alignments with non-zero protrusion as concordant pairs\n- DiscordantPair ... report alignments with non-zero protrusion as discordant pairs", + "info" : { + "step" : "star", + "orig_arg" : "--alignEndsProtrude" + }, + "example" : [ + "0 ConcordantPair" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignSoftClipAtReferenceEnds", + "description" : "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks", + "info" : { + "step" : "star", + "orig_arg" : "--alignSoftClipAtReferenceEnds" + }, + "example" : [ + "Yes" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignInsertionFlush", + "description" : "how to flush ambiguous insertion positions\n\n- None ... insertions are not flushed\n- Right ... insertions are flushed to the right", + "info" : { + "step" : "star", + "orig_arg" : "--alignInsertionFlush" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Paired-End reads", + "arguments" : [ + { + "type" : "integer", + "name" : "--peOverlapNbasesMin", + "description" : "minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the \\"merginf of overlapping mates\\" algorithm.", + "info" : { + "step" : "star", + "orig_arg" : "--peOverlapNbasesMin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--peOverlapMMp", + "description" : "maximum proportion of mismatched bases in the overlap area", + "info" : { + "step" : "star", + "orig_arg" : "--peOverlapMMp" + }, + "example" : [ + 0.01 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Windows, Anchors, Binning", + "arguments" : [ + { + "type" : "integer", + "name" : "--winAnchorMultimapNmax", + "description" : "max number of loci anchors are allowed to map to", + "info" : { + "step" : "star", + "orig_arg" : "--winAnchorMultimapNmax" + }, + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winBinNbits", + "description" : "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "info" : { + "step" : "star", + "orig_arg" : "--winBinNbits" + }, + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winAnchorDistNbins", + "description" : "max number of bins between two anchors that allows aggregation of anchors into one window", + "info" : { + "step" : "star", + "orig_arg" : "--winAnchorDistNbins" + }, + "example" : [ + 9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winFlankNbins", + "description" : "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "info" : { + "step" : "star", + "orig_arg" : "--winFlankNbins" + }, + "example" : [ + 4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--winReadCoverageRelativeMin", + "description" : "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "info" : { + "step" : "star", + "orig_arg" : "--winReadCoverageRelativeMin" + }, + "example" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winReadCoverageBasesMin", + "description" : "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "info" : { + "step" : "star", + "orig_arg" : "--winReadCoverageBasesMin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Chimeric Alignments", + "arguments" : [ + { + "type" : "string", + "name" : "--chimOutType", + "description" : "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n- SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n- WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n- WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments", + "info" : { + "step" : "star", + "orig_arg" : "--chimOutType" + }, + "example" : [ + "Junctions" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentMin", + "description" : "minimum length of chimeric segment length, if ==0, no chimeric output", + "info" : { + "step" : "star", + "orig_arg" : "--chimSegmentMin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreMin", + "description" : "minimum total (summed) score of the chimeric segments", + "info" : { + "step" : "star", + "orig_arg" : "--chimScoreMin" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreDropMax", + "description" : "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "info" : { + "step" : "star", + "orig_arg" : "--chimScoreDropMax" + }, + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreSeparation", + "description" : "minimum difference (separation) between the best chimeric score and the next one", + "info" : { + "step" : "star", + "orig_arg" : "--chimScoreSeparation" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreJunctionNonGTAG", + "description" : "penalty for a non-GT/AG chimeric junction", + "info" : { + "step" : "star", + "orig_arg" : "--chimScoreJunctionNonGTAG" + }, + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimJunctionOverhangMin", + "description" : "minimum overhang for a chimeric junction", + "info" : { + "step" : "star", + "orig_arg" : "--chimJunctionOverhangMin" + }, + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentReadGapMax", + "description" : "maximum gap in the read sequence between chimeric segments", + "info" : { + "step" : "star", + "orig_arg" : "--chimSegmentReadGapMax" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--chimFilter", + "description" : "different filters for chimeric alignments\n\n- None ... no filtering\n- banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction", + "info" : { + "step" : "star", + "orig_arg" : "--chimFilter" + }, + "example" : [ + "banGenomicN" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMainSegmentMultNmax", + "description" : "maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments.", + "info" : { + "step" : "star", + "orig_arg" : "--chimMainSegmentMultNmax" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapNmax", + "description" : "maximum number of chimeric multi-alignments\n\n- 0 ... use the old scheme for chimeric detection which only considered unique alignments", + "info" : { + "step" : "star", + "orig_arg" : "--chimMultimapNmax" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapScoreRange", + "description" : "the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1", + "info" : { + "step" : "star", + "orig_arg" : "--chimMultimapScoreRange" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimNonchimScoreDropMin", + "description" : "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "info" : { + "step" : "star", + "orig_arg" : "--chimNonchimScoreDropMin" + }, + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimOutJunctionFormat", + "description" : "formatting type for the Chimeric.out.junction file\n\n- 0 ... no comment lines/headers\n- 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping", + "info" : { + "step" : "star", + "orig_arg" : "--chimOutJunctionFormat" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Quantification of Annotations", + "arguments" : [ + { + "type" : "string", + "name" : "--quantMode", + "description" : "types of quantification requested\n\n- - ... none\n- TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file\n- GeneCounts ... count reads per gene", + "info" : { + "step" : "star", + "orig_arg" : "--quantMode" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--quantTranscriptomeBAMcompression", + "description" : "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10 ... maximum compression", + "info" : { + "step" : "star", + "orig_arg" : "--quantTranscriptomeBAMcompression" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--quantTranscriptomeBan", + "description" : "prohibit various alignment type\n\n- IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM\n- Singleend ... prohibit single-end alignments", + "info" : { + "step" : "star", + "orig_arg" : "--quantTranscriptomeBan" + }, + "example" : [ + "IndelSoftclipSingleend" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "2-pass Mapping", + "arguments" : [ + { + "type" : "string", + "name" : "--twopassMode", + "description" : "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly", + "info" : { + "step" : "star", + "orig_arg" : "--twopassMode" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--twopass1readsN", + "description" : "number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step.", + "info" : { + "step" : "star", + "orig_arg" : "--twopass1readsN" + }, + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "WASP parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--waspOutputMode", + "description" : "WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\n- SAMtag ... add WASP tags to the alignments that pass WASP filtering", + "info" : { + "step" : "star", + "orig_arg" : "--waspOutputMode" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "STARsolo (single cell RNA-seq) parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--soloType", + "description" : "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)", + "info" : { + "step" : "star", + "orig_arg" : "--soloType" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBwhitelist", + "description" : "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file.\n\n- None ... no whitelist: all cell barcodes are allowed", + "info" : { + "step" : "star", + "orig_arg" : "--soloCBwhitelist" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBstart", + "description" : "cell barcode start base", + "info" : { + "step" : "star", + "orig_arg" : "--soloCBstart" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBlen", + "description" : "cell barcode length", + "info" : { + "step" : "star", + "orig_arg" : "--soloCBlen" + }, + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIstart", + "description" : "UMI start base", + "info" : { + "step" : "star", + "orig_arg" : "--soloUMIstart" + }, + "example" : [ + 17 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIlen", + "description" : "UMI length", + "info" : { + "step" : "star", + "orig_arg" : "--soloUMIlen" + }, + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeReadLength", + "description" : "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n- 0 ... not defined, do not check", + "info" : { + "step" : "star", + "orig_arg" : "--soloBarcodeReadLength" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeMate", + "description" : "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part of mate 1\n- 2 ... barcode sequence is a part of mate 2", + "info" : { + "step" : "star", + "orig_arg" : "--soloBarcodeMate" + }, + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBposition", + "description" : "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8", + "info" : { + "step" : "star", + "orig_arg" : "--soloCBposition" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIposition", + "description" : "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 3_9_3_14", + "info" : { + "step" : "star", + "orig_arg" : "--soloUMIposition" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloAdapterSequence", + "description" : "adapter sequence to anchor barcodes. Only one adapter sequence is allowed.", + "info" : { + "step" : "star", + "orig_arg" : "--soloAdapterSequence" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloAdapterMismatchesNmax", + "description" : "maximum number of mismatches allowed in adapter sequence.", + "info" : { + "step" : "star", + "orig_arg" : "--soloAdapterMismatchesNmax" + }, + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBmatchWLtype", + "description" : "matching the Cell Barcodes to the WhiteList\n\n- Exact ... only exact matches allowed\n- 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match.\n- 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches.\nAllowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline.", + "info" : { + "step" : "star", + "orig_arg" : "--soloCBmatchWLtype" + }, + "example" : [ + "1MM_multi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeSeq", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "info" : { + "step" : "star", + "orig_arg" : "--soloInputSAMattrBarcodeSeq" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeQual", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "info" : { + "step" : "star", + "orig_arg" : "--soloInputSAMattrBarcodeQual" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloStrand", + "description" : "strandedness of the solo libraries:\n\n- Unstranded ... no strand information\n- Forward ... read strand same as the original RNA molecule\n- Reverse ... read strand opposite to the original RNA molecule", + "info" : { + "step" : "star", + "orig_arg" : "--soloStrand" + }, + "example" : [ + "Forward" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloFeatures", + "description" : "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ... genes: reads match the gene transcript\n- SJ ... splice junctions: reported in SJ.out.tab\n- GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n- GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.", + "info" : { + "step" : "star", + "orig_arg" : "--soloFeatures" + }, + "example" : [ + "Gene" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloMultiMappers", + "description" : "counting method for reads mapping to multiple genes\n\n- Unique ... count only reads that map to unique genes\n- Uniform ... uniformly distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not.\n- EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm", + "info" : { + "step" : "star", + "orig_arg" : "--soloMultiMappers" + }, + "example" : [ + "Unique" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIdedup", + "description" : "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows the \\"directional\\" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs\n- Exact ... only exactly matching UMIs are collapsed.\n- NoDedup ... no deduplication of UMIs, count all reads.\n- 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing.", + "info" : { + "step" : "star", + "orig_arg" : "--soloUMIdedup" + }, + "example" : [ + "1MM_All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIfiltering", + "description" : "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR", + "info" : { + "step" : "star", + "orig_arg" : "--soloUMIfiltering" + }, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFileNames", + "description" : "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "info" : { + "step" : "star", + "orig_arg" : "--soloOutFileNames" + }, + "example" : [ + "Solo.out/", + "features.tsv", + "barcodes.tsv", + "matrix.mtx" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellFilter", + "description" : "cell filtering type and parameters\n\n- None ... do not output filtered cells\n- TopCells ... only report top cells by UMI count, followed by the exact number of cells\n- CellRanger2.2 ... simple filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\nCan be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN\nThe harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000", + "info" : { + "step" : "star", + "orig_arg" : "--soloCellFilter" + }, + "example" : [ + "CellRanger2.2", + "3000", + "0.99", + "10" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFormatFeaturesGeneField3", + "description" : "field 3 in the Gene features.tsv file. If \\"-\\", then no 3rd field is output.", + "info" : { + "step" : "star", + "orig_arg" : "--soloOutFormatFeaturesGeneField3" + }, + "example" : [ + "Gene Expression" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellReadStats", + "description" : "Output reads statistics for each CB\n\n- Standard ... standard output", + "info" : { + "step" : "star", + "orig_arg" : "--soloCellReadStats" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "HTSeq arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--stranded", + "alternatives" : [ + "-s" + ], + "description" : "Whether the data is from a strand-specific assay. 'reverse' means 'yes' with reversed strand interpretation.", + "info" : { + "step" : "htseq", + "orig_arg" : "--stranded" + }, + "default" : [ + "yes" + ], + "required" : false, + "choices" : [ + "yes", + "no", + "reverse" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--minimum_alignment_quality", + "alternatives" : [ + "-a", + "--minaqual" + ], + "description" : "Skip all reads with MAPQ alignment quality lower than the given minimum value. \nMAPQ is the 5th column of a SAM/BAM file and its usage depends on the software \nused to map the reads.\n", + "info" : { + "step" : "htseq", + "orig_arg" : "--minaqual" + }, + "default" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--type", + "alternatives" : [ + "-t" + ], + "description" : "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)", + "info" : { + "step" : "htseq", + "orig_arg" : "--type" + }, + "example" : [ + "exon" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--id_attribute", + "alternatives" : [ + "-i" + ], + "description" : "GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id).\nAll feature of the right type (see -t option) within the same GTF attribute will be added\ntogether. The typical way of using this option is to count all exonic reads from each gene\nand add the exons but other uses are possible as well. You can call this option multiple\ntimes: in that case, the combination of all attributes separated by colons (:) will be used\nas a unique identifier, e.g. for exons you might use -i gene_id -i exon_number.\n", + "info" : { + "step" : "htseq", + "orig_arg" : "--idattr" + }, + "example" : [ + "gene_id" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--additional_attributes", + "description" : "Additional feature attributes (suitable for Ensembl GTF files: gene_name). Use multiple times\nfor more than one additional attribute. These attributes are only used as annotations in the\noutput, while the determination of how the counts are added together is done based on option -i.\n", + "info" : { + "step" : "htseq", + "orig_arg" : "--additional-attr" + }, + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--add_chromosome_info", + "description" : "Store information about the chromosome of each feature as an additional attribute\n(e.g. colunm in the TSV output file).\n", + "info" : { + "step" : "htseq", + "orig_arg" : "--add-chromosome-info" + }, + "direction" : "input" + }, + { + "type" : "string", + "name" : "--mode", + "alternatives" : [ + "-m" + ], + "description" : "Mode to handle reads overlapping more than one feature.", + "info" : { + "step" : "htseq", + "orig_arg" : "--mode" + }, + "default" : [ + "union" + ], + "required" : false, + "choices" : [ + "union", + "intersection-strict", + "intersection-nonempty" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--non_unique", + "description" : "Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features.", + "info" : { + "step" : "htseq", + "orig_arg" : "--nonunique" + }, + "default" : [ + "none" + ], + "required" : false, + "choices" : [ + "none", + "all", + "fraction", + "random" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--secondary_alignments", + "description" : "Whether to score secondary alignments (0x100 flag).", + "info" : { + "step" : "htseq", + "orig_arg" : "--secondary-alignments" + }, + "required" : false, + "choices" : [ + "score", + "ignore" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--supplementary_alignments", + "description" : "Whether to score supplementary alignments (0x800 flag).", + "info" : { + "step" : "htseq", + "orig_arg" : "--supplementary-alignments" + }, + "required" : false, + "choices" : [ + "score", + "ignore" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--counts_output_sparse", + "description" : "Store the counts as a sparse matrix (mtx, h5ad, loom).", + "info" : { + "step" : "htseq", + "orig_arg" : "--counts-output-sparse" + }, + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using STAR.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "env" : [ + "STAR_VERSION 2.7.10b", + "PACKAGES gcc g++ make wget zlib1g-dev unzip" + ] + }, + { + "type" : "docker", + "run" : [ + "apt-get update && \\\\\n apt-get install -y --no-install-recommends ${PACKAGES} && \\\\\n cd /tmp && \\\\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \\\\\n unzip ${STAR_VERSION}.zip && \\\\\n cd STAR-${STAR_VERSION}/source && \\\\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\\\n cp STAR /usr/local/bin && \\\\\n cd / && \\\\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \\\\\n apt-get --purge autoremove -y ${PACKAGES} && \\\\\n apt-get clean\n" + ] + }, + { + "type" : "apt", + "packages" : [ + "samtools", + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pyyaml", + "HTSeq", + "multiprocess", + "gtfparse", + "pandas", + "numpy<2", + "multiqc~=1.15.0" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "pytest" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/multi_star/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/multi_star", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from typing import Any, Dict, List, Tuple +import math +import tempfile +import subprocess +import tarfile +import gzip +import shutil +from pathlib import Path +import yaml +import pandas as pd +from multiprocess import Pool +import gtfparse +import polars as pl + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'input_r1': $( if [ ! -z ${VIASH_PAR_INPUT_R1+x} ]; then echo "r'${VIASH_PAR_INPUT_R1//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'input_r2': $( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "r'${VIASH_PAR_INPUT_R2//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference_index': $( if [ ! -z ${VIASH_PAR_REFERENCE_INDEX+x} ]; then echo "r'${VIASH_PAR_REFERENCE_INDEX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'reference_gtf': $( if [ ! -z ${VIASH_PAR_REFERENCE_GTF+x} ]; then echo "r'${VIASH_PAR_REFERENCE_GTF//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'run_htseq_count': $( if [ ! -z ${VIASH_PAR_RUN_HTSEQ_COUNT+x} ]; then echo "r'${VIASH_PAR_RUN_HTSEQ_COUNT//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'run_multiqc': $( if [ ! -z ${VIASH_PAR_RUN_MULTIQC+x} ]; then echo "r'${VIASH_PAR_RUN_MULTIQC//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'min_success_rate': $( if [ ! -z ${VIASH_PAR_MIN_SUCCESS_RATE+x} ]; then echo "float(r'${VIASH_PAR_MIN_SUCCESS_RATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'stranded': $( if [ ! -z ${VIASH_PAR_STRANDED+x} ]; then echo "r'${VIASH_PAR_STRANDED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'minimum_alignment_quality': $( if [ ! -z ${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY+x} ]; then echo "int(r'${VIASH_PAR_MINIMUM_ALIGNMENT_QUALITY//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'type': $( if [ ! -z ${VIASH_PAR_TYPE+x} ]; then echo "r'${VIASH_PAR_TYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'id_attribute': $( if [ ! -z ${VIASH_PAR_ID_ATTRIBUTE+x} ]; then echo "r'${VIASH_PAR_ID_ATTRIBUTE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'additional_attributes': $( if [ ! -z ${VIASH_PAR_ADDITIONAL_ATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_ADDITIONAL_ATTRIBUTES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'add_chromosome_info': $( if [ ! -z ${VIASH_PAR_ADD_CHROMOSOME_INFO+x} ]; then echo "r'${VIASH_PAR_ADD_CHROMOSOME_INFO//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'mode': $( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "r'${VIASH_PAR_MODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'non_unique': $( if [ ! -z ${VIASH_PAR_NON_UNIQUE+x} ]; then echo "r'${VIASH_PAR_NON_UNIQUE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'secondary_alignments': $( if [ ! -z ${VIASH_PAR_SECONDARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SECONDARY_ALIGNMENTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'supplementary_alignments': $( if [ ! -z ${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS+x} ]; then echo "r'${VIASH_PAR_SUPPLEMENTARY_ALIGNMENTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'counts_output_sparse': $( if [ ! -z ${VIASH_PAR_COUNTS_OUTPUT_SPARSE+x} ]; then echo "r'${VIASH_PAR_COUNTS_OUTPUT_SPARSE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +def fetch_arguments_info(config: Dict[str, Any]) -> Dict[str, Any]: + """Fetch arguments from config""" + arguments = { + arg["name"].removeprefix("-").removeprefix("-"): arg + for group in config["argument_groups"] + for arg in group["arguments"] + } + return arguments + + +def process_par( + par: Dict[str, Any], + arguments_info: Dict[str, Any], + gz_args: List[str], + temp_dir: Path, +) -> Dict[str, Any]: + """ + Process the Viash par dictionary + + This turns file strings into Path objects and extracting gzipped files if need be. + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by \\`fetch_arguments_info\\` + gz_args: A list of argument keys which could be gzip files which need to be decompressed. + temp_dir: A temporary directory in which to ungzip files + """ + new_par = {} + for key, value in par.items(): + arg_info = arguments_info[key] + # turn file arguments into paths + if value and arg_info["type"] == "file": + is_multiple = isinstance(value, list) + + if is_multiple: + value = [Path(val) for val in value] + else: + value = Path(value) + + if key in gz_args: + print(f">> Checking compression of --{key}", flush=True) + # turn value into list if need be + if not is_multiple: + value = [value] + + # extract + value = [extract_if_need_be(path, temp_dir) for path in value] + + # unlist if need be + if not is_multiple: + value = value[0] + + new_par[key] = value + return new_par + + +def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False): + """ + Generate command-line arguments by fetching the relevant args + + Parameters + ---------- + par: The par dictionary created by Viash + arguments_info: The arguments info Dictionary created by \\`fetch_arguments_info\\` + step_filter: If provided,\\`par\\` will be filtered to only contain arguments for which + argument.info.step == step_filter. + flatten: If \\`False\\`, the command for an argument with multiple values will be + \\`["--key", "value1", "--key", "value2"]\\`, otherwise \\`["--key", "value1", "value2"]\\`. + """ + cmd_args = [] + + for key, arg in arguments_info.items(): + arg_val = par.get(key) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + step = info.get("step") + if arg_val and orig_arg and (not step_filter or step == step_filter): + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + if flatten: + arg_val = [str(x) for x in [orig_arg] + arg_val] + else: + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +def is_gz_file(path: Path) -> bool: + """Check whether something is a gzip""" + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + """if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path""" + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +def load_star_reference(reference_index: str) -> None: + """Load star reference index into memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "LoadAndExit", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def unload_star_reference(reference_index: str) -> None: + """Remove star reference index from memory.""" + subprocess.run( + [ + "STAR", + "--genomeLoad", + "Remove", + "--genomeDir", + str(reference_index), + ], + check=True, + ) + + +def star_and_htseq( + group_id: str, + r1_files: List[Path], + r2_files: List[Path], + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> Tuple[int, str]: + star_output = par["output"] / "per" / group_id + temp_dir_group = temp_dir / f"star_tmp_{group_id}" + unsorted_bam = star_output / "Aligned.out.bam" + sorted_bam = star_output / "Aligned.sorted.out.bam" + counts_file = star_output / "htseq-count.txt" + multiqc_path = star_output / "multiqc_data" + + print(f">> Running STAR for group '{group_id}' with command:", flush=True) + star_output.mkdir(parents=True, exist_ok=True) + temp_dir_group.parent.mkdir(parents=True, exist_ok=True) + run_star( + r1_files=r1_files, + r2_files=r2_files, + output_dir=star_output, + temp_dir=temp_dir / f"star_tmp_{group_id}", + par=par, + arguments_info=arguments_info, + num_threads=num_threads, + ) + if not unsorted_bam.exists(): + return (1, f"Could not find unsorted bam at '{unsorted_bam}'") + + if par["run_htseq_count"]: + print( + f">> Running samtools sort for group '{group_id}' with command:", flush=True + ) + run_samtools_sort(unsorted_bam, sorted_bam) + if not sorted_bam.exists(): + return (1, f"Could not find sorted bam at '{unsorted_bam}'") + + print( + f">> Running htseq-count for group '{group_id}' with command:", flush=True + ) + run_htseq_count(sorted_bam, counts_file, par, arguments_info) + if not counts_file.exists(): + return (1, f"Could not find counts at '{counts_file}'") + + if par["run_multiqc"]: + run_multiqc(star_output) + if not multiqc_path.exists(): + return (1, f"Could not find MultiQC output at '{multiqc_path}'") + + return (0, "") + + +def run_star( + r1_files: List[Path], + r2_files: List[Path], + output_dir: Path, + temp_dir: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], + num_threads: int, +) -> None: + """Run star""" + # process manual arguments + r1_pasted = [",".join([str(r1) for r1 in r1_files])] + r2_pasted = [",".join([str(r2) for r2 in r2_files])] if r2_files else [] + manual_par = { + "--genomeDir": [par["reference_index"]], + "--genomeLoad": ["LoadAndRemove"], + "--runThreadN": [str(num_threads)], + "--runMode": ["alignReads"], + "--readFilesIn": r1_pasted + r2_pasted, + # create a tempdir per group + "--outTmpDir": [temp_dir], + # make sure there is a trailing / + "--outFileNamePrefix": [f"{output_dir}/"], + # fix the outSAMtype to return unsorted BAM files + "--outSAMtype": ["BAM", "Unsorted"], + } + manual_cmd = [str(x) for key, values in manual_par.items() for x in [key] + values] + + # process all passthrough star arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "star", flatten=True) + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["STAR"] + manual_cmd + par_cmd] + + # run star + subprocess.run(cmd_args, check=True) + + +def run_samtools_sort(unsorted_bam: Path, sorted_bam: Path) -> None: + "Run samtools sort" + cmd_args = [ + "samtools", + "sort", + "-o", + sorted_bam, + unsorted_bam, + ] + subprocess.run(cmd_args, check=True) + + +def run_htseq_count( + sorted_bam: Path, + counts_file: Path, + par: Dict[str, Any], + arguments_info: Dict[str, Any], +) -> None: + """Run HTSeq count""" + # process manual arguments + manual_cmd = [sorted_bam, par["reference_gtf"]] + + # process all passthrough htseq arguments + par_cmd = generate_cmd_arguments(par, arguments_info, "htseq") + + # combine into one command and turn into strings + cmd_args = [str(val) for val in ["htseq-count"] + manual_cmd + par_cmd] + + # run htseq + with open(counts_file, "w", encoding="utf-8") as file: + subprocess.run(cmd_args, check=True, stdout=file) + + +def get_feature_info(reference_gtf) -> pd.DataFrame: + ref = gtfparse.read_gtf(reference_gtf) + ref_genes = ref.filter((pl.col("feature") == "gene") | (pl.col("source") == "ERCC")) + return pd.DataFrame( + { + "feature_id": pd.Index(ref_genes.get_column("gene_id")), + "feature_type": "Gene Expression", + "feature_name": ref_genes.get_column("gene_name").to_pandas(), + } + ) + + +def run_multiqc(input_dir: Path) -> None: + cmd_args = [ + "multiqc", + str(input_dir), + "--outdir", + str(input_dir), + "--no-report", + "--force", + ] + + # run multiqc + subprocess.run(cmd_args, check=True) + + +######################## +### Main code ### +######################## + + +def main(par, meta): + """Main function""" + + # check input arguments + assert len(par["input_id"]) == len(par["input_r1"]), ( + "--input_r1 should have same length as --input_id" + ) + if par["input_r2"]: + assert len(par["input_id"]) == len(par["input_r2"]), ( + "--input_r2 should have same length as --input_id" + ) + + # read config arguments + with open(meta["config"], "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + # fetch all arguments from the config and turn it into a Dict[str, Argument] + arguments_info = fetch_arguments_info(config) + + # temp_dir = "tmp/" + with tempfile.TemporaryDirectory( + prefix=f"{meta['name']}-", dir=meta["temp_dir"], ignore_cleanup_errors=True + ) as temp_dir: + temp_dir = Path(temp_dir) + temp_dir.mkdir(parents=True, exist_ok=True) + + # turn file strings into Paths and decompress gzip if need be + gz_args = ["input_r1", "input_r2", "reference_index", "reference_gtf"] + par = process_par(par, arguments_info, gz_args, temp_dir) + + # make sure input_r2 has same length as input_r1 + if not par["input_r2"]: + par["input_r2"] = [None for _ in par["input_r1"]] + + # group input_files by input_id + print(">> Group by --input_id", flush=True) + grouped_inputs = {} + for group_id, file_r1, file_r2 in zip( + par["input_id"], par["input_r1"], par["input_r2"] + ): + if group_id not in grouped_inputs: + grouped_inputs[group_id] = ([], []) + grouped_inputs[group_id][0].append(file_r1) + if file_r2: + grouped_inputs[group_id][1].append(file_r2) + + # create output dir if need be + par["output"].mkdir(parents=True, exist_ok=True) + + # store features metadata + feature_info = get_feature_info(str(par["reference_gtf"])) + with open(par["output"] / "feature_info.tsv", "w", encoding="utf-8") as file: + feature_info.to_csv(file, sep="\\\\t", index=False) + + # try: + # print(">> Loading genome in memory", flush=True) + # load_star_reference(par["reference_index"]) + + cpus = meta.get("cpus", 1) + num_items = len(grouped_inputs) + pool_size = min(cpus, num_items) + num_threads_per_task = math.ceil(cpus / pool_size) + + with Pool(pool_size) as pool: + outs = pool.starmap( + lambda group_id, files: star_and_htseq( + group_id=group_id, + r1_files=files[0], + r2_files=files[1], + temp_dir=temp_dir, + par=par, + arguments_info=arguments_info, + num_threads=num_threads_per_task, + ), + grouped_inputs.items(), + ) + + num_errored = 0 + for exit, msg in outs: + if exit != 0: + print(f"Error: {msg}") + num_errored += 1 + + pct_succeeded = 1.0 - num_errored / len(outs) + print("------------------") + print(f"Success rate: {math.ceil(pct_succeeded * 100)}%") + + assert pct_succeeded >= par["min_success_rate"], ( + f"Success rate should be at least {math.ceil(par['min_success_rate'] * 100)}%" + ) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/multi_star", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/multi_star/nextflow.config b/target/nextflow/mapping/multi_star/nextflow.config new file mode 100644 index 00000000..d6672b0f --- /dev/null +++ b/target/nextflow/mapping/multi_star/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/multi_star' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using STAR.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/multi_star/nextflow_labels.config b/target/nextflow/mapping/multi_star/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/multi_star/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/multi_star/nextflow_schema.json b/target/nextflow/mapping/multi_star/nextflow_schema.json new file mode 100644 index 00000000..73416f1c --- /dev/null +++ b/target/nextflow/mapping/multi_star/nextflow_schema.json @@ -0,0 +1,1438 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "multi_star", + "description": "Align fastq files using STAR.", + "type": "object", + "$defs": { + "input/output": { + "title": "Input/Output", + "type": "object", + "description": "No description", + "properties": { + "input_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The ID of the sample being processed", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"mysample\";\"mysample\"]`. " + }, + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the sequences to be mapped", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L002_R1_001.fastq.gz\"]`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Paths to the sequences to be mapped", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R2_001.fastq.gz\";\"mysample_S1_L002_R2_001.fastq.gz\"]`. " + }, + "reference_index": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the reference built by star_build_reference", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/reference\"`. " + }, + "reference_gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the gtf reference file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"genes.gtf\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/foo\"`. ", + "default": "$id.$key.output" + } + } + }, + "processing arguments": { + "title": "Processing arguments", + "type": "object", + "description": "No description", + "properties": { + "run_htseq_count": { + "type": "boolean", + "description": "Whether or not to also run htseq-count after STAR.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "run_multiqc": { + "type": "boolean", + "description": "Whether or not to also run MultiQC at the end.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "min_success_rate": { + "type": "number", + "description": "Fail when the success rate is below this threshold.", + "help_text": "Type: `double`, multiple: `False`, default: `0.5`. ", + "default": 0.5 + } + } + }, + "run parameters": { + "title": "Run Parameters", + "type": "object", + "description": "No description", + "properties": { + "runRNGseed": { + "type": "integer", + "description": "random number generator seed.", + "help_text": "Type: `integer`, multiple: `False`, example: `777`. " + } + } + }, + "genome parameters": { + "title": "Genome Parameters", + "type": "object", + "description": "No description", + "properties": { + "genomeFastaFiles": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "path(s) to the fasta files with the genome sequences, separated by spaces", + "help_text": "Type: `file`, multiple: `True`, direction: `input`. " + } + } + }, + "splice junctions database": { + "title": "Splice Junctions Database", + "type": "object", + "description": "No description", + "properties": { + "sjdbFileChrStartEnd": { + "type": "array", + "items": { + "type": "string" + }, + "description": "path to the files with genomic coordinates (chr start end strand) for the splice junction introns", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sjdbGTFfile": { + "type": "string", + "format": "path", + "description": "path to the GTF file with annotations", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sjdbGTFchrPrefix": { + "type": "string", + "description": "prefix for chromosome names in a GTF file (e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sjdbGTFfeatureExon": { + "type": "string", + "description": "feature type in GTF file to be used as exons for building transcripts", + "help_text": "Type: `string`, multiple: `False`, example: `\"exon\"`. " + }, + "sjdbGTFtagExonParentTranscript": { + "type": "string", + "description": "GTF attribute name for parent transcript ID (default \"transcript_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"transcript_id\"`. " + }, + "sjdbGTFtagExonParentGene": { + "type": "string", + "description": "GTF attribute name for parent gene ID (default \"gene_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_id\"`. " + }, + "sjdbGTFtagExonParentGeneName": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene name", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_name\"]`. " + }, + "sjdbGTFtagExonParentGeneType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene type", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_type\";\"gene_biotype\"]`. " + }, + "sjdbOverhang": { + "type": "integer", + "description": "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "sjdbScore": { + "type": "integer", + "description": "extra alignment score for alignments that cross database junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `2`. " + }, + "sjdbInsertSave": { + "type": "string", + "description": "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`. " + } + } + }, + "variation parameters": { + "title": "Variation parameters", + "type": "object", + "description": "No description", + "properties": { + "varVCFfile": { + "type": "string", + "description": "path to the VCF file that contains variation data", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "read parameters": { + "title": "Read Parameters", + "type": "object", + "description": "No description", + "properties": { + "readFilesType": { + "type": "string", + "description": "format of input read files\n\n- Fastx ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Fastx\"`. " + }, + "readFilesSAMattrKeep": { + "type": "array", + "items": { + "type": "string" + }, + "description": "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"All\"]`. " + }, + "readFilesManifest": { + "type": "string", + "format": "path", + "description": "path to the \"manifest\" file with the names of read files", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "readFilesPrefix": { + "type": "string", + "description": "prefix for the read files names, i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "readFilesCommand": { + "type": "array", + "items": { + "type": "string" + }, + "description": "command line to execute for each of the input file", + "help_text": "Type: `string`, multiple: `True`. " + }, + "readMapNumber": { + "type": "integer", + "description": "number of reads to map from the beginning of the file\n\n-1: map all reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "readMatesLengthsIn": { + "type": "string", + "description": "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same", + "help_text": "Type: `string`, multiple: `False`, example: `\"NotEqual\"`. " + }, + "readNameSeparator": { + "type": "array", + "items": { + "type": "string" + }, + "description": "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "help_text": "Type: `string`, multiple: `True`, example: `[\"/\"]`. " + }, + "readQualityScoreBase": { + "type": "integer", + "description": "number to be subtracted from the ASCII code to get Phred quality score", + "help_text": "Type: `integer`, multiple: `False`, example: `33`. " + } + } + }, + "read clipping": { + "title": "Read Clipping", + "type": "object", + "description": "No description", + "properties": { + "clipAdapterType": { + "type": "string", + "description": "adapter clipping type\n\n- Hamming ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Hamming\"`. " + }, + "clip3pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 3p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip3pAdapterSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "adapter sequences to clip from 3p of each mate", + "help_text": "Type: `string`, multiple: `True`. " + }, + "clip3pAdapterMMp": { + "type": "array", + "items": { + "type": "number" + }, + "description": "max proportion of mismatches for 3p adapter clipping for each mate", + "help_text": "Type: `double`, multiple: `True`, example: `[0.1]`. " + }, + "clip3pAfterAdapterNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number of bases to clip from 3p of each mate after the adapter clipping", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip5pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 5p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + } + } + }, + "limits": { + "title": "Limits", + "type": "object", + "description": "No description", + "properties": { + "limitGenomeGenerateRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for genome generation", + "help_text": "Type: `long`, multiple: `False`, example: `31000000000`. " + }, + "limitIObufferSize": { + "type": "array", + "items": { + "type": "string" + }, + "description": "max available buffers size (bytes) for input/output, per thread", + "help_text": "Type: `long`, multiple: `True`, example: `[30000000;50000000]`. " + }, + "limitOutSAMoneReadBytes": { + "type": "string", + "description": "max size of the SAM record (bytes) for one read", + "help_text": "Type: `long`, multiple: `False`, example: `100000`. " + }, + "limitOutSJoneRead": { + "type": "integer", + "description": "max number of junctions for one read (including all multi-mappers)", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "limitOutSJcollapsed": { + "type": "integer", + "description": "max number of collapsed junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitBAMsortRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for sorting BAM", + "help_text": "Type: `long`, multiple: `False`, example: `0`. " + }, + "limitSjdbInsertNsj": { + "type": "integer", + "description": "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitNreadsSoft": { + "type": "integer", + "description": "soft limit on the number of reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "output: general": { + "title": "Output: general", + "type": "object", + "description": "No description", + "properties": { + "outTmpKeep": { + "type": "string", + "description": "whether to keep the temporary files after STAR runs is finished\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outStd": { + "type": "string", + "description": "which output will be directed to stdout (standard out)\n\n- Log ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Log\"`. " + }, + "outReadsUnmapped": { + "type": "string", + "description": "output of unmapped and partially mapped (i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outQSconversionAdd": { + "type": "integer", + "description": "add this number to the quality score (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outMultimapperOrder": { + "type": "string", + "description": "order of multimapping alignments in the output files\n\n- Old_2.4 ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Old_2.4\"`. " + } + } + }, + "output: sam and bam": { + "title": "Output: SAM and BAM", + "type": "object", + "description": "No description", + "properties": { + "outSAMmode": { + "type": "string", + "description": "mode of SAM output\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Full\"`. " + }, + "outSAMstrandField": { + "type": "string", + "description": "Cufflinks-like strand field flag\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMattributes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "a string of desired SAM attributes, in the order desired for the output SAM", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Standard\"]`. " + }, + "outSAMattrIHstart": { + "type": "integer", + "description": "start value for the IH attribute", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outSAMunmapped": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output of unmapped reads in the SAM format\n\n1st word:\n- None ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMorder": { + "type": "string", + "description": "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "help_text": "Type: `string`, multiple: `False`, example: `\"Paired\"`. " + }, + "outSAMprimaryFlag": { + "type": "string", + "description": "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"OneBestScore\"`. " + }, + "outSAMreadID": { + "type": "string", + "description": "read ID record type\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + }, + "outSAMmapqUnique": { + "type": "integer", + "description": "0 to 255: the MAPQ value for unique mappers", + "help_text": "Type: `integer`, multiple: `False`, example: `255`. " + }, + "outSAMflagOR": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outSAMflagAND": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `65535`. " + }, + "outSAMattrRGline": { + "type": "array", + "items": { + "type": "string" + }, + "description": "SAM/BAM read group line", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderHD": { + "type": "array", + "items": { + "type": "string" + }, + "description": "@HD (header) line of the SAM header", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderPG": { + "type": "array", + "items": { + "type": "string" + }, + "description": "extra @PG (software) line of the SAM header (in addition to STAR)", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderCommentFile": { + "type": "string", + "description": "path to the file with @CO (comment) lines of the SAM header", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMfilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMmultNmax": { + "type": "integer", + "description": "max number of multiple alignments for a read that will be output to the SAM/BAM files", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "outSAMtlen": { + "type": "integer", + "description": "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMcompression": { + "type": "integer", + "description": "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMsortingThreadN": { + "type": "integer", + "description": ">=0: number of threads for BAM sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outBAMsortingBinsN": { + "type": "integer", + "description": ">0: number of genome bins for coordinate-sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + } + } + }, + "bam processing": { + "title": "BAM processing", + "type": "object", + "description": "No description", + "properties": { + "bamRemoveDuplicatesType": { + "type": "string", + "description": "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "bamRemoveDuplicatesMate2basesN": { + "type": "integer", + "description": "number of bases from the 5' of mate 2 to use in collapsing (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "output wiggle": { + "title": "Output Wiggle", + "type": "object", + "description": "No description", + "properties": { + "outWigType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of signal output, e.g", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outWigStrand": { + "type": "string", + "description": "strandedness of wiggle/bedGraph output\n\n- Stranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Stranded\"`. " + }, + "outWigReferencesPrefix": { + "type": "string", + "description": "prefix matching reference names to include in the output wiggle file, e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outWigNorm": { + "type": "string", + "description": "type of normalization for the signal\n\n- RPM ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RPM\"`. " + } + } + }, + "output filtering": { + "title": "Output Filtering", + "type": "object", + "description": "No description", + "properties": { + "outFilterType": { + "type": "string", + "description": "type of filtering\n\n- Normal ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Normal\"`. " + }, + "outFilterMultimapScoreRange": { + "type": "integer", + "description": "the score range below the maximum score for multimapping alignments", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outFilterMultimapNmax": { + "type": "integer", + "description": "maximum number of loci the read is allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNmax": { + "type": "integer", + "description": "alignment will be output only if it has no more mismatches than this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNoverLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `0.3`. " + }, + "outFilterMismatchNoverReadLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "outFilterScoreMin": { + "type": "integer", + "description": "alignment will be output only if its score is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterScoreMinOverLread": { + "type": "number", + "description": "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterMatchNmin": { + "type": "integer", + "description": "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterMatchNminOverLread": { + "type": "number", + "description": "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterIntronMotifs": { + "type": "string", + "description": "filter alignment using their motifs\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outFilterIntronStrands": { + "type": "string", + "description": "filter alignments\n\n- RemoveInconsistentStrands ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RemoveInconsistentStrands\"`. " + } + } + }, + "output splice junctions (sj.out.tab)": { + "title": "Output splice junctions (SJ.out.tab)", + "type": "object", + "description": "No description", + "properties": { + "outSJtype": { + "type": "string", + "description": "type of splice junction output\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + } + } + }, + "output filtering: splice junctions": { + "title": "Output Filtering: Splice Junctions", + "type": "object", + "description": "No description", + "properties": { + "outSJfilterReads": { + "type": "string", + "description": "which reads to consider for collapsed splice junctions output\n\n- All ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"All\"`. " + }, + "outSJfilterOverhangMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[30;12;12;12]`. " + }, + "outSJfilterCountUniqueMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterCountTotalMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterDistToOtherSJmin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "help_text": "Type: `integer`, multiple: `True`, example: `[10;0;5;10]`. " + }, + "outSJfilterIntronMaxVsReadN": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e", + "help_text": "Type: `integer`, multiple: `True`, example: `[50000;100000;200000]`. " + } + } + }, + "scoring": { + "title": "Scoring", + "type": "object", + "description": "No description", + "properties": { + "scoreGap": { + "type": "integer", + "description": "splice junction penalty (independent on intron motif)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreGapNoncan": { + "type": "integer", + "description": "non-canonical junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGapGCAG": { + "type": "integer", + "description": "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-4`. " + }, + "scoreGapATAC": { + "type": "integer", + "description": "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGenomicLengthLog2scale": { + "type": "integer", + "description": "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreDelOpen": { + "type": "integer", + "description": "deletion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreDelBase": { + "type": "integer", + "description": "deletion extension penalty per base (in addition to scoreDelOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsOpen": { + "type": "integer", + "description": "insertion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsBase": { + "type": "integer", + "description": "insertion extension penalty per base (in addition to scoreInsOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreStitchSJshift": { + "type": "integer", + "description": "maximum score reduction while searching for SJ boundaries in the stitching step", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + } + } + }, + "alignments and seeding": { + "title": "Alignments and Seeding", + "type": "object", + "description": "No description", + "properties": { + "seedSearchStartLmax": { + "type": "integer", + "description": "defines the search start point through the read - the read is split into pieces no longer than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedSearchStartLmaxOverLread": { + "type": "number", + "description": "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "seedSearchLmax": { + "type": "integer", + "description": "defines the maximum length of the seeds, if =0 seed length is not limited", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "seedMultimapNmax": { + "type": "integer", + "description": "only pieces that map fewer than this value are utilized in the stitching procedure", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "seedPerReadNmax": { + "type": "integer", + "description": "max number of seeds per read", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "seedPerWindowNmax": { + "type": "integer", + "description": "max number of seeds per window", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedNoneLociPerWindow": { + "type": "integer", + "description": "max number of one seed loci per window", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "seedSplitMin": { + "type": "integer", + "description": "min length of the seed sequences split by Ns or mate gap", + "help_text": "Type: `integer`, multiple: `False`, example: `12`. " + }, + "seedMapMin": { + "type": "integer", + "description": "min length of seeds to be mapped", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignIntronMin": { + "type": "integer", + "description": "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "help_text": "Type: `integer`, multiple: `False`, example: `21`. " + }, + "alignIntronMax": { + "type": "integer", + "description": "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignMatesGapMax": { + "type": "integer", + "description": "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSJoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignSJstitchMismatchNmax": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "help_text": "Type: `integer`, multiple: `True`, example: `[0;-1;0;0]`. " + }, + "alignSJDBoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "alignSplicedMateMapLmin": { + "type": "integer", + "description": "minimum mapped length for a read mate that is spliced", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSplicedMateMapLminOverLmate": { + "type": "number", + "description": "alignSplicedMateMapLmin normalized to mate length", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "alignWindowsPerReadNmax": { + "type": "integer", + "description": "max number of windows per read", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignTranscriptsPerWindowNmax": { + "type": "integer", + "description": "max number of transcripts per window", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "alignTranscriptsPerReadNmax": { + "type": "integer", + "description": "max number of different alignments per read to consider", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignEndsType": { + "type": "string", + "description": "type of read ends alignment\n\n- Local ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Local\"`. " + }, + "alignEndsProtrude": { + "type": "string", + "description": "allow protrusion of alignment ends, i.e", + "help_text": "Type: `string`, multiple: `False`, example: `\"0 ConcordantPair\"`. " + }, + "alignSoftClipAtReferenceEnds": { + "type": "string", + "description": "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Yes\"`. " + }, + "alignInsertionFlush": { + "type": "string", + "description": "how to flush ambiguous insertion positions\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "paired-end reads": { + "title": "Paired-End reads", + "type": "object", + "description": "No description", + "properties": { + "peOverlapNbasesMin": { + "type": "integer", + "description": "minimum number of overlapping bases to trigger mates merging and realignment", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "peOverlapMMp": { + "type": "number", + "description": "maximum proportion of mismatched bases in the overlap area", + "help_text": "Type: `double`, multiple: `False`, example: `0.01`. " + } + } + }, + "windows, anchors, binning": { + "title": "Windows, Anchors, Binning", + "type": "object", + "description": "No description", + "properties": { + "winAnchorMultimapNmax": { + "type": "integer", + "description": "max number of loci anchors are allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "winBinNbits": { + "type": "integer", + "description": "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "winAnchorDistNbins": { + "type": "integer", + "description": "max number of bins between two anchors that allows aggregation of anchors into one window", + "help_text": "Type: `integer`, multiple: `False`, example: `9`. " + }, + "winFlankNbins": { + "type": "integer", + "description": "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "help_text": "Type: `integer`, multiple: `False`, example: `4`. " + }, + "winReadCoverageRelativeMin": { + "type": "number", + "description": "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "help_text": "Type: `double`, multiple: `False`, example: `0.5`. " + }, + "winReadCoverageBasesMin": { + "type": "integer", + "description": "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "chimeric alignments": { + "title": "Chimeric Alignments", + "type": "object", + "description": "No description", + "properties": { + "chimOutType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of chimeric output\n\n- Junctions ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Junctions\"]`. " + }, + "chimSegmentMin": { + "type": "integer", + "description": "minimum length of chimeric segment length, if ==0, no chimeric output", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreMin": { + "type": "integer", + "description": "minimum total (summed) score of the chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreDropMax": { + "type": "integer", + "description": "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimScoreSeparation": { + "type": "integer", + "description": "minimum difference (separation) between the best chimeric score and the next one", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimScoreJunctionNonGTAG": { + "type": "integer", + "description": "penalty for a non-GT/AG chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "chimJunctionOverhangMin": { + "type": "integer", + "description": "minimum overhang for a chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimSegmentReadGapMax": { + "type": "integer", + "description": "maximum gap in the read sequence between chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "different filters for chimeric alignments\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"banGenomicN\"]`. " + }, + "chimMainSegmentMultNmax": { + "type": "integer", + "description": "maximum number of multi-alignments for the main chimeric segment", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimMultimapNmax": { + "type": "integer", + "description": "maximum number of chimeric multi-alignments\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimMultimapScoreRange": { + "type": "integer", + "description": "the score range for multi-mapping chimeras below the best chimeric score", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "chimNonchimScoreDropMin": { + "type": "integer", + "description": "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimOutJunctionFormat": { + "type": "integer", + "description": "formatting type for the Chimeric.out.junction file\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "quantification of annotations": { + "title": "Quantification of Annotations", + "type": "object", + "description": "No description", + "properties": { + "quantMode": { + "type": "array", + "items": { + "type": "string" + }, + "description": "types of quantification requested\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "quantTranscriptomeBAMcompression": { + "type": "integer", + "description": "-2 to 10 transcriptome BAM compression level\n\n- -2 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "quantTranscriptomeBan": { + "type": "string", + "description": "prohibit various alignment type\n\n- IndelSoftclipSingleend ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"IndelSoftclipSingleend\"`. " + } + } + }, + "2-pass mapping": { + "title": "2-pass Mapping", + "type": "object", + "description": "No description", + "properties": { + "twopassMode": { + "type": "string", + "description": "2-pass mapping mode.\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "twopass1readsN": { + "type": "integer", + "description": "number of reads to process for the 1st step", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "wasp parameters": { + "title": "WASP parameters", + "type": "object", + "description": "No description", + "properties": { + "waspOutputMode": { + "type": "string", + "description": "WASP allele-specific output type", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "starsolo (single cell rna-seq) parameters": { + "title": "STARsolo (single cell RNA-seq) parameters", + "type": "object", + "description": "No description", + "properties": { + "soloType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of single-cell RNA-seq\n\n- CB_UMI_Simple ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBwhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file(s) with whitelist(s) of cell barcodes", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBstart": { + "type": "integer", + "description": "cell barcode start base", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBlen": { + "type": "integer", + "description": "cell barcode length", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "soloUMIstart": { + "type": "integer", + "description": "UMI start base", + "help_text": "Type: `integer`, multiple: `False`, example: `17`. " + }, + "soloUMIlen": { + "type": "integer", + "description": "UMI length", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "soloBarcodeReadLength": { + "type": "integer", + "description": "length of the barcode read\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloBarcodeMate": { + "type": "integer", + "description": "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "soloCBposition": { + "type": "array", + "items": { + "type": "string" + }, + "description": "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloUMIposition": { + "type": "string", + "description": "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterSequence": { + "type": "string", + "description": "adapter sequence to anchor barcodes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterMismatchesNmax": { + "type": "integer", + "description": "maximum number of mismatches allowed in adapter sequence.", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBmatchWLtype": { + "type": "string", + "description": "matching the Cell Barcodes to the WhiteList\n\n- Exact ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"1MM_multi\"`. " + }, + "soloInputSAMattrBarcodeSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloInputSAMattrBarcodeQual": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloStrand": { + "type": "string", + "description": "strandedness of the solo libraries:\n\n- Unstranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Forward\"`. " + }, + "soloFeatures": { + "type": "array", + "items": { + "type": "string" + }, + "description": "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene\"]`. " + }, + "soloMultiMappers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "counting method for reads mapping to multiple genes\n\n- Unique ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Unique\"]`. " + }, + "soloUMIdedup": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1MM_All\"]`. " + }, + "soloUMIfiltering": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloOutFileNames": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Solo.out/\";\"features.tsv\";\"barcodes.tsv\";\"matrix.mtx\"]`. " + }, + "soloCellFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "cell filtering type and parameters\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"CellRanger2.2\";\"3000\";\"0.99\";\"10\"]`. " + }, + "soloOutFormatFeaturesGeneField3": { + "type": "array", + "items": { + "type": "string" + }, + "description": "field 3 in the Gene features.tsv file", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene Expression\"]`. " + }, + "soloCellReadStats": { + "type": "string", + "description": "Output reads statistics for each CB\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "htseq arguments": { + "title": "HTSeq arguments", + "type": "object", + "description": "No description", + "properties": { + "stranded": { + "type": "string", + "description": "Whether the data is from a strand-specific assay", + "help_text": "Type: `string`, multiple: `False`, default: `\"yes\"`, choices: ``yes`, `no`, `reverse``. ", + "enum": [ + "yes", + "no", + "reverse" + ], + "default": "yes" + }, + "minimum_alignment_quality": { + "type": "integer", + "description": "Skip all reads with MAPQ alignment quality lower than the given minimum value", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "type": { + "type": "string", + "description": "Feature type (3rd column in GTF file) to be used, all features of other type are ignored (default, suitable for Ensembl GTF files: exon)", + "help_text": "Type: `string`, multiple: `False`, example: `\"exon\"`. " + }, + "id_attribute": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute to be used as feature ID (default, suitable for Ensembl GTF files: gene_id).\nAll feature of the right type (see -t option) within the same GTF attribute will be added\ntogether", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_id\"]`. " + }, + "additional_attributes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Additional feature attributes (suitable for Ensembl GTF files: gene_name)", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_name\"]`. " + }, + "add_chromosome_info": { + "type": "boolean", + "description": "Store information about the chromosome of each feature as an additional attribute\n(e.g", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "mode": { + "type": "string", + "description": "Mode to handle reads overlapping more than one feature.", + "help_text": "Type: `string`, multiple: `False`, default: `\"union\"`, choices: ``union`, `intersection-strict`, `intersection-nonempty``. ", + "enum": [ + "union", + "intersection-strict", + "intersection-nonempty" + ], + "default": "union" + }, + "non_unique": { + "type": "string", + "description": "Whether and how to score reads that are not uniquely aligned or ambiguously assigned to features.", + "help_text": "Type: `string`, multiple: `False`, default: `\"none\"`, choices: ``none`, `all`, `fraction`, `random``. ", + "enum": [ + "none", + "all", + "fraction", + "random" + ], + "default": "none" + }, + "secondary_alignments": { + "type": "string", + "description": "Whether to score secondary alignments (0x100 flag).", + "help_text": "Type: `string`, multiple: `False`, choices: ``score`, `ignore``. ", + "enum": [ + "score", + "ignore" + ] + }, + "supplementary_alignments": { + "type": "string", + "description": "Whether to score supplementary alignments (0x800 flag).", + "help_text": "Type: `string`, multiple: `False`, choices: ``score`, `ignore``. ", + "enum": [ + "score", + "ignore" + ] + }, + "counts_output_sparse": { + "type": "boolean", + "description": "Store the counts as a sparse matrix (mtx, h5ad, loom).", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input/output" + }, + { + "$ref": "#/$defs/processing arguments" + }, + { + "$ref": "#/$defs/run parameters" + }, + { + "$ref": "#/$defs/genome parameters" + }, + { + "$ref": "#/$defs/splice junctions database" + }, + { + "$ref": "#/$defs/variation parameters" + }, + { + "$ref": "#/$defs/read parameters" + }, + { + "$ref": "#/$defs/read clipping" + }, + { + "$ref": "#/$defs/limits" + }, + { + "$ref": "#/$defs/output: general" + }, + { + "$ref": "#/$defs/output: sam and bam" + }, + { + "$ref": "#/$defs/bam processing" + }, + { + "$ref": "#/$defs/output wiggle" + }, + { + "$ref": "#/$defs/output filtering" + }, + { + "$ref": "#/$defs/output splice junctions (sj.out.tab)" + }, + { + "$ref": "#/$defs/output filtering: splice junctions" + }, + { + "$ref": "#/$defs/scoring" + }, + { + "$ref": "#/$defs/alignments and seeding" + }, + { + "$ref": "#/$defs/paired-end reads" + }, + { + "$ref": "#/$defs/windows, anchors, binning" + }, + { + "$ref": "#/$defs/chimeric alignments" + }, + { + "$ref": "#/$defs/quantification of annotations" + }, + { + "$ref": "#/$defs/2-pass mapping" + }, + { + "$ref": "#/$defs/wasp parameters" + }, + { + "$ref": "#/$defs/starsolo (single cell rna-seq) parameters" + }, + { + "$ref": "#/$defs/htseq arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/multi_star_to_h5mu/.config.vsh.yaml b/target/nextflow/mapping/multi_star_to_h5mu/.config.vsh.yaml new file mode 100644 index 00000000..433eb143 --- /dev/null +++ b/target/nextflow/mapping/multi_star_to_h5mu/.config.vsh.yaml @@ -0,0 +1,253 @@ +name: "multi_star_to_h5mu" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "The directory created by `multi_star`" + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert the output of `multi_star` to a h5mu.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "multi_star" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/multi_star_to_h5mu/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/multi_star_to_h5mu" + executable: "target/nextflow/mapping/multi_star_to_h5mu/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/multi_star_to_h5mu/main.nf b/target/nextflow/mapping/multi_star_to_h5mu/main.nf new file mode 100644 index 00000000..e7592162 --- /dev/null +++ b/target/nextflow/mapping/multi_star_to_h5mu/main.nf @@ -0,0 +1,3974 @@ +// multi_star_to_h5mu main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Angela Oliveira Pisco (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "multi_star_to_h5mu", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The directory created by `multi_star`", + "example" : [ + "/path/to/foo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert the output of `multi_star` to a h5mu.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq/multi_star" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/multi_star_to_h5mu/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/multi_star_to_h5mu", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from pathlib import Path +import pandas as pd +import mudata as md +import anndata as ad +import numpy as np +import json + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# convert to path +input_dir = Path(par["input"]) + +# read counts information +print("> Read counts data", flush=True) +per_obs_data = [] + +for input_counts in (input_dir / "per").glob("**/htseq-count.txt"): + per_obs_dir = input_counts.parent + input_id = per_obs_dir.name + input_multiqc = per_obs_dir / "multiqc_data" / "multiqc_data.json" + + data = pd.read_table( + input_counts, + index_col=0, + names=["cell_id", input_id], + dtype={"cell_id": "U", input_id: "i"}, + ) + data2 = data[~data.index.str.startswith("__")] + + with open(input_multiqc, "r") as file: + qc = json.load(file) + + qc_star = qc.get("report_saved_raw_data", {}).get("multiqc_star", {}).get(input_id) + qc_htseq = ( + qc.get("report_saved_raw_data", {}).get("multiqc_htseq", {}).get("htseq-count") + ) + + per_obs_data.append( + { + "counts": data2.transpose(), + "qc_star": pd.DataFrame(qc_star, index=[input_id]), + "qc_htseq": pd.DataFrame(qc_htseq, index=[input_id]), + } + ) + + +# combine all counts +counts = pd.concat([x["counts"] for x in per_obs_data], axis=0) +qc_star = pd.concat([x["qc_star"] for x in per_obs_data], axis=0) +qc_htseq = pd.concat([x["qc_htseq"] for x in per_obs_data], axis=0) + +# read feature info +feature_info = pd.read_csv(input_dir / "feature_info.tsv", sep="\\\\t", index_col=0) +feature_info_ord = feature_info.loc[counts.columns] + +var = pd.DataFrame( + data={ + "gene_ids": feature_info_ord.index, + "feature_types": "Gene Expression", + "gene_name": feature_info_ord["feature_name"], + } +).set_index("gene_ids") + +print("> construct anndata", flush=True) +adata = ad.AnnData( + X=counts, obsm={"qc_star": qc_star, "qc_htseq": qc_htseq}, var=var, dtype=np.int32 +) + +print("> convert to mudata", flush=True) +mdata = md.MuData(adata) + +print("> write to file", flush=True) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/multi_star_to_h5mu", + "tag" : "main" + }, + "label" : [ + "midmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/multi_star_to_h5mu/nextflow.config b/target/nextflow/mapping/multi_star_to_h5mu/nextflow.config new file mode 100644 index 00000000..08f2d4ec --- /dev/null +++ b/target/nextflow/mapping/multi_star_to_h5mu/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/multi_star_to_h5mu' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert the output of `multi_star` to a h5mu.\n' + author = 'Robrecht Cannoodt, Angela Oliveira Pisco' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/multi_star_to_h5mu/nextflow_labels.config b/target/nextflow/mapping/multi_star_to_h5mu/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/multi_star_to_h5mu/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/multi_star_to_h5mu/nextflow_schema.json b/target/nextflow/mapping/multi_star_to_h5mu/nextflow_schema.json new file mode 100644 index 00000000..fd17e3d8 --- /dev/null +++ b/target/nextflow/mapping/multi_star_to_h5mu/nextflow_schema.json @@ -0,0 +1,58 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "multi_star_to_h5mu", + "description": "Convert the output of `multi_star` to a h5mu.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The directory created by `multi_star`", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/foo\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/samtools_sort/.config.vsh.yaml b/target/nextflow/mapping/samtools_sort/.config.vsh.yaml new file mode 100644 index 00000000..b2bfbf63 --- /dev/null +++ b/target/nextflow/mapping/samtools_sort/.config.vsh.yaml @@ -0,0 +1,331 @@ +name: "samtools_sort" +namespace: "mapping" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input" + arguments: + - type: "file" + name: "--input" + description: "Path to the SAM/BAM/CRAM files containing the mapped reads." + info: + orig_arg: "in_sam" + example: + - "input.bam" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output_bam" + description: "Filename to output the counts to." + info: + orig_arg: "-o" + example: + - "output.bam" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_bai" + description: "BAI-format index for BAM file." + info: null + example: + - "output.bam.bai" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_format" + description: "The output format. By default, samtools tries to select a format\ + \ based on the -o filename extension; if output is to standard output or no\ + \ format can be deduced, bam is selected." + info: + orig_arg: "-O" + example: + - "bam" + required: false + choices: + - "sam" + - "bam" + - "cram" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--compression" + description: "Compression level, from 0 (uncompressed) to 9 (best" + info: + orig_arg: "-l" + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "boolean_true" + name: "--minimizer_cluster" + description: "Sort unmapped reads (those in chromosome \"*\") by their sequence\ + \ minimiser (Schleimer et al., 2003; Roberts et al., 2004), \nalso reverse complementing\ + \ as appropriate. This has the effect of collating some similar data together,\ + \ improving the \ncompressibility of the unmapped sequence. The minimiser kmer\ + \ size is adjusted using the -K option. Note data compressed \nin this manner\ + \ may need to be name collated prior to conversion back to fastq.\n\nMapped\ + \ sequences are sorted by chromosome and position. \n" + info: + orig_arg: "-M" + direction: "input" + - type: "integer" + name: "--minimizer_kmer" + description: "Sets the kmer size to be used in the -M option." + info: + orig_arg: "-K" + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--sort_by_read_names" + description: "Sort by read names (i.e., the QNAME field) rather than by chromosomal\ + \ coordinates." + info: + orig_arg: "-n" + direction: "input" + - type: "string" + name: "--sort_by" + description: "Sort first by this value in the alignment tag, then by position\ + \ or name (if also using -n)." + info: + orig_arg: "-t" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_pg" + description: "Do not add a @PG line to the header of the output file." + info: + orig_arg: "--no-PG" + direction: "input" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Sort and (optionally) index alignments.\n\nReads are sorted by leftmost\ + \ coordinates, or by read name when `--sort_by_read_names` is used.\n\nAn appropriate\ + \ `@HD-SO` sort order header tag will be added or an existing one updated if necessary.\n\ + \nNote that to generate an index file (by specifying `--output_bai`), the default\ + \ coordinate sort must be used.\nThus the `--sort_by_read_names` and `--sort_by\ + \ ` options are incompatible with `--output_bai`. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "samtools" + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/samtools_sort/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/samtools_sort" + executable: "target/nextflow/mapping/samtools_sort/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/samtools_sort/main.nf b/target/nextflow/mapping/samtools_sort/main.nf new file mode 100644 index 00000000..f5e42721 --- /dev/null +++ b/target/nextflow/mapping/samtools_sort/main.nf @@ -0,0 +1,4074 @@ +// samtools_sort main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Angela Oliveira Pisco (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "samtools_sort", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the SAM/BAM/CRAM files containing the mapped reads.", + "info" : { + "orig_arg" : "in_sam" + }, + "example" : [ + "input.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output_bam", + "description" : "Filename to output the counts to.", + "info" : { + "orig_arg" : "-o" + }, + "example" : [ + "output.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_bai", + "description" : "BAI-format index for BAM file.", + "example" : [ + "output.bam.bai" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_format", + "description" : "The output format. By default, samtools tries to select a format based on the -o filename extension; if output is to standard output or no format can be deduced, bam is selected.", + "info" : { + "orig_arg" : "-O" + }, + "example" : [ + "bam" + ], + "required" : false, + "choices" : [ + "sam", + "bam", + "cram" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--compression", + "description" : "Compression level, from 0 (uncompressed) to 9 (best", + "info" : { + "orig_arg" : "-l" + }, + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--minimizer_cluster", + "description" : "Sort unmapped reads (those in chromosome \\"*\\") by their sequence minimiser (Schleimer et al., 2003; Roberts et al., 2004), \nalso reverse complementing as appropriate. This has the effect of collating some similar data together, improving the \ncompressibility of the unmapped sequence. The minimiser kmer size is adjusted using the -K option. Note data compressed \nin this manner may need to be name collated prior to conversion back to fastq.\n\nMapped sequences are sorted by chromosome and position. \n", + "info" : { + "orig_arg" : "-M" + }, + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--minimizer_kmer", + "description" : "Sets the kmer size to be used in the -M option.", + "info" : { + "orig_arg" : "-K" + }, + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--sort_by_read_names", + "description" : "Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates.", + "info" : { + "orig_arg" : "-n" + }, + "direction" : "input" + }, + { + "type" : "string", + "name" : "--sort_by", + "description" : "Sort first by this value in the alignment tag, then by position or name (if also using -n).", + "info" : { + "orig_arg" : "-t" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--no_pg", + "description" : "Do not add a @PG line to the header of the output file.", + "info" : { + "orig_arg" : "--no-PG" + }, + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Sort and (optionally) index alignments.\n\nReads are sorted by leftmost coordinates, or by read name when `--sort_by_read_names` is used.\n\nAn appropriate `@HD-SO` sort order header tag will be added or an existing one updated if necessary.\n\nNote that to generate an index file (by specifying `--output_bai`), the default coordinate sort must be used.\nThus the `--sort_by_read_names` and `--sort_by ` options are incompatible with `--output_bai`. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "samtools", + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pyyaml" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/samtools_sort/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/samtools_sort", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import tempfile +import subprocess +from pathlib import Path +import yaml + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_bam': $( if [ ! -z ${VIASH_PAR_OUTPUT_BAM+x} ]; then echo "r'${VIASH_PAR_OUTPUT_BAM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_bai': $( if [ ! -z ${VIASH_PAR_OUTPUT_BAI+x} ]; then echo "r'${VIASH_PAR_OUTPUT_BAI//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_format': $( if [ ! -z ${VIASH_PAR_OUTPUT_FORMAT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FORMAT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'compression': $( if [ ! -z ${VIASH_PAR_COMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'minimizer_cluster': $( if [ ! -z ${VIASH_PAR_MINIMIZER_CLUSTER+x} ]; then echo "r'${VIASH_PAR_MINIMIZER_CLUSTER//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'minimizer_kmer': $( if [ ! -z ${VIASH_PAR_MINIMIZER_KMER+x} ]; then echo "int(r'${VIASH_PAR_MINIMIZER_KMER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sort_by_read_names': $( if [ ! -z ${VIASH_PAR_SORT_BY_READ_NAMES+x} ]; then echo "r'${VIASH_PAR_SORT_BY_READ_NAMES//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'sort_by': $( if [ ! -z ${VIASH_PAR_SORT_BY+x} ]; then echo "r'${VIASH_PAR_SORT_BY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'no_pg': $( if [ ! -z ${VIASH_PAR_NO_PG+x} ]; then echo "r'${VIASH_PAR_NO_PG//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def generate_args(par, config): + # fetch arguments from config + arguments = [ + arg for group in config["argument_groups"] for arg in group["arguments"] + ] + + cmd_args = [] + + for arg in arguments: + arg_val = par.get(arg["name"].removeprefix("--")) + # The info key is always present (changed in viash 0.7.4) + # in the parsed config (None if not specified in source config) + info = arg["info"] or {} + orig_arg = info.get("orig_arg") + if arg_val and orig_arg: + if not arg.get("multiple", False): + arg_val = [arg_val] + + if arg["type"] in ["boolean_true", "boolean_false"]: + # if argument is a boolean_true or boolean_false, simply add the flag + arg_val = [orig_arg] + elif orig_arg.startswith("-"): + # if the orig arg flag is not a positional, + # add the flag in front of each element and flatten + arg_val = [str(x) for val in arg_val for x in [orig_arg, val]] + + cmd_args.extend(arg_val) + + return cmd_args + + +# read config arguments +config = yaml.safe_load(Path(meta["config"]).read_text()) + +print(">> Constructing command", flush=True) +cmd_args = ["samtools", "sort"] + generate_args(par, config) + +# manually process cpus parameter +if "cpus" in meta and meta["cpus"]: + cmd_args.extend(["--threads", str(meta["cpus"])]) +# add memory +if "memory_mb" in meta and meta["memory_mb"]: + import math + + mem_per_thread = math.ceil(meta["memory_mb"] * 0.8 / meta["cpus"]) + cmd_args.extend(["-m", f"{mem_per_thread}M"]) + +with tempfile.TemporaryDirectory(prefix="samtools-", dir=meta["temp_dir"]) as temp_dir: + # add tempdir + cmd_args.extend(["-T", str(temp_dir + "/")]) + + # run command + print(">> Running samtools sort with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + subprocess.run(cmd_args, check=True) + +if par.get("output_bai"): + print(">> Running samtools index with command:", flush=True) + cmd_index_args = ["samtools", "index", "-b", par["output_bam"], par["output_bai"]] + print("+ " + " ".join([str(x) for x in cmd_index_args]), flush=True) + subprocess.run(cmd_index_args, check=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/samtools_sort", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/samtools_sort/nextflow.config b/target/nextflow/mapping/samtools_sort/nextflow.config new file mode 100644 index 00000000..56cd6cc7 --- /dev/null +++ b/target/nextflow/mapping/samtools_sort/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/samtools_sort' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Sort and (optionally) index alignments.\n\nReads are sorted by leftmost coordinates, or by read name when `--sort_by_read_names` is used.\n\nAn appropriate `@HD-SO` sort order header tag will be added or an existing one updated if necessary.\n\nNote that to generate an index file (by specifying `--output_bai`), the default coordinate sort must be used.\nThus the `--sort_by_read_names` and `--sort_by ` options are incompatible with `--output_bai`. \n' + author = 'Robrecht Cannoodt, Angela Oliveira Pisco' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/samtools_sort/nextflow_labels.config b/target/nextflow/mapping/samtools_sort/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/samtools_sort/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/samtools_sort/nextflow_schema.json b/target/nextflow/mapping/samtools_sort/nextflow_schema.json new file mode 100644 index 00000000..11cbe2f7 --- /dev/null +++ b/target/nextflow/mapping/samtools_sort/nextflow_schema.json @@ -0,0 +1,119 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "samtools_sort", + "description": "Sort and (optionally) index alignments.\n\nReads are sorted by leftmost coordinates, or by read name when `--sort_by_read_names` is used.\n\nAn appropriate `@HD-SO` sort order header tag will be added or an existing one updated if necessary.\n\nNote that to generate an index file (by specifying `--output_bai`), the default coordinate sort must be used.\nThus the `--sort_by_read_names` and `--sort_by ` options are incompatible with `--output_bai`. \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "minimizer_cluster": { + "type": "boolean", + "description": "Sort unmapped reads (those in chromosome \"*\") by their sequence minimiser (Schleimer et al., 2003; Roberts et al., 2004), \nalso reverse complementing as appropriate", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "minimizer_kmer": { + "type": "integer", + "description": "Sets the kmer size to be used in the -M option.", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "sort_by_read_names": { + "type": "boolean", + "description": "Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "sort_by": { + "type": "string", + "description": "Sort first by this value in the alignment tag, then by position or name (if also using -n).", + "help_text": "Type: `string`, multiple: `False`. " + }, + "no_pg": { + "type": "boolean", + "description": "Do not add a @PG line to the header of the output file.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the SAM/BAM/CRAM files containing the mapped reads.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.bam\"`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output_bam": { + "type": "string", + "format": "path", + "description": "Filename to output the counts to.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_bam.bam\"`, direction: `output`, example: `\"output.bam\"`. ", + "default": "$id.$key.output_bam.bam" + }, + "output_bai": { + "type": "string", + "format": "path", + "description": "BAI-format index for BAM file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_bai.bai\"`, direction: `output`, example: `\"output.bam.bai\"`. ", + "default": "$id.$key.output_bai.bai" + }, + "output_format": { + "type": "string", + "description": "The output format", + "help_text": "Type: `string`, multiple: `False`, example: `\"bam\"`, choices: ``sam`, `bam`, `cram``. ", + "enum": [ + "sam", + "bam", + "cram" + ] + }, + "compression": { + "type": "integer", + "description": "Compression level, from 0 (uncompressed) to 9 (best", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/star_align/.config.vsh.yaml b/target/nextflow/mapping/star_align/.config.vsh.yaml new file mode 100644 index 00000000..9f934c8b --- /dev/null +++ b/target/nextflow/mapping/star_align/.config.vsh.yaml @@ -0,0 +1,2420 @@ +name: "star_align" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--input" + alternatives: + - "--readFilesIn" + description: "The FASTQ files to be analyzed. Corresponds to the --readFilesIn\ + \ argument in the STAR command." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir argument in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ argument in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: null + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "string" + name: "--genomeLoad" + description: "mode of shared memory usage for the genome files. Only used with\ + \ --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and\ + \ keep it in memory after run\n- LoadAndRemove ... load genome into shared\ + \ but remove it after run\n- LoadAndExit ... load genome into shared memory\ + \ and exit, keeping the genome in memory for future runs\n- Remove \ + \ ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory\ + \ ... do not use shared memory, each job will have its own private copy of\ + \ the genome" + info: null + example: + - "NoSharedMemory" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--genomeFileSizes" + description: "genome files exact sizes in bytes. Typically, this should not be\ + \ defined by the user." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeTransformOutput" + description: "which output to transform back to original genome\n\n- SAM ...\ + \ SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None \ + \ ... no transformation of the output" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeChrSetMitochondrial" + description: "names of the mitochondrial chromosomes. Presently only used for\ + \ STARsolo statistics output/" + info: null + example: + - "chrM" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: null + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: null + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: null + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: null + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: null + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: null + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: null + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: null + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: null + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: null + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: null + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: null + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: null + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: null + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: null + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: null + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: null + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMtype" + description: "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without\ + \ sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n\ + 2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate\ + \ ... sorted by coordinate. This option will allocate extra memory for sorting\ + \ which can be specified by --limitBAMsortRAM." + info: null + example: + - "SAM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: null + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: null + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: null + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: null + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: null + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: null + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: null + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: null + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: null + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: null + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: null + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: null + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: null + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: null + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: null + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: null + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: null + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: null + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: null + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: null + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: null + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: null + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: null + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: null + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: null + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: null + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: null + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: null + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: null + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: null + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: null + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: null + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: null + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: null + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: null + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: null + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: null + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/star_align/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/star_align" + executable: "target/nextflow/mapping/star_align/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/star_align/main.nf b/target/nextflow/mapping/star_align/main.nf new file mode 100644 index 00000000..b88bf247 --- /dev/null +++ b/target/nextflow/mapping/star_align/main.nf @@ -0,0 +1,6468 @@ +// star_align main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "star_align", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input/Output", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "--readFilesIn" + ], + "description" : "The FASTQ files to be analyzed. Corresponds to the --readFilesIn argument in the STAR command.", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "alternatives" : [ + "--genomeDir" + ], + "description" : "Path to the reference built by star_build_reference. Corresponds to the --genomeDir argument in the STAR command.", + "example" : [ + "/path/to/reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "--outFileNamePrefix" + ], + "description" : "Path to output directory. Corresponds to the --outFileNamePrefix argument in the STAR command.", + "example" : [ + "/path/to/foo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Run Parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--runRNGseed", + "description" : "random number generator seed.", + "example" : [ + 777 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Genome Parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--genomeLoad", + "description" : "mode of shared memory usage for the genome files. Only used with --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and keep it in memory after run\n- LoadAndRemove ... load genome into shared but remove it after run\n- LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs\n- Remove ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome", + "example" : [ + "NoSharedMemory" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genomeFastaFiles", + "description" : "path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.\n\nRequired for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--genomeFileSizes", + "description" : "genome files exact sizes in bytes. Typically, this should not be defined by the user.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--genomeTransformOutput", + "description" : "which output to transform back to original genome\n\n- SAM ... SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None ... no transformation of the output", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--genomeChrSetMitochondrial", + "description" : "names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/", + "example" : [ + "chrM", + "M", + "MT" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Splice Junctions Database", + "arguments" : [ + { + "type" : "string", + "name" : "--sjdbFileChrStartEnd", + "description" : "path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sjdbGTFfile", + "description" : "path to the GTF file with annotations", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFchrPrefix", + "description" : "prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes)", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFfeatureExon", + "description" : "feature type in GTF file to be used as exons for building transcripts", + "example" : [ + "exon" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentTranscript", + "description" : "GTF attribute name for parent transcript ID (default \\"transcript_id\\" works for GTF files)", + "example" : [ + "transcript_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGene", + "description" : "GTF attribute name for parent gene ID (default \\"gene_id\\" works for GTF files)", + "example" : [ + "gene_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneName", + "description" : "GTF attribute name for parent gene name", + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneType", + "description" : "GTF attribute name for parent gene type", + "example" : [ + "gene_type", + "gene_biotype" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbOverhang", + "description" : "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbScore", + "description" : "extra alignment score for alignments that cross database junctions", + "example" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbInsertSave", + "description" : "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ... only small junction / transcript files\n- All ... all files including big Genome, SA and SAindex - this will create a complete genome directory", + "example" : [ + "Basic" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Variation parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--varVCFfile", + "description" : "path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--readFilesType", + "description" : "format of input read files\n\n- Fastx ... FASTA or FASTQ\n- SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view", + "example" : [ + "Fastx" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesSAMattrKeep", + "description" : "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n- None ... do not keep any tags", + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--readFilesManifest", + "description" : "path to the \\"manifest\\" file with the names of read files. The manifest file should contain 3 tab-separated columns:\n\npaired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads: read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but not tabs are allowed in file names.\nIf read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesPrefix", + "description" : "prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesCommand", + "description" : "command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout\n\nFor example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readMapNumber", + "description" : "number of reads to map from the beginning of the file\n\n-1: map all reads", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readMatesLengthsIn", + "description" : "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations.", + "example" : [ + "NotEqual" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readNameSeparator", + "description" : "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "example" : [ + "/" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readQualityScoreBase", + "description" : "number to be subtracted from the ASCII code to get Phred quality score", + "example" : [ + 33 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Clipping", + "arguments" : [ + { + "type" : "string", + "name" : "--clipAdapterType", + "description" : "adapter clipping type\n\n- Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n- CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ... no adapter clipping, all other clip* parameters are disregarded", + "example" : [ + "Hamming" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pNbases", + "description" : "number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--clip3pAdapterSeq", + "description" : "adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence with the length equal to read length", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--clip3pAdapterMMp", + "description" : "max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pAfterAdapterNbases", + "description" : "number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip5pNbases", + "description" : "number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Limits", + "arguments" : [ + { + "type" : "long", + "name" : "--limitGenomeGenerateRAM", + "description" : "maximum available RAM (bytes) for genome generation", + "example" : [ + 31000000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitIObufferSize", + "description" : "max available buffers size (bytes) for input/output, per thread", + "example" : [ + 30000000, + 50000000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitOutSAMoneReadBytes", + "description" : "max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax", + "example" : [ + 100000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJoneRead", + "description" : "max number of junctions for one read (including all multi-mappers)", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJcollapsed", + "description" : "max number of collapsed junctions", + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitBAMsortRAM", + "description" : "maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitSjdbInsertNsj", + "description" : "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitNreadsSoft", + "description" : "soft limit on the number of reads", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: general", + "arguments" : [ + { + "type" : "string", + "name" : "--outTmpKeep", + "description" : "whether to keep the temporary files after STAR runs is finished\n\n- None ... remove all temporary files\n- All ... keep all files", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outStd", + "description" : "which output will be directed to stdout (standard out)\n\n- Log ... log messages\n- SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out\n- BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM", + "example" : [ + "Log" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outReadsUnmapped", + "description" : "output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).\n\n- None ... no output\n- Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outQSconversionAdd", + "description" : "add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outMultimapperOrder", + "description" : "order of multimapping alignments in the output files\n\n- Old_2.4 ... quasi-random order used before 2.5.0\n- Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.", + "example" : [ + "Old_2.4" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: SAM and BAM", + "arguments" : [ + { + "type" : "string", + "name" : "--outSAMtype", + "description" : "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM.", + "example" : [ + "SAM" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMmode", + "description" : "mode of SAM output\n\n- None ... no SAM output\n- Full ... full SAM output\n- NoQS ... full SAM but without quality scores", + "example" : [ + "Full" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMstrandField", + "description" : "Cufflinks-like strand field flag\n\n- None ... not used\n- intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattributes", + "description" : "a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n- None ... no attributes\n- Standard ... NH HI AS nM\n- All ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.\n- AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.\n- nM ... number of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.\n- MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.\n- jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.\n- jI ... start and end of introns for all junctions (1-based).\n- XS ... alignment strand according to --outSAMstrandField.\n- MC ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.\n- cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n- vA ... variant allele\n- vG ... genomic coordinate of the variant overlapped by the read.\n- vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .\n- rB ... alignment block read/genomic coordinates.\n- vR ... read coordinate of the variant.", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMattrIHstart", + "description" : "start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie.", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMunmapped", + "description" : "output of unmapped reads in the SAM format\n\n1st word:\n- None ... no output\n- Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMorder", + "description" : "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "example" : [ + "Paired" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMprimaryFlag", + "description" : "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the best score is primary\n- AllBestScore ... all alignments with the best score are primary", + "example" : [ + "OneBestScore" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMreadID", + "description" : "read ID record type\n\n- Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number (index) in the FASTx file", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmapqUnique", + "description" : "0 to 255: the MAPQ value for unique mappers", + "example" : [ + 255 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagOR", + "description" : "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagAND", + "description" : "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.", + "example" : [ + 65535 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattrRGline", + "description" : "SAM/BAM read group line. The first word contains the read group identifier and must start with \\"ID:\\", e.g. --outSAMattrRGline ID:xxx CN:yy \\"DS:z z z\\".\n\nxxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.\nComma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz \\"DS:z z\\" , ID:yyy DS:yyyy", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderHD", + "description" : "@HD (header) line of the SAM header", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderPG", + "description" : "extra @PG (software) line of the SAM header (in addition to STAR)", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderCommentFile", + "description" : "path to the file with @CO (comment) lines of the SAM header", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMfilter", + "description" : "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmultNmax", + "description" : "max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax) will be output", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMtlen", + "description" : "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMcompression", + "description" : "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingThreadN", + "description" : ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingBinsN", + "description" : ">0: number of genome bins for coordinate-sorting", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "BAM processing", + "arguments" : [ + { + "type" : "string", + "name" : "--bamRemoveDuplicatesType", + "description" : "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ... no duplicate removal/marking\n- UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--bamRemoveDuplicatesMate2basesN", + "description" : "number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Wiggle", + "arguments" : [ + { + "type" : "string", + "name" : "--outWigType", + "description" : "type of signal output, e.g. \\"bedGraph\\" OR \\"bedGraph read1_5p\\". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n- None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only 2nd read", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigStrand", + "description" : "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate strands, str1 and str2\n- Unstranded ... collapsed strands", + "example" : [ + "Stranded" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigReferencesPrefix", + "description" : "prefix matching reference names to include in the output wiggle file, e.g. \\"chr\\", default \\"-\\" - include all references", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigNorm", + "description" : "type of normalization for the signal\n\n- RPM ... reads per million of mapped reads\n- None ... no normalization, \\"raw\\" counts", + "example" : [ + "RPM" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering", + "arguments" : [ + { + "type" : "string", + "name" : "--outFilterType", + "description" : "type of filtering\n\n- Normal ... standard filtering using only current alignment\n- BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab", + "example" : [ + "Normal" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapScoreRange", + "description" : "the score range below the maximum score for multimapping alignments", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapNmax", + "description" : "maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.\n\nOtherwise no alignments will be output, and the read will be counted as \\"mapped to too many loci\\" in the Log.final.out .", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMismatchNmax", + "description" : "alignment will be output only if it has no more mismatches than this value.", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverLmax", + "description" : "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "example" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverReadLmax", + "description" : "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterScoreMin", + "description" : "alignment will be output only if its score is higher than or equal to this value.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterScoreMinOverLread", + "description" : "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMatchNmin", + "description" : "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMatchNminOverLread", + "description" : "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronMotifs", + "description" : "filter alignment using their motifs\n\n- None ... no filtering\n- RemoveNoncanonical ... filter out alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronStrands", + "description" : "filter alignments\n\n- RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands\n- None ... no filtering", + "example" : [ + "RemoveInconsistentStrands" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output splice junctions (SJ.out.tab)", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJtype", + "description" : "type of splice junction output\n\n- Standard ... standard SJ.out.tab output\n- None ... no splice junction output", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering: Splice Junctions", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJfilterReads", + "description" : "which reads to consider for collapsed splice junctions output\n\n- All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping reads only", + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterOverhangMin", + "description" : "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply to annotated junctions", + "example" : [ + 30, + 12, + 12, + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountUniqueMin", + "description" : "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountTotalMin", + "description" : "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterDistToOtherSJmin", + "description" : "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "example" : [ + 10, + 0, + 5, + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterIntronMaxVsReadN", + "description" : "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\ndoes not apply to annotated junctions", + "example" : [ + 50000, + 100000, + 200000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Scoring", + "arguments" : [ + { + "type" : "integer", + "name" : "--scoreGap", + "description" : "splice junction penalty (independent on intron motif)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapNoncan", + "description" : "non-canonical junction penalty (in addition to scoreGap)", + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapGCAG", + "description" : "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "example" : [ + -4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapATAC", + "description" : "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGenomicLengthLog2scale", + "description" : "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelOpen", + "description" : "deletion open penalty", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelBase", + "description" : "deletion extension penalty per base (in addition to scoreDelOpen)", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsOpen", + "description" : "insertion open penalty", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsBase", + "description" : "insertion extension penalty per base (in addition to scoreInsOpen)", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreStitchSJshift", + "description" : "maximum score reduction while searching for SJ boundaries in the stitching step", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Alignments and Seeding", + "arguments" : [ + { + "type" : "integer", + "name" : "--seedSearchStartLmax", + "description" : "defines the search start point through the read - the read is split into pieces no longer than this value", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--seedSearchStartLmaxOverLread", + "description" : "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSearchLmax", + "description" : "defines the maximum length of the seeds, if =0 seed length is not limited", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMultimapNmax", + "description" : "only pieces that map fewer than this value are utilized in the stitching procedure", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerReadNmax", + "description" : "max number of seeds per read", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerWindowNmax", + "description" : "max number of seeds per window", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedNoneLociPerWindow", + "description" : "max number of one seed loci per window", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSplitMin", + "description" : "min length of the seed sequences split by Ns or mate gap", + "example" : [ + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMapMin", + "description" : "min length of seeds to be mapped", + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMin", + "description" : "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "example" : [ + 21 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMax", + "description" : "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignMatesGapMax", + "description" : "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJoverhangMin", + "description" : "minimum overhang (i.e. block size) for spliced alignments", + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJstitchMismatchNmax", + "description" : "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "example" : [ + 0, + -1, + 0, + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJDBoverhangMin", + "description" : "minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSplicedMateMapLmin", + "description" : "minimum mapped length for a read mate that is spliced", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alignSplicedMateMapLminOverLmate", + "description" : "alignSplicedMateMapLmin normalized to mate length", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignWindowsPerReadNmax", + "description" : "max number of windows per read", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerWindowNmax", + "description" : "max number of transcripts per window", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerReadNmax", + "description" : "max number of different alignments per read to consider", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsType", + "description" : "type of read ends alignment\n\n- Local ... standard local alignment with soft-clipping allowed\n- EndToEnd ... force end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment", + "example" : [ + "Local" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsProtrude", + "description" : "allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum number of protrusion bases allowed\n2nd word: string:\n- ConcordantPair ... report alignments with non-zero protrusion as concordant pairs\n- DiscordantPair ... report alignments with non-zero protrusion as discordant pairs", + "example" : [ + "0 ConcordantPair" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignSoftClipAtReferenceEnds", + "description" : "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks", + "example" : [ + "Yes" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignInsertionFlush", + "description" : "how to flush ambiguous insertion positions\n\n- None ... insertions are not flushed\n- Right ... insertions are flushed to the right", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Paired-End reads", + "arguments" : [ + { + "type" : "integer", + "name" : "--peOverlapNbasesMin", + "description" : "minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the \\"merginf of overlapping mates\\" algorithm.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--peOverlapMMp", + "description" : "maximum proportion of mismatched bases in the overlap area", + "example" : [ + 0.01 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Windows, Anchors, Binning", + "arguments" : [ + { + "type" : "integer", + "name" : "--winAnchorMultimapNmax", + "description" : "max number of loci anchors are allowed to map to", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple"''' + ''' : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winBinNbits", + "description" : "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winAnchorDistNbins", + "description" : "max number of bins between two anchors that allows aggregation of anchors into one window", + "example" : [ + 9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winFlankNbins", + "description" : "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "example" : [ + 4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--winReadCoverageRelativeMin", + "description" : "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "example" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winReadCoverageBasesMin", + "description" : "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Chimeric Alignments", + "arguments" : [ + { + "type" : "string", + "name" : "--chimOutType", + "description" : "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n- SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n- WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n- WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments", + "example" : [ + "Junctions" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentMin", + "description" : "minimum length of chimeric segment length, if ==0, no chimeric output", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreMin", + "description" : "minimum total (summed) score of the chimeric segments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreDropMax", + "description" : "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreSeparation", + "description" : "minimum difference (separation) between the best chimeric score and the next one", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreJunctionNonGTAG", + "description" : "penalty for a non-GT/AG chimeric junction", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimJunctionOverhangMin", + "description" : "minimum overhang for a chimeric junction", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentReadGapMax", + "description" : "maximum gap in the read sequence between chimeric segments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--chimFilter", + "description" : "different filters for chimeric alignments\n\n- None ... no filtering\n- banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction", + "example" : [ + "banGenomicN" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMainSegmentMultNmax", + "description" : "maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments.", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapNmax", + "description" : "maximum number of chimeric multi-alignments\n\n- 0 ... use the old scheme for chimeric detection which only considered unique alignments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapScoreRange", + "description" : "the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimNonchimScoreDropMin", + "description" : "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimOutJunctionFormat", + "description" : "formatting type for the Chimeric.out.junction file\n\n- 0 ... no comment lines/headers\n- 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Quantification of Annotations", + "arguments" : [ + { + "type" : "string", + "name" : "--quantMode", + "description" : "types of quantification requested\n\n- - ... none\n- TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file\n- GeneCounts ... count reads per gene", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--quantTranscriptomeBAMcompression", + "description" : "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10 ... maximum compression", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--quantTranscriptomeBan", + "description" : "prohibit various alignment type\n\n- IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM\n- Singleend ... prohibit single-end alignments", + "example" : [ + "IndelSoftclipSingleend" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "2-pass Mapping", + "arguments" : [ + { + "type" : "string", + "name" : "--twopassMode", + "description" : "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--twopass1readsN", + "description" : "number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step.", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "WASP parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--waspOutputMode", + "description" : "WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\n- SAMtag ... add WASP tags to the alignments that pass WASP filtering", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "STARsolo (single cell RNA-seq) parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--soloType", + "description" : "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBwhitelist", + "description" : "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file.\n\n- None ... no whitelist: all cell barcodes are allowed", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBstart", + "description" : "cell barcode start base", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBlen", + "description" : "cell barcode length", + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIstart", + "description" : "UMI start base", + "example" : [ + 17 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIlen", + "description" : "UMI length", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeReadLength", + "description" : "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n- 0 ... not defined, do not check", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeMate", + "description" : "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part of mate 1\n- 2 ... barcode sequence is a part of mate 2", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBposition", + "description" : "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIposition", + "description" : "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 3_9_3_14", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloAdapterSequence", + "description" : "adapter sequence to anchor barcodes. Only one adapter sequence is allowed.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloAdapterMismatchesNmax", + "description" : "maximum number of mismatches allowed in adapter sequence.", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBmatchWLtype", + "description" : "matching the Cell Barcodes to the WhiteList\n\n- Exact ... only exact matches allowed\n- 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match.\n- 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches.\nAllowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline.", + "example" : [ + "1MM_multi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeSeq", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeQual", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloStrand", + "description" : "strandedness of the solo libraries:\n\n- Unstranded ... no strand information\n- Forward ... read strand same as the original RNA molecule\n- Reverse ... read strand opposite to the original RNA molecule", + "example" : [ + "Forward" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloFeatures", + "description" : "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ... genes: reads match the gene transcript\n- SJ ... splice junctions: reported in SJ.out.tab\n- GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n- GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.", + "example" : [ + "Gene" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloMultiMappers", + "description" : "counting method for reads mapping to multiple genes\n\n- Unique ... count only reads that map to unique genes\n- Uniform ... uniformly distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not.\n- EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm", + "example" : [ + "Unique" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIdedup", + "description" : "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows the \\"directional\\" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs\n- Exact ... only exactly matching UMIs are collapsed.\n- NoDedup ... no deduplication of UMIs, count all reads.\n- 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing.", + "example" : [ + "1MM_All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIfiltering", + "description" : "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFileNames", + "description" : "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "example" : [ + "Solo.out/", + "features.tsv", + "barcodes.tsv", + "matrix.mtx" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellFilter", + "description" : "cell filtering type and parameters\n\n- None ... do not output filtered cells\n- TopCells ... only report top cells by UMI count, followed by the exact number of cells\n- CellRanger2.2 ... simple filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\nCan be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN\nThe harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000", + "example" : [ + "CellRanger2.2", + "3000", + "0.99", + "10" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFormatFeaturesGeneField3", + "description" : "field 3 in the Gene features.tsv file. If \\"-\\", then no 3rd field is output.", + "example" : [ + "Gene Expression" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellReadStats", + "description" : "Output reads statistics for each CB\n\n- Standard ... standard output", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using STAR.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "docker", + "env" : [ + "STAR_VERSION 2.7.10b", + "PACKAGES gcc g++ make wget zlib1g-dev unzip" + ] + }, + { + "type" : "docker", + "run" : [ + "apt-get update && \\\\\n apt-get install -y --no-install-recommends ${PACKAGES} && \\\\\n cd /tmp && \\\\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \\\\\n unzip ${STAR_VERSION}.zip && \\\\\n cd STAR-${STAR_VERSION}/source && \\\\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\\\n cp STAR /usr/local/bin && \\\\\n cd / && \\\\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \\\\\n apt-get --purge autoremove -y ${PACKAGES} && \\\\\n apt-get clean\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/star_align/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/star_align", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import re +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'genomeLoad': $( if [ ! -z ${VIASH_PAR_GENOMELOAD+x} ]; then echo "r'${VIASH_PAR_GENOMELOAD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'genomeFileSizes': $( if [ ! -z ${VIASH_PAR_GENOMEFILESIZES+x} ]; then echo "list(map(int, r'${VIASH_PAR_GENOMEFILESIZES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'genomeTransformOutput': $( if [ ! -z ${VIASH_PAR_GENOMETRANSFORMOUTPUT+x} ]; then echo "r'${VIASH_PAR_GENOMETRANSFORMOUTPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'genomeChrSetMitochondrial': $( if [ ! -z ${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL+x} ]; then echo "r'${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMtype': $( if [ ! -z ${VIASH_PAR_OUTSAMTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSAMTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + +# regex for matching R[12] fastq(gz) files +# examples: +# - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz +# - tinygex_S1_L001_I1_001.fastq.gz +fastqgz_regex = r"(.+)_(R\\\\d+)(_\\\\d+)?\\\\.fastq(\\\\.gz)?" + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +# look for fastq files in a directory +def search_fastqs(path: Path) -> list[Path]: + if path.is_dir(): + print( + f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", + flush=True, + ) + value_paths = [ + file for file in path.iterdir() if re.match(fastqgz_regex, file.name) + ] + return value_paths + else: + return [path] + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \\`processPar()\\` generator needs to be adapted +to_rename = { + "input": "readFilesIn", + "reference": "genomeDir", + "output": "outFileNamePrefix", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \\`to_rename\\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory( + prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True +) as temp_dir: + print(">> Check whether input files are directories", flush=True) + new_read_files_in = [] + for path in par["readFilesIn"]: + new_read_files_in.extend(search_fastqs(path)) + par["readFilesIn"] = new_read_files_in + print("", flush=True) + + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeDir", "readFilesIn"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print("Grouping R1/R2 input files into pairs", flush=True) + input_grouped = {} + for path in par["readFilesIn"]: + key = re.search(fastqgz_regex, path.name).group(2) + if key not in input_grouped: + input_grouped[key] = [] + input_grouped[key].append(str(path)) + par["readFilesIn"] = [",".join(val) for val in input_grouped.values()] + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "alignReads" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + # make sure there is a trailing / + par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/" + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/star_align", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/star_align/nextflow.config b/target/nextflow/mapping/star_align/nextflow.config new file mode 100644 index 00000000..a6bbef25 --- /dev/null +++ b/target/nextflow/mapping/star_align/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/star_align' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using STAR.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/star_align/nextflow_labels.config b/target/nextflow/mapping/star_align/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/star_align/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/star_align/nextflow_schema.json b/target/nextflow/mapping/star_align/nextflow_schema.json new file mode 100644 index 00000000..75578396 --- /dev/null +++ b/target/nextflow/mapping/star_align/nextflow_schema.json @@ -0,0 +1,1322 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "star_align", + "description": "Align fastq files using STAR.", + "type": "object", + "$defs": { + "input/output": { + "title": "Input/Output", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The FASTQ files to be analyzed", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the reference built by star_build_reference", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/reference\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/foo\"`. ", + "default": "$id.$key.output" + } + } + }, + "run parameters": { + "title": "Run Parameters", + "type": "object", + "description": "No description", + "properties": { + "runRNGseed": { + "type": "integer", + "description": "random number generator seed.", + "help_text": "Type: `integer`, multiple: `False`, example: `777`. " + } + } + }, + "genome parameters": { + "title": "Genome Parameters", + "type": "object", + "description": "No description", + "properties": { + "genomeLoad": { + "type": "string", + "description": "mode of shared memory usage for the genome files", + "help_text": "Type: `string`, multiple: `False`, example: `\"NoSharedMemory\"`. " + }, + "genomeFastaFiles": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "path(s) to the fasta files with the genome sequences, separated by spaces", + "help_text": "Type: `file`, multiple: `True`, direction: `input`. " + }, + "genomeFileSizes": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "genome files exact sizes in bytes", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "genomeTransformOutput": { + "type": "array", + "items": { + "type": "string" + }, + "description": "which output to transform back to original genome\n\n- SAM ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "genomeChrSetMitochondrial": { + "type": "array", + "items": { + "type": "string" + }, + "description": "names of the mitochondrial chromosomes", + "help_text": "Type: `string`, multiple: `True`, example: `[\"chrM\";\"M\";\"MT\"]`. " + } + } + }, + "splice junctions database": { + "title": "Splice Junctions Database", + "type": "object", + "description": "No description", + "properties": { + "sjdbFileChrStartEnd": { + "type": "array", + "items": { + "type": "string" + }, + "description": "path to the files with genomic coordinates (chr start end strand) for the splice junction introns", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sjdbGTFfile": { + "type": "string", + "format": "path", + "description": "path to the GTF file with annotations", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sjdbGTFchrPrefix": { + "type": "string", + "description": "prefix for chromosome names in a GTF file (e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sjdbGTFfeatureExon": { + "type": "string", + "description": "feature type in GTF file to be used as exons for building transcripts", + "help_text": "Type: `string`, multiple: `False`, example: `\"exon\"`. " + }, + "sjdbGTFtagExonParentTranscript": { + "type": "string", + "description": "GTF attribute name for parent transcript ID (default \"transcript_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"transcript_id\"`. " + }, + "sjdbGTFtagExonParentGene": { + "type": "string", + "description": "GTF attribute name for parent gene ID (default \"gene_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_id\"`. " + }, + "sjdbGTFtagExonParentGeneName": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene name", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_name\"]`. " + }, + "sjdbGTFtagExonParentGeneType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene type", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_type\";\"gene_biotype\"]`. " + }, + "sjdbOverhang": { + "type": "integer", + "description": "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "sjdbScore": { + "type": "integer", + "description": "extra alignment score for alignments that cross database junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `2`. " + }, + "sjdbInsertSave": { + "type": "string", + "description": "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`. " + } + } + }, + "variation parameters": { + "title": "Variation parameters", + "type": "object", + "description": "No description", + "properties": { + "varVCFfile": { + "type": "string", + "description": "path to the VCF file that contains variation data", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "read parameters": { + "title": "Read Parameters", + "type": "object", + "description": "No description", + "properties": { + "readFilesType": { + "type": "string", + "description": "format of input read files\n\n- Fastx ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Fastx\"`. " + }, + "readFilesSAMattrKeep": { + "type": "array", + "items": { + "type": "string" + }, + "description": "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"All\"]`. " + }, + "readFilesManifest": { + "type": "string", + "format": "path", + "description": "path to the \"manifest\" file with the names of read files", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "readFilesPrefix": { + "type": "string", + "description": "prefix for the read files names, i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "readFilesCommand": { + "type": "array", + "items": { + "type": "string" + }, + "description": "command line to execute for each of the input file", + "help_text": "Type: `string`, multiple: `True`. " + }, + "readMapNumber": { + "type": "integer", + "description": "number of reads to map from the beginning of the file\n\n-1: map all reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "readMatesLengthsIn": { + "type": "string", + "description": "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same", + "help_text": "Type: `string`, multiple: `False`, example: `\"NotEqual\"`. " + }, + "readNameSeparator": { + "type": "array", + "items": { + "type": "string" + }, + "description": "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "help_text": "Type: `string`, multiple: `True`, example: `[\"/\"]`. " + }, + "readQualityScoreBase": { + "type": "integer", + "description": "number to be subtracted from the ASCII code to get Phred quality score", + "help_text": "Type: `integer`, multiple: `False`, example: `33`. " + } + } + }, + "read clipping": { + "title": "Read Clipping", + "type": "object", + "description": "No description", + "properties": { + "clipAdapterType": { + "type": "string", + "description": "adapter clipping type\n\n- Hamming ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Hamming\"`. " + }, + "clip3pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 3p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip3pAdapterSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "adapter sequences to clip from 3p of each mate", + "help_text": "Type: `string`, multiple: `True`. " + }, + "clip3pAdapterMMp": { + "type": "array", + "items": { + "type": "number" + }, + "description": "max proportion of mismatches for 3p adapter clipping for each mate", + "help_text": "Type: `double`, multiple: `True`, example: `[0.1]`. " + }, + "clip3pAfterAdapterNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number of bases to clip from 3p of each mate after the adapter clipping", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip5pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 5p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + } + } + }, + "limits": { + "title": "Limits", + "type": "object", + "description": "No description", + "properties": { + "limitGenomeGenerateRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for genome generation", + "help_text": "Type: `long`, multiple: `False`, example: `31000000000`. " + }, + "limitIObufferSize": { + "type": "array", + "items": { + "type": "string" + }, + "description": "max available buffers size (bytes) for input/output, per thread", + "help_text": "Type: `long`, multiple: `True`, example: `[30000000;50000000]`. " + }, + "limitOutSAMoneReadBytes": { + "type": "string", + "description": "max size of the SAM record (bytes) for one read", + "help_text": "Type: `long`, multiple: `False`, example: `100000`. " + }, + "limitOutSJoneRead": { + "type": "integer", + "description": "max number of junctions for one read (including all multi-mappers)", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "limitOutSJcollapsed": { + "type": "integer", + "description": "max number of collapsed junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitBAMsortRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for sorting BAM", + "help_text": "Type: `long`, multiple: `False`, example: `0`. " + }, + "limitSjdbInsertNsj": { + "type": "integer", + "description": "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitNreadsSoft": { + "type": "integer", + "description": "soft limit on the number of reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "output: general": { + "title": "Output: general", + "type": "object", + "description": "No description", + "properties": { + "outTmpKeep": { + "type": "string", + "description": "whether to keep the temporary files after STAR runs is finished\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outStd": { + "type": "string", + "description": "which output will be directed to stdout (standard out)\n\n- Log ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Log\"`. " + }, + "outReadsUnmapped": { + "type": "string", + "description": "output of unmapped and partially mapped (i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outQSconversionAdd": { + "type": "integer", + "description": "add this number to the quality score (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outMultimapperOrder": { + "type": "string", + "description": "order of multimapping alignments in the output files\n\n- Old_2.4 ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Old_2.4\"`. " + } + } + }, + "output: sam and bam": { + "title": "Output: SAM and BAM", + "type": "object", + "description": "No description", + "properties": { + "outSAMtype": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of SAM/BAM output\n\n1st word:\n- BAM ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"SAM\"]`. " + }, + "outSAMmode": { + "type": "string", + "description": "mode of SAM output\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Full\"`. " + }, + "outSAMstrandField": { + "type": "string", + "description": "Cufflinks-like strand field flag\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMattributes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "a string of desired SAM attributes, in the order desired for the output SAM", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Standard\"]`. " + }, + "outSAMattrIHstart": { + "type": "integer", + "description": "start value for the IH attribute", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outSAMunmapped": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output of unmapped reads in the SAM format\n\n1st word:\n- None ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMorder": { + "type": "string", + "description": "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "help_text": "Type: `string`, multiple: `False`, example: `\"Paired\"`. " + }, + "outSAMprimaryFlag": { + "type": "string", + "description": "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"OneBestScore\"`. " + }, + "outSAMreadID": { + "type": "string", + "description": "read ID record type\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + }, + "outSAMmapqUnique": { + "type": "integer", + "description": "0 to 255: the MAPQ value for unique mappers", + "help_text": "Type: `integer`, multiple: `False`, example: `255`. " + }, + "outSAMflagOR": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outSAMflagAND": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `65535`. " + }, + "outSAMattrRGline": { + "type": "array", + "items": { + "type": "string" + }, + "description": "SAM/BAM read group line", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderHD": { + "type": "array", + "items": { + "type": "string" + }, + "description": "@HD (header) line of the SAM header", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderPG": { + "type": "array", + "items": { + "type": "string" + }, + "description": "extra @PG (software) line of the SAM header (in addition to STAR)", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderCommentFile": { + "type": "string", + "description": "path to the file with @CO (comment) lines of the SAM header", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMfilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMmultNmax": { + "type": "integer", + "description": "max number of multiple alignments for a read that will be output to the SAM/BAM files", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "outSAMtlen": { + "type": "integer", + "description": "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMcompression": { + "type": "integer", + "description": "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMsortingThreadN": { + "type": "integer", + "description": ">=0: number of threads for BAM sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outBAMsortingBinsN": { + "type": "integer", + "description": ">0: number of genome bins for coordinate-sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + } + } + }, + "bam processing": { + "title": "BAM processing", + "type": "object", + "description": "No description", + "properties": { + "bamRemoveDuplicatesType": { + "type": "string", + "description": "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "bamRemoveDuplicatesMate2basesN": { + "type": "integer", + "description": "number of bases from the 5' of mate 2 to use in collapsing (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "output wiggle": { + "title": "Output Wiggle", + "type": "object", + "description": "No description", + "properties": { + "outWigType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of signal output, e.g", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outWigStrand": { + "type": "string", + "description": "strandedness of wiggle/bedGraph output\n\n- Stranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Stranded\"`. " + }, + "outWigReferencesPrefix": { + "type": "string", + "description": "prefix matching reference names to include in the output wiggle file, e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outWigNorm": { + "type": "string", + "description": "type of normalization for the signal\n\n- RPM ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RPM\"`. " + } + } + }, + "output filtering": { + "title": "Output Filtering", + "type": "object", + "description": "No description", + "properties": { + "outFilterType": { + "type": "string", + "description": "type of filtering\n\n- Normal ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Normal\"`. " + }, + "outFilterMultimapScoreRange": { + "type": "integer", + "description": "the score range below the maximum score for multimapping alignments", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outFilterMultimapNmax": { + "type": "integer", + "description": "maximum number of loci the read is allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNmax": { + "type": "integer", + "description": "alignment will be output only if it has no more mismatches than this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNoverLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `0.3`. " + }, + "outFilterMismatchNoverReadLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "outFilterScoreMin": { + "type": "integer", + "description": "alignment will be output only if its score is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterScoreMinOverLread": { + "type": "number", + "description": "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterMatchNmin": { + "type": "integer", + "description": "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterMatchNminOverLread": { + "type": "number", + "description": "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterIntronMotifs": { + "type": "string", + "description": "filter alignment using their motifs\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outFilterIntronStrands": { + "type": "string", + "description": "filter alignments\n\n- RemoveInconsistentStrands ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RemoveInconsistentStrands\"`. " + } + } + }, + "output splice junctions (sj.out.tab)": { + "title": "Output splice junctions (SJ.out.tab)", + "type": "object", + "description": "No description", + "properties": { + "outSJtype": { + "type": "string", + "description": "type of splice junction output\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + } + } + }, + "output filtering: splice junctions": { + "title": "Output Filtering: Splice Junctions", + "type": "object", + "description": "No description", + "properties": { + "outSJfilterReads": { + "type": "string", + "description": "which reads to consider for collapsed splice junctions output\n\n- All ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"All\"`. " + }, + "outSJfilterOverhangMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[30;12;12;12]`. " + }, + "outSJfilterCountUniqueMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterCountTotalMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterDistToOtherSJmin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "help_text": "Type: `integer`, multiple: `True`, example: `[10;0;5;10]`. " + }, + "outSJfilterIntronMaxVsReadN": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e", + "help_text": "Type: `integer`, multiple: `True`, example: `[50000;100000;200000]`. " + } + } + }, + "scoring": { + "title": "Scoring", + "type": "object", + "description": "No description", + "properties": { + "scoreGap": { + "type": "integer", + "description": "splice junction penalty (independent on intron motif)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreGapNoncan": { + "type": "integer", + "description": "non-canonical junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGapGCAG": { + "type": "integer", + "description": "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-4`. " + }, + "scoreGapATAC": { + "type": "integer", + "description": "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGenomicLengthLog2scale": { + "type": "integer", + "description": "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreDelOpen": { + "type": "integer", + "description": "deletion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreDelBase": { + "type": "integer", + "description": "deletion extension penalty per base (in addition to scoreDelOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsOpen": { + "type": "integer", + "description": "insertion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsBase": { + "type": "integer", + "description": "insertion extension penalty per base (in addition to scoreInsOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreStitchSJshift": { + "type": "integer", + "description": "maximum score reduction while searching for SJ boundaries in the stitching step", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + } + } + }, + "alignments and seeding": { + "title": "Alignments and Seeding", + "type": "object", + "description": "No description", + "properties": { + "seedSearchStartLmax": { + "type": "integer", + "description": "defines the search start point through the read - the read is split into pieces no longer than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedSearchStartLmaxOverLread": { + "type": "number", + "description": "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "seedSearchLmax": { + "type": "integer", + "description": "defines the maximum length of the seeds, if =0 seed length is not limited", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "seedMultimapNmax": { + "type": "integer", + "description": "only pieces that map fewer than this value are utilized in the stitching procedure", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "seedPerReadNmax": { + "type": "integer", + "description": "max number of seeds per read", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "seedPerWindowNmax": { + "type": "integer", + "description": "max number of seeds per window", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedNoneLociPerWindow": { + "type": "integer", + "description": "max number of one seed loci per window", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "seedSplitMin": { + "type": "integer", + "description": "min length of the seed sequences split by Ns or mate gap", + "help_text": "Type: `integer`, multiple: `False`, example: `12`. " + }, + "seedMapMin": { + "type": "integer", + "description": "min length of seeds to be mapped", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignIntronMin": { + "type": "integer", + "description": "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "help_text": "Type: `integer`, multiple: `False`, example: `21`. " + }, + "alignIntronMax": { + "type": "integer", + "description": "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignMatesGapMax": { + "type": "integer", + "description": "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSJoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignSJstitchMismatchNmax": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "help_text": "Type: `integer`, multiple: `True`, example: `[0;-1;0;0]`. " + }, + "alignSJDBoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "alignSplicedMateMapLmin": { + "type": "integer", + "description": "minimum mapped length for a read mate that is spliced", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSplicedMateMapLminOverLmate": { + "type": "number", + "description": "alignSplicedMateMapLmin normalized to mate length", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "alignWindowsPerReadNmax": { + "type": "integer", + "description": "max number of windows per read", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignTranscriptsPerWindowNmax": { + "type": "integer", + "description": "max number of transcripts per window", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "alignTranscriptsPerReadNmax": { + "type": "integer", + "description": "max number of different alignments per read to consider", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignEndsType": { + "type": "string", + "description": "type of read ends alignment\n\n- Local ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Local\"`. " + }, + "alignEndsProtrude": { + "type": "string", + "description": "allow protrusion of alignment ends, i.e", + "help_text": "Type: `string`, multiple: `False`, example: `\"0 ConcordantPair\"`. " + }, + "alignSoftClipAtReferenceEnds": { + "type": "string", + "description": "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Yes\"`. " + }, + "alignInsertionFlush": { + "type": "string", + "description": "how to flush ambiguous insertion positions\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "paired-end reads": { + "title": "Paired-End reads", + "type": "object", + "description": "No description", + "properties": { + "peOverlapNbasesMin": { + "type": "integer", + "description": "minimum number of overlapping bases to trigger mates merging and realignment", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "peOverlapMMp": { + "type": "number", + "description": "maximum proportion of mismatched bases in the overlap area", + "help_text": "Type: `double`, multiple: `False`, example: `0.01`. " + } + } + }, + "windows, anchors, binning": { + "title": "Windows, Anchors, Binning", + "type": "object", + "description": "No description", + "properties": { + "winAnchorMultimapNmax": { + "type": "integer", + "description": "max number of loci anchors are allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "winBinNbits": { + "type": "integer", + "description": "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "winAnchorDistNbins": { + "type": "integer", + "description": "max number of bins between two anchors that allows aggregation of anchors into one window", + "help_text": "Type: `integer`, multiple: `False`, example: `9`. " + }, + "winFlankNbins": { + "type": "integer", + "description": "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "help_text": "Type: `integer`, multiple: `False`, example: `4`. " + }, + "winReadCoverageRelativeMin": { + "type": "number", + "description": "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "help_text": "Type: `double`, multiple: `False`, example: `0.5`. " + }, + "winReadCoverageBasesMin": { + "type": "integer", + "description": "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "chimeric alignments": { + "title": "Chimeric Alignments", + "type": "object", + "description": "No description", + "properties": { + "chimOutType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of chimeric output\n\n- Junctions ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Junctions\"]`. " + }, + "chimSegmentMin": { + "type": "integer", + "description": "minimum length of chimeric segment length, if ==0, no chimeric output", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreMin": { + "type": "integer", + "description": "minimum total (summed) score of the chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreDropMax": { + "type": "integer", + "description": "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimScoreSeparation": { + "type": "integer", + "description": "minimum difference (separation) between the best chimeric score and the next one", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimScoreJunctionNonGTAG": { + "type": "integer", + "description": "penalty for a non-GT/AG chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "chimJunctionOverhangMin": { + "type": "integer", + "description": "minimum overhang for a chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimSegmentReadGapMax": { + "type": "integer", + "description": "maximum gap in the read sequence between chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "different filters for chimeric alignments\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"banGenomicN\"]`. " + }, + "chimMainSegmentMultNmax": { + "type": "integer", + "description": "maximum number of multi-alignments for the main chimeric segment", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimMultimapNmax": { + "type": "integer", + "description": "maximum number of chimeric multi-alignments\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimMultimapScoreRange": { + "type": "integer", + "description": "the score range for multi-mapping chimeras below the best chimeric score", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "chimNonchimScoreDropMin": { + "type": "integer", + "description": "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimOutJunctionFormat": { + "type": "integer", + "description": "formatting type for the Chimeric.out.junction file\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "quantification of annotations": { + "title": "Quantification of Annotations", + "type": "object", + "description": "No description", + "properties": { + "quantMode": { + "type": "array", + "items": { + "type": "string" + }, + "description": "types of quantification requested\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "quantTranscriptomeBAMcompression": { + "type": "integer", + "description": "-2 to 10 transcriptome BAM compression level\n\n- -2 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "quantTranscriptomeBan": { + "type": "string", + "description": "prohibit various alignment type\n\n- IndelSoftclipSingleend ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"IndelSoftclipSingleend\"`. " + } + } + }, + "2-pass mapping": { + "title": "2-pass Mapping", + "type": "object", + "description": "No description", + "properties": { + "twopassMode": { + "type": "string", + "description": "2-pass mapping mode.\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "twopass1readsN": { + "type": "integer", + "description": "number of reads to process for the 1st step", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "wasp parameters": { + "title": "WASP parameters", + "type": "object", + "description": "No description", + "properties": { + "waspOutputMode": { + "type": "string", + "description": "WASP allele-specific output type", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "starsolo (single cell rna-seq) parameters": { + "title": "STARsolo (single cell RNA-seq) parameters", + "type": "object", + "description": "No description", + "properties": { + "soloType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of single-cell RNA-seq\n\n- CB_UMI_Simple ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBwhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file(s) with whitelist(s) of cell barcodes", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBstart": { + "type": "integer", + "description": "cell barcode start base", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBlen": { + "type": "integer", + "description": "cell barcode length", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "soloUMIstart": { + "type": "integer", + "description": "UMI start base", + "help_text": "Type: `integer`, multiple: `False`, example: `17`. " + }, + "soloUMIlen": { + "type": "integer", + "description": "UMI length", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "soloBarcodeReadLength": { + "type": "integer", + "description": "length of the barcode read\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloBarcodeMate": { + "type": "integer", + "description": "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "soloCBposition": { + "type": "array", + "items": { + "type": "string" + }, + "description": "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloUMIposition": { + "type": "string", + "description": "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterSequence": { + "type": "string", + "description": "adapter sequence to anchor barcodes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterMismatchesNmax": { + "type": "integer", + "description": "maximum number of mismatches allowed in adapter sequence.", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBmatchWLtype": { + "type": "string", + "description": "matching the Cell Barcodes to the WhiteList\n\n- Exact ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"1MM_multi\"`. " + }, + "soloInputSAMattrBarcodeSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloInputSAMattrBarcodeQual": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloStrand": { + "type": "string", + "description": "strandedness of the solo libraries:\n\n- Unstranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Forward\"`. " + }, + "soloFeatures": { + "type": "array", + "items": { + "type": "string" + }, + "description": "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene\"]`. " + }, + "soloMultiMappers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "counting method for reads mapping to multiple genes\n\n- Unique ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Unique\"]`. " + }, + "soloUMIdedup": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1MM_All\"]`. " + }, + "soloUMIfiltering": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloOutFileNames": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Solo.out/\";\"features.tsv\";\"barcodes.tsv\";\"matrix.mtx\"]`. " + }, + "soloCellFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "cell filtering type and parameters\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"CellRanger2.2\";\"3000\";\"0.99\";\"10\"]`. " + }, + "soloOutFormatFeaturesGeneField3": { + "type": "array", + "items": { + "type": "string" + }, + "description": "field 3 in the Gene features.tsv file", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene Expression\"]`. " + }, + "soloCellReadStats": { + "type": "string", + "description": "Output reads statistics for each CB\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input/output" + }, + { + "$ref": "#/$defs/run parameters" + }, + { + "$ref": "#/$defs/genome parameters" + }, + { + "$ref": "#/$defs/splice junctions database" + }, + { + "$ref": "#/$defs/variation parameters" + }, + { + "$ref": "#/$defs/read parameters" + }, + { + "$ref": "#/$defs/read clipping" + }, + { + "$ref": "#/$defs/limits" + }, + { + "$ref": "#/$defs/output: general" + }, + { + "$ref": "#/$defs/output: sam and bam" + }, + { + "$ref": "#/$defs/bam processing" + }, + { + "$ref": "#/$defs/output wiggle" + }, + { + "$ref": "#/$defs/output filtering" + }, + { + "$ref": "#/$defs/output splice junctions (sj.out.tab)" + }, + { + "$ref": "#/$defs/output filtering: splice junctions" + }, + { + "$ref": "#/$defs/scoring" + }, + { + "$ref": "#/$defs/alignments and seeding" + }, + { + "$ref": "#/$defs/paired-end reads" + }, + { + "$ref": "#/$defs/windows, anchors, binning" + }, + { + "$ref": "#/$defs/chimeric alignments" + }, + { + "$ref": "#/$defs/quantification of annotations" + }, + { + "$ref": "#/$defs/2-pass mapping" + }, + { + "$ref": "#/$defs/wasp parameters" + }, + { + "$ref": "#/$defs/starsolo (single cell rna-seq) parameters" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/star_align/setup_logger.py b/target/nextflow/mapping/star_align/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/mapping/star_align/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/mapping/star_align_v273a/.config.vsh.yaml b/target/nextflow/mapping/star_align_v273a/.config.vsh.yaml new file mode 100644 index 00000000..3f8969a7 --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/.config.vsh.yaml @@ -0,0 +1,2420 @@ +name: "star_align_v273a" +namespace: "mapping" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--input" + alternatives: + - "--readFilesIn" + description: "The FASTQ files to be analyzed. Corresponds to the --readFilesIn\ + \ in the STAR command." + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "--genomeDir" + description: "Path to the reference built by star_build_reference. Corresponds\ + \ to the --genomeDir in the STAR command." + info: null + example: + - "/path/to/reference" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--outFileNamePrefix" + description: "Path to output directory. Corresponds to the --outFileNamePrefix\ + \ in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Run Parameters" + arguments: + - type: "integer" + name: "--runRNGseed" + description: "random number generator seed." + info: null + example: + - 777 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Genome Parameters" + arguments: + - type: "string" + name: "--genomeLoad" + description: "mode of shared memory usage for the genome files. Only used with\ + \ --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and\ + \ keep it in memory after run\n- LoadAndRemove ... load genome into shared\ + \ but remove it after run\n- LoadAndExit ... load genome into shared memory\ + \ and exit, keeping the genome in memory for future runs\n- Remove \ + \ ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory\ + \ ... do not use shared memory, each job will have its own private copy of\ + \ the genome" + info: null + example: + - "NoSharedMemory" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeFastaFiles" + description: "path(s) to the fasta files with the genome sequences, separated\ + \ by spaces. These files should be plain text FASTA files, they *cannot* be\ + \ zipped.\n\nRequired for the genome generation (--runMode genomeGenerate).\ + \ Can also be used in the mapping (--runMode alignReads) to add extra (new)\ + \ sequences to the genome (e.g. spike-ins)." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--genomeFileSizes" + description: "genome files exact sizes in bytes. Typically, this should not be\ + \ defined by the user." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeTransformOutput" + description: "which output to transform back to original genome\n\n- SAM ...\ + \ SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None \ + \ ... no transformation of the output" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--genomeChrSetMitochondrial" + description: "names of the mitochondrial chromosomes. Presently only used for\ + \ STARsolo statistics output/" + info: null + example: + - "chrM" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Splice Junctions Database" + arguments: + - type: "string" + name: "--sjdbFileChrStartEnd" + description: "path to the files with genomic coordinates (chr start \ + \ end strand) for the splice junction introns. Multiple files can be supplied\ + \ and will be concatenated." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--sjdbGTFfile" + description: "path to the GTF file with annotations" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFchrPrefix" + description: "prefix for chromosome names in a GTF file (e.g. 'chr' for using\ + \ ENSMEBL annotations with UCSC genomes)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFfeatureExon" + description: "feature type in GTF file to be used as exons for building transcripts" + info: null + example: + - "exon" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentTranscript" + description: "GTF attribute name for parent transcript ID (default \"transcript_id\"\ + \ works for GTF files)" + info: null + example: + - "transcript_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGene" + description: "GTF attribute name for parent gene ID (default \"gene_id\" works\ + \ for GTF files)" + info: null + example: + - "gene_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneName" + description: "GTF attribute name for parent gene name" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sjdbGTFtagExonParentGeneType" + description: "GTF attribute name for parent gene type" + info: null + example: + - "gene_type" + - "gene_biotype" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sjdbOverhang" + description: "length of the donor/acceptor sequence on each side of the junctions,\ + \ ideally = (mate_length - 1)" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--sjdbScore" + description: "extra alignment score for alignments that cross database junctions" + info: null + example: + - 2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sjdbInsertSave" + description: "which files to save when sjdb junctions are inserted on the fly\ + \ at the mapping step\n\n- Basic ... only small junction / transcript files\n\ + - All ... all files including big Genome, SA and SAindex - this will create\ + \ a complete genome directory" + info: null + example: + - "Basic" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Variation parameters" + arguments: + - type: "string" + name: "--varVCFfile" + description: "path to the VCF file that contains variation data. The 10th column\ + \ should contain the genotype information, e.g. 0/1" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Parameters" + arguments: + - type: "string" + name: "--readFilesType" + description: "format of input read files\n\n- Fastx ... FASTA or FASTQ\n\ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand\ + \ samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use\ + \ --readFilesCommand samtools view" + info: null + example: + - "Fastx" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesSAMattrKeep" + description: "for --readFilesType SAM SE/PE, which SAM tags to keep in the output\ + \ BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n-\ + \ None ... do not keep any tags" + info: null + example: + - "All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--readFilesManifest" + description: "path to the \"manifest\" file with the names of read files. The\ + \ manifest file should contain 3 tab-separated columns:\n\npaired-end reads:\ + \ read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads:\ + \ read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but\ + \ not tabs are allowed in file names.\nIf read_group_line does not start with\ + \ ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line\ + \ starts with ID:, it can contain several fields separated by $tab$, and all\ + \ fields will be be copied verbatim into SAM @RG header line." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesPrefix" + description: "prefix for the read files names, i.e. it will be added in front\ + \ of the strings in --readFilesIn" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readFilesCommand" + description: "command line to execute for each of the input file. This command\ + \ should generate FASTA or FASTQ text and send it to stdout\n\nFor example:\ + \ zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readMapNumber" + description: "number of reads to map from the beginning of the file\n\n-1: map\ + \ all reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readMatesLengthsIn" + description: "Equal/NotEqual - lengths of names,sequences,qualities for both mates\ + \ are the same / not the same. NotEqual is safe in all situations." + info: null + example: + - "NotEqual" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--readNameSeparator" + description: "character(s) separating the part of the read names that will be\ + \ trimmed in output (read name after space is always trimmed)" + info: null + example: + - "/" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--readQualityScoreBase" + description: "number to be subtracted from the ASCII code to get Phred quality\ + \ score" + info: null + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read Clipping" + arguments: + - type: "string" + name: "--clipAdapterType" + description: "adapter clipping type\n\n- Hamming ... adapter clipping based on\ + \ Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n\ + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes\ + \ Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ...\ + \ no adapter clipping, all other clip* parameters are disregarded" + info: null + example: + - "Hamming" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--clip3pNbases" + description: "number(s) of bases to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--clip3pAdapterSeq" + description: "adapter sequences to clip from 3p of each mate. If one value is\ + \ given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence\ + \ with the length equal to read length" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--clip3pAdapterMMp" + description: "max proportion of mismatches for 3p adapter clipping for each mate.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip3pAfterAdapterNbases" + description: "number of bases to clip from 3p of each mate after the adapter clipping.\ + \ If one value is given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--clip5pNbases" + description: "number(s) of bases to clip from 5p of each mate. If one value is\ + \ given, it will be assumed the same for both mates." + info: null + example: + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Limits" + arguments: + - type: "long" + name: "--limitGenomeGenerateRAM" + description: "maximum available RAM (bytes) for genome generation" + info: null + example: + - 31000000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitIObufferSize" + description: "max available buffers size (bytes) for input/output, per thread" + info: null + example: + - 30000000 + - 50000000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "long" + name: "--limitOutSAMoneReadBytes" + description: "max size of the SAM record (bytes) for one read. Recommended value:\ + \ >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax" + info: null + example: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJoneRead" + description: "max number of junctions for one read (including all multi-mappers)" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitOutSJcollapsed" + description: "max number of collapsed junctions" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--limitBAMsortRAM" + description: "maximum available RAM (bytes) for sorting BAM. If =0, it will be\ + \ set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory\ + \ option." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitSjdbInsertNsj" + description: "maximum number of junctions to be inserted to the genome on the\ + \ fly at the mapping stage, including those from annotations and those detected\ + \ in the 1st step of the 2-pass run" + info: null + example: + - 1000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--limitNreadsSoft" + description: "soft limit on the number of reads" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: general" + arguments: + - type: "string" + name: "--outTmpKeep" + description: "whether to keep the temporary files after STAR runs is finished\n\ + \n- None ... remove all temporary files\n- All ... keep all files" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outStd" + description: "which output will be directed to stdout (standard out)\n\n- Log\ + \ ... log messages\n- SAM ... alignments\ + \ in SAM format (which normally are output to Aligned.out.sam file), normal\ + \ standard output will go into Log.std.out\n- BAM_Unsorted ... alignments\ + \ in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate\ + \ ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype\ + \ BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome\ + \ in BAM format, unsorted. Requires --quantMode TranscriptomeSAM" + info: null + example: + - "Log" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outReadsUnmapped" + description: "output of unmapped and partially mapped (i.e. mapped only one mate\ + \ of a paired end read) reads in separate file(s).\n\n- None ... no output\n\ + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outQSconversionAdd" + description: "add this number to the quality score (e.g. to convert from Illumina\ + \ to Sanger, use -31)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outMultimapperOrder" + description: "order of multimapping alignments in the output files\n\n- Old_2.4\ + \ ... quasi-random order used before 2.5.0\n- Random \ + \ ... random order of alignments for each multi-mapper. Read mates (pairs)\ + \ are always adjacent, all alignment for each read stay together. This option\ + \ will become default in the future releases." + info: null + example: + - "Old_2.4" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output: SAM and BAM" + arguments: + - type: "string" + name: "--outSAMtype" + description: "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without\ + \ sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n\ + 2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate\ + \ ... sorted by coordinate. This option will allocate extra memory for sorting\ + \ which can be specified by --limitBAMsortRAM." + info: null + example: + - "SAM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMmode" + description: "mode of SAM output\n\n- None ... no SAM output\n- Full ... full\ + \ SAM output\n- NoQS ... full SAM but without quality scores" + info: null + example: + - "Full" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMstrandField" + description: "Cufflinks-like strand field flag\n\n- None ... not used\n\ + - intronMotif ... strand derived from the intron motif. This option changes\ + \ the output alignments: reads with inconsistent and/or non-canonical introns\ + \ are filtered out." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattributes" + description: "a string of desired SAM attributes, in the order desired for the\ + \ output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n\ + - None ... no attributes\n- Standard ... NH HI AS nM\n- All \ + \ ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number\ + \ of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard\ + \ SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart\ + \ (=1 by default). Standard SAM tag.\n- AS ... local alignment score,\ + \ +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE\ + \ reads, total score for two mates. Stadnard SAM tag.\n- nM ... number\ + \ of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance\ + \ to the reference (number of mismatched + inserted + deleted bases) for each\ + \ mate. Standard SAM tag.\n- MD ... string encoding mismatched and\ + \ deleted reference bases (see standard SAM specifications). Standard SAM tag.\n\ + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical;\ + \ 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions\ + \ database is used, and a junction is annotated, 20 is added to its motif value.\n\ + - jI ... start and end of introns for all junctions (1-based).\n- XS\ + \ ... alignment strand according to --outSAMstrandField.\n- MC \ + \ ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all\ + \ segment of all chimeric alingments for --chimOutType WithinBAM output.\n-\ + \ cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n\ + - vA ... variant allele\n- vG ... genomic coordinate of the\ + \ variant overlapped by the read.\n- vW ... 1 - alignment passes WASP\ + \ filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires\ + \ --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality\ + \ scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN \ + \ ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene\ + \ IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected\ + \ cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM\ + \ SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS \ + \ ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ \ + \ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha \ + \ ... haplotype (1/2) when mapping to the diploid genome. Requires genome\ + \ generated with --genomeTransformType Diploid .\n- rB ... alignment\ + \ block read/genomic coordinates.\n- vR ... read coordinate of the\ + \ variant." + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMattrIHstart" + description: "start value for the IH attribute. 0 may be required by some downstream\ + \ software, such as Cufflinks or StringTie." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMunmapped" + description: "output of unmapped reads in the SAM format\n\n1st word:\n- None\ + \ ... no output\n- Within ... output unmapped reads within the main SAM file\ + \ (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for\ + \ each alignment, and, in case of unsorted output, keep it adjacent to its mapped\ + \ mate. Only affects multi-mapping reads." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMorder" + description: "type of sorting for the SAM output\n\nPaired: one mate after the\ + \ other for all paired alignments\nPairedKeepInputOrder: one mate after the\ + \ other for all paired alignments, the order is kept the same as in the input\ + \ FASTQ files" + info: null + example: + - "Paired" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMprimaryFlag" + description: "which alignments are considered primary - all others will be marked\ + \ with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the\ + \ best score is primary\n- AllBestScore ... all alignments with the best score\ + \ are primary" + info: null + example: + - "OneBestScore" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMreadID" + description: "read ID record type\n\n- Standard ... first word (until space) from\ + \ the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number\ + \ (index) in the FASTx file" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMmapqUnique" + description: "0 to 255: the MAPQ value for unique mappers" + info: null + example: + - 255 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagOR" + description: "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e.\ + \ FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, and after outSAMflagAND. Can be used to set specific bits that are not\ + \ set otherwise." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMflagAND" + description: "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e.\ + \ FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by\ + \ STAR, but before outSAMflagOR. Can be used to unset specific bits that are\ + \ not set otherwise." + info: null + example: + - 65535 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMattrRGline" + description: "SAM/BAM read group line. The first word contains the read group\ + \ identifier and must start with \"ID:\", e.g. --outSAMattrRGline ID:xxx CN:yy\ + \ \"DS:z z z\".\n\nxxx will be added as RG tag to each output alignment. Any\ + \ spaces in the tag values have to be double quoted.\nComma separated RG lines\ + \ correspons to different (comma separated) input files in --readFilesIn. Commas\ + \ have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz\ + \ \"DS:z z\" , ID:yyy DS:yyyy" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderHD" + description: "@HD (header) line of the SAM header" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderPG" + description: "extra @PG (software) line of the SAM header (in addition to STAR)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outSAMheaderCommentFile" + description: "path to the file with @CO (comment) lines of the SAM header" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outSAMfilter" + description: "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences\ + \ ... only keep the reads for which all alignments are to the extra reference\ + \ sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences\ + \ ... keep all alignments to the extra reference sequences added with --genomeFastaFiles\ + \ at the mapping stage." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSAMmultNmax" + description: "max number of multiple alignments for a read that will be output\ + \ to the SAM/BAM files. Note that if this value is not equal to -1, the top\ + \ scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax)\ + \ will be output" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSAMtlen" + description: "calculation method for the TLEN field in the SAM/BAM files\n\n-\ + \ 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate.\ + \ (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost\ + \ base of any mate. (+)sign for the mate with the leftmost base. This is different\ + \ from 1 for overlapping mates with protruding ends" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMcompression" + description: "-1 to 10 BAM compression level, -1=default compression (6?), 0=no\ + \ compression, 10=maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingThreadN" + description: ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN)." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outBAMsortingBinsN" + description: ">0: number of genome bins for coordinate-sorting" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BAM processing" + arguments: + - type: "string" + name: "--bamRemoveDuplicatesType" + description: "mark duplicates in the BAM file, for now only works with (i) sorted\ + \ BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- -\ + \ ... no duplicate removal/marking\n- UniqueIdentical\ + \ ... mark all multimappers, and duplicate unique mappers. The coordinates,\ + \ FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate\ + \ unique mappers but not multimappers." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--bamRemoveDuplicatesMate2basesN" + description: "number of bases from the 5' of mate 2 to use in collapsing (e.g.\ + \ for RAMPAGE)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Wiggle" + arguments: + - type: "string" + name: "--outWigType" + description: "type of signal output, e.g. \"bedGraph\" OR \"bedGraph read1_5p\"\ + . Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n\ + - None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle\ + \ ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of\ + \ the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only\ + \ 2nd read" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--outWigStrand" + description: "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate\ + \ strands, str1 and str2\n- Unstranded ... collapsed strands" + info: null + example: + - "Stranded" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigReferencesPrefix" + description: "prefix matching reference names to include in the output wiggle\ + \ file, e.g. \"chr\", default \"-\" - include all references" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outWigNorm" + description: "type of normalization for the signal\n\n- RPM ... reads per million\ + \ of mapped reads\n- None ... no normalization, \"raw\" counts" + info: null + example: + - "RPM" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering" + arguments: + - type: "string" + name: "--outFilterType" + description: "type of filtering\n\n- Normal ... standard filtering using only\ + \ current alignment\n- BySJout ... keep only those reads that contain junctions\ + \ that passed filtering into SJ.out.tab" + info: null + example: + - "Normal" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapScoreRange" + description: "the score range below the maximum score for multimapping alignments" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMultimapNmax" + description: "maximum number of loci the read is allowed to map to. Alignments\ + \ (all of them) will be output only if the read maps to no more loci than this\ + \ value.\n\nOtherwise no alignments will be output, and the read will be counted\ + \ as \"mapped to too many loci\" in the Log.final.out ." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMismatchNmax" + description: "alignment will be output only if it has no more mismatches than\ + \ this value." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverLmax" + description: "alignment will be output only if its ratio of mismatches to *mapped*\ + \ length is less than or equal to this value." + info: null + example: + - 0.3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMismatchNoverReadLmax" + description: "alignment will be output only if its ratio of mismatches to *read*\ + \ length is less than or equal to this value." + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterScoreMin" + description: "alignment will be output only if its score is higher than or equal\ + \ to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterScoreMinOverLread" + description: "same as outFilterScoreMin, but normalized to read length (sum of\ + \ mates' lengths for paired-end reads)" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outFilterMatchNmin" + description: "alignment will be output only if the number of matched bases is\ + \ higher than or equal to this value." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--outFilterMatchNminOverLread" + description: "sam as outFilterMatchNmin, but normalized to the read length (sum\ + \ of mates' lengths for paired-end reads)." + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronMotifs" + description: "filter alignment using their motifs\n\n- None \ + \ ... no filtering\n- RemoveNoncanonical ... filter out\ + \ alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated\ + \ ... filter out alignments that contain non-canonical unannotated junctions\ + \ when using annotated splice junctions database. The annotated non-canonical\ + \ junctions will be kept." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--outFilterIntronStrands" + description: "filter alignments\n\n- RemoveInconsistentStrands ... remove\ + \ alignments that have junctions with inconsistent strands\n- None \ + \ ... no filtering" + info: null + example: + - "RemoveInconsistentStrands" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output splice junctions (SJ.out.tab)" + arguments: + - type: "string" + name: "--outSJtype" + description: "type of splice junction output\n\n- Standard ... standard SJ.out.tab\ + \ output\n- None ... no splice junction output" + info: null + example: + - "Standard" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output Filtering: Splice Junctions" + arguments: + - type: "string" + name: "--outSJfilterReads" + description: "which reads to consider for collapsed splice junctions output\n\n\ + - All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping\ + \ reads only" + info: null + example: + - "All" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterOverhangMin" + description: "minimum overhang length for splice junctions on both sides for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply\ + \ to annotated junctions" + info: null + example: + - 30 + - 12 + - 12 + - 12 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountUniqueMin" + description: "minimum uniquely mapping read count per junction for: (1) non-canonical\ + \ motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and\ + \ GT/AT motif. -1 means no output for that motif\n\nJunctions are output if\ + \ one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are\ + \ satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterCountTotalMin" + description: "minimum total (multi-mapping+unique) read count per junction for:\ + \ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif,\ + \ (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions\ + \ are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin\ + \ conditions are satisfied\ndoes not apply to annotated junctions" + info: null + example: + - 3 + - 1 + - 1 + - 1 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterDistToOtherSJmin" + description: "minimum allowed distance to other junctions' donor/acceptor\n\n\ + does not apply to annotated junctions" + info: null + example: + - 10 + - 0 + - 5 + - 10 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--outSJfilterIntronMaxVsReadN" + description: "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ + \ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2\ + \ reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\n\ + does not apply to annotated junctions" + info: null + example: + - 50000 + - 100000 + - 200000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Scoring" + arguments: + - type: "integer" + name: "--scoreGap" + description: "splice junction penalty (independent on intron motif)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapNoncan" + description: "non-canonical junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapGCAG" + description: "GC/AG and CT/GC junction penalty (in addition to scoreGap)" + info: null + example: + - -4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGapATAC" + description: "AT/AC and GT/AT junction penalty (in addition to scoreGap)" + info: null + example: + - -8 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreGenomicLengthLog2scale" + description: "extra score logarithmically scaled with genomic length of the alignment:\ + \ scoreGenomicLengthLog2scale*log2(genomicLength)" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelOpen" + description: "deletion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreDelBase" + description: "deletion extension penalty per base (in addition to scoreDelOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsOpen" + description: "insertion open penalty" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreInsBase" + description: "insertion extension penalty per base (in addition to scoreInsOpen)" + info: null + example: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scoreStitchSJshift" + description: "maximum score reduction while searching for SJ boundaries in the\ + \ stitching step" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Alignments and Seeding" + arguments: + - type: "integer" + name: "--seedSearchStartLmax" + description: "defines the search start point through the read - the read is split\ + \ into pieces no longer than this value" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--seedSearchStartLmaxOverLread" + description: "seedSearchStartLmax normalized to read length (sum of mates' lengths\ + \ for paired-end reads)" + info: null + example: + - 1.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSearchLmax" + description: "defines the maximum length of the seeds, if =0 seed length is not\ + \ limited" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMultimapNmax" + description: "only pieces that map fewer than this value are utilized in the stitching\ + \ procedure" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerReadNmax" + description: "max number of seeds per read" + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedPerWindowNmax" + description: "max number of seeds per window" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedNoneLociPerWindow" + description: "max number of one seed loci per window" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedSplitMin" + description: "min length of the seed sequences split by Ns or mate gap" + info: null + example: + - 12 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seedMapMin" + description: "min length of seeds to be mapped" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMin" + description: "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,\ + \ otherwise it is considered Deletion" + info: null + example: + - 21 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignIntronMax" + description: "maximum intron size, if 0, max intron size will be determined by\ + \ (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignMatesGapMax" + description: "maximum gap between two mates, if 0, max intron gap will be determined\ + \ by (2^winBinNbits)*winAnchorDistNbins" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJoverhangMin" + description: "minimum overhang (i.e. block size) for spliced alignments" + info: null + example: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSJstitchMismatchNmax" + description: "maximum number of mismatches for stitching of the splice junctions\ + \ (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3)\ + \ GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif." + info: null + example: + - 0 + - -1 + - 0 + - 0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--alignSJDBoverhangMin" + description: "minimum overhang (i.e. block size) for annotated (sjdb) spliced\ + \ alignments" + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignSplicedMateMapLmin" + description: "minimum mapped length for a read mate that is spliced" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alignSplicedMateMapLminOverLmate" + description: "alignSplicedMateMapLmin normalized to mate length" + info: null + example: + - 0.66 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignWindowsPerReadNmax" + description: "max number of windows per read" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerWindowNmax" + description: "max number of transcripts per window" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--alignTranscriptsPerReadNmax" + description: "max number of different alignments per read to consider" + info: null + example: + - 10000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsType" + description: "type of read ends alignment\n\n- Local ... standard\ + \ local alignment with soft-clipping allowed\n- EndToEnd ... force\ + \ end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully\ + \ extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12\ + \ ... fully extend only the 5p of the both read1 and read2, all other ends:\ + \ local alignment" + info: null + example: + - "Local" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignEndsProtrude" + description: "allow protrusion of alignment ends, i.e. start (end) of the +strand\ + \ mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum\ + \ number of protrusion bases allowed\n2nd word: string:\n- \ + \ ConcordantPair ... report alignments with non-zero protrusion as concordant\ + \ pairs\n- DiscordantPair ... report alignments with non-zero\ + \ protrusion as discordant pairs" + info: null + example: + - "0 ConcordantPair" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignSoftClipAtReferenceEnds" + description: "allow the soft-clipping of the alignments past the end of the chromosomes\n\ + \n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks" + info: null + example: + - "Yes" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--alignInsertionFlush" + description: "how to flush ambiguous insertion positions\n\n- None ... insertions\ + \ are not flushed\n- Right ... insertions are flushed to the right" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-End reads" + arguments: + - type: "integer" + name: "--peOverlapNbasesMin" + description: "minimum number of overlapping bases to trigger mates merging and\ + \ realignment. Specify >0 value to switch on the \"merginf of overlapping mates\"\ + \ algorithm." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--peOverlapMMp" + description: "maximum proportion of mismatched bases in the overlap area" + info: null + example: + - 0.01 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Windows, Anchors, Binning" + arguments: + - type: "integer" + name: "--winAnchorMultimapNmax" + description: "max number of loci anchors are allowed to map to" + info: null + example: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winBinNbits" + description: "=log2(winBin), where winBin is the size of the bin for the windows/clustering,\ + \ each window will occupy an integer number of bins." + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winAnchorDistNbins" + description: "max number of bins between two anchors that allows aggregation of\ + \ anchors into one window" + info: null + example: + - 9 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winFlankNbins" + description: "log2(winFlank), where win Flank is the size of the left and right\ + \ flanking regions for each window" + info: null + example: + - 4 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--winReadCoverageRelativeMin" + description: "minimum relative coverage of the read sequence by the seeds in a\ + \ window, for STARlong algorithm only." + info: null + example: + - 0.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--winReadCoverageBasesMin" + description: "minimum number of bases covered by the seeds in a window , for STARlong\ + \ algorithm only." + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Chimeric Alignments" + arguments: + - type: "string" + name: "--chimOutType" + description: "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n\ + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n-\ + \ WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n-\ + \ WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental\ + \ chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip\ + \ ... soft-clipping in the CIGAR for supplemental chimeric alignments" + info: null + example: + - "Junctions" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentMin" + description: "minimum length of chimeric segment length, if ==0, no chimeric output" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreMin" + description: "minimum total (summed) score of the chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreDropMax" + description: "max drop (difference) of chimeric score (the sum of scores of all\ + \ chimeric segments) from the read length" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreSeparation" + description: "minimum difference (separation) between the best chimeric score\ + \ and the next one" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimScoreJunctionNonGTAG" + description: "penalty for a non-GT/AG chimeric junction" + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimJunctionOverhangMin" + description: "minimum overhang for a chimeric junction" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimSegmentReadGapMax" + description: "maximum gap in the read sequence between chimeric segments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chimFilter" + description: "different filters for chimeric alignments\n\n- None ... no filtering\n\ + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric\ + \ junction" + info: null + example: + - "banGenomicN" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--chimMainSegmentMultNmax" + description: "maximum number of multi-alignments for the main chimeric segment.\ + \ =1 will prohibit multimapping main segments." + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapNmax" + description: "maximum number of chimeric multi-alignments\n\n- 0 ... use the old\ + \ scheme for chimeric detection which only considered unique alignments" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimMultimapScoreRange" + description: "the score range for multi-mapping chimeras below the best chimeric\ + \ score. Only works with --chimMultimapNmax > 1" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimNonchimScoreDropMin" + description: "to trigger chimeric detection, the drop in the best non-chimeric\ + \ alignment score with respect to the read length has to be greater than this\ + \ value" + info: null + example: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--chimOutJunctionFormat" + description: "formatting type for the Chimeric.out.junction file\n\n- 0 ... no\ + \ comment lines/headers\n- 1 ... comment lines at the end of the file: command\ + \ line and Nreads: total, unique/multi-mapping" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Quantification of Annotations" + arguments: + - type: "string" + name: "--quantMode" + description: "types of quantification requested\n\n- - ... none\n\ + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate\ + \ file\n- GeneCounts ... count reads per gene" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--quantTranscriptomeBAMcompression" + description: "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM\ + \ output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10\ + \ ... maximum compression" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quantTranscriptomeBan" + description: "prohibit various alignment type\n\n- IndelSoftclipSingleend ...\ + \ prohibit indels, soft clipping and single-end alignments - compatible with\ + \ RSEM\n- Singleend ... prohibit single-end alignments" + info: null + example: + - "IndelSoftclipSingleend" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "2-pass Mapping" + arguments: + - type: "string" + name: "--twopassMode" + description: "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic\ + \ ... basic 2-pass mapping, with all 1st pass junctions inserted into\ + \ the genome indices on the fly" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--twopass1readsN" + description: "number of reads to process for the 1st step. Use very large number\ + \ (or default -1) to map all reads in the first step." + info: null + example: + - -1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "WASP parameters" + arguments: + - type: "string" + name: "--waspOutputMode" + description: "WASP allele-specific output type. This is re-implementation of the\ + \ original WASP mappability filtering by Bryce van de Geijn, Graham McVicker,\ + \ Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature\ + \ Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\ + \n- SAMtag ... add WASP tags to the alignments that pass WASP filtering" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STARsolo (single cell RNA-seq) parameters" + arguments: + - type: "string" + name: "--soloType" + description: "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet)\ + \ one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X\ + \ Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length,\ + \ one UMI of fixed length and one adapter sequence of fixed length are allowed\ + \ in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode\ + \ as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2\ + \ if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or\ + \ SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate\ + \ FASTQ (paired- or single-end), barcodes are corresponding read-groups, no\ + \ UMI sequences, alignments deduplicated according to alignment start and end\ + \ (after extending soft-clipped bases)" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCBwhitelist" + description: "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex\ + \ allows more than one whitelist file.\n\n- None ... no whitelist:\ + \ all cell barcodes are allowed" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--soloCBstart" + description: "cell barcode start base" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloCBlen" + description: "cell barcode length" + info: null + example: + - 16 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIstart" + description: "UMI start base" + info: null + example: + - 17 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloUMIlen" + description: "UMI length" + info: null + example: + - 10 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeReadLength" + description: "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n\ + - 0 ... not defined, do not check" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloBarcodeMate" + description: "identifies which read mate contains the barcode (CB+UMI) sequence\n\ + \n- 0 ... barcode sequence is on separate read, which should always be the\ + \ last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part\ + \ of mate 1\n- 2 ... barcode sequence is a part of mate 2" + info: null + example: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBposition" + description: "position of Cell Barcode(s) on the barcode read.\n\nPresently only\ + \ works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\n\ + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor\ + \ defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter\ + \ start; 3: adapter end\nstart(end)Position is the 0-based position with of\ + \ the CB start(end) with respect to the Anchor Base\nString for different barcodes\ + \ are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols,\ + \ 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIposition" + description: "position of the UMI on the barcode read, same as soloCBposition\n\ + \nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition\ + \ 3_9_3_14" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloAdapterSequence" + description: "adapter sequence to anchor barcodes. Only one adapter sequence is\ + \ allowed." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--soloAdapterMismatchesNmax" + description: "maximum number of mismatches allowed in adapter sequence." + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloCBmatchWLtype" + description: "matching the Cell Barcodes to the WhiteList\n\n- Exact \ + \ ... only exact matches allowed\n- 1MM \ + \ ... only one match in whitelist with 1 mismatched base allowed. Allowed\ + \ CBs have to have at least one read with exact match.\n- 1MM_multi \ + \ ... multiple matches in whitelist with 1 mismatched base allowed,\ + \ posterior probability calculation is used choose one of the matches.\nAllowed\ + \ CBs have to have at least one read with exact match. This option matches best\ + \ with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi,\ + \ but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts\ + \ ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for\ + \ CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2\ + \ ... allow up to edit distance of 3 fpr each of the barcodes.\ + \ May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex.\ + \ Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio\ + \ Split-seq pipeline." + info: null + example: + - "1MM_multi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeSeq" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance,\ + \ for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR\ + \ .\nThis parameter is required when running STARsolo with input from SAM." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloInputSAMattrBarcodeQual" + description: "when inputting reads from a SAM file (--readsFileType SAM SE/PE),\ + \ these SAM attributes mark the barcode qualities (in proper order).\n\nFor\ + \ instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual\ + \ CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned\ + \ to all bases." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloStrand" + description: "strandedness of the solo libraries:\n\n- Unstranded ... no strand\ + \ information\n- Forward ... read strand same as the original RNA molecule\n\ + - Reverse ... read strand opposite to the original RNA molecule" + info: null + example: + - "Forward" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--soloFeatures" + description: "genomic features for which the UMI counts per Cell Barcode are collected\n\ + \n- Gene ... genes: reads match the gene transcript\n- SJ \ + \ ... splice junctions: reported in SJ.out.tab\n- GeneFull ...\ + \ full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n\ + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping\ + \ genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS\ + \ ... full gene (pre-RNA): count all reads overlapping genes' exons and\ + \ introns: prioritize >50% overlap with exons. Do not count reads with 100%\ + \ exonic overlap in the antisense direction." + info: null + example: + - "Gene" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloMultiMappers" + description: "counting method for reads mapping to multiple genes\n\n- Unique\ + \ ... count only reads that map to unique genes\n- Uniform ... uniformly\ + \ distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs\ + \ proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique\ + \ ... distribute UMIs proportionally to unique mappers, if present, and uniformly\ + \ if not.\n- EM ... multi-gene UMIs are distributed using Expectation\ + \ Maximization algorithm" + info: null + example: + - "Unique" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIdedup" + description: "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All \ + \ ... all UMIs with 1 mismatch distance to each other are\ + \ collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows\ + \ the \"directional\" method from the UMI-tools by Smith, Heger and Sudbery\ + \ (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools,\ + \ but with more stringent criteria for duplicate UMIs\n- Exact \ + \ ... only exactly matching UMIs are collapsed.\n- NoDedup \ + \ ... no deduplication of UMIs, count all reads.\n- 1MM_CR \ + \ ... CellRanger2-4 algorithm for 1MM UMI collapsing." + info: null + example: + - "1MM_All" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloUMIfiltering" + description: "type of UMI filtering (for reads uniquely mapping to genes)\n\n\ + - - ... basic filtering: remove UMIs with N and homopolymers\ + \ (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count\ + \ UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove\ + \ all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic +\ + \ remove lower-count UMIs that map to more than one gene, matching CellRanger\ + \ > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFileNames" + description: "file names for STARsolo output:\n\nfile_name_prefix gene_names\ + \ barcode_sequences cell_feature_count_matrix" + info: null + example: + - "Solo.out/" + - "features.tsv" + - "barcodes.tsv" + - "matrix.mtx" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellFilter" + description: "cell filtering type and parameters\n\n- None ... do not\ + \ output filtered cells\n- TopCells ... only report top cells by UMI\ + \ count, followed by the exact number of cells\n- CellRanger2.2 ... simple\ + \ filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected\ + \ cells, robust maximum percentile for UMI count, maximum to minimum ratio for\ + \ UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; \ + \ maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering\ + \ in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun\ + \ et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\n\ + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile \ + \ maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR\ + \ simN\nThe harcoded values are from CellRanger: 3000 \ + \ 0.99 10 45000 90000 500 0.01 20000\ + \ 0.01 10000" + info: null + example: + - "CellRanger2.2" + - "3000" + - "0.99" + - "10" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloOutFormatFeaturesGeneField3" + description: "field 3 in the Gene features.tsv file. If \"-\", then no 3rd field\ + \ is output." + info: null + example: + - "Gene Expression" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--soloCellReadStats" + description: "Output reads statistics for each CB\n\n- Standard ... standard\ + \ output" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Align fastq files using STAR." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.3a" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/mapping/star_align_v273a/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/mapping/star_align_v273a" + executable: "target/nextflow/mapping/star_align_v273a/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/mapping/star_align_v273a/main.nf b/target/nextflow/mapping/star_align_v273a/main.nf new file mode 100644 index 00000000..d456a999 --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/main.nf @@ -0,0 +1,6468 @@ +// star_align_v273a main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "star_align_v273a", + "namespace" : "mapping", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input/Output", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "--readFilesIn" + ], + "description" : "The FASTQ files to be analyzed. Corresponds to the --readFilesIn in the STAR command.", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "alternatives" : [ + "--genomeDir" + ], + "description" : "Path to the reference built by star_build_reference. Corresponds to the --genomeDir in the STAR command.", + "example" : [ + "/path/to/reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "--outFileNamePrefix" + ], + "description" : "Path to output directory. Corresponds to the --outFileNamePrefix in the STAR command.", + "example" : [ + "/path/to/foo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Run Parameters", + "arguments" : [ + { + "type" : "integer", + "name" : "--runRNGseed", + "description" : "random number generator seed.", + "example" : [ + 777 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Genome Parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--genomeLoad", + "description" : "mode of shared memory usage for the genome files. Only used with --runMode alignReads.\n\n- LoadAndKeep ... load genome into shared and keep it in memory after run\n- LoadAndRemove ... load genome into shared but remove it after run\n- LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs\n- Remove ... do not map anything, just remove loaded genome from memory\n- NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome", + "example" : [ + "NoSharedMemory" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genomeFastaFiles", + "description" : "path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.\n\nRequired for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--genomeFileSizes", + "description" : "genome files exact sizes in bytes. Typically, this should not be defined by the user.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--genomeTransformOutput", + "description" : "which output to transform back to original genome\n\n- SAM ... SAM/BAM alignments\n- SJ ... splice junctions (SJ.out.tab)\n- None ... no transformation of the output", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--genomeChrSetMitochondrial", + "description" : "names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/", + "example" : [ + "chrM", + "M", + "MT" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Splice Junctions Database", + "arguments" : [ + { + "type" : "string", + "name" : "--sjdbFileChrStartEnd", + "description" : "path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sjdbGTFfile", + "description" : "path to the GTF file with annotations", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFchrPrefix", + "description" : "prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes)", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFfeatureExon", + "description" : "feature type in GTF file to be used as exons for building transcripts", + "example" : [ + "exon" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentTranscript", + "description" : "GTF attribute name for parent transcript ID (default \\"transcript_id\\" works for GTF files)", + "example" : [ + "transcript_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGene", + "description" : "GTF attribute name for parent gene ID (default \\"gene_id\\" works for GTF files)", + "example" : [ + "gene_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneName", + "description" : "GTF attribute name for parent gene name", + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbGTFtagExonParentGeneType", + "description" : "GTF attribute name for parent gene type", + "example" : [ + "gene_type", + "gene_biotype" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbOverhang", + "description" : "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sjdbScore", + "description" : "extra alignment score for alignments that cross database junctions", + "example" : [ + 2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sjdbInsertSave", + "description" : "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ... only small junction / transcript files\n- All ... all files including big Genome, SA and SAindex - this will create a complete genome directory", + "example" : [ + "Basic" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Variation parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--varVCFfile", + "description" : "path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--readFilesType", + "description" : "format of input read files\n\n- Fastx ... FASTA or FASTQ\n- SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view\n- SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view", + "example" : [ + "Fastx" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesSAMattrKeep", + "description" : "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ... keep all tags\n- None ... do not keep any tags", + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--readFilesManifest", + "description" : "path to the \\"manifest\\" file with the names of read files. The manifest file should contain 3 tab-separated columns:\n\npaired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line.\nsingle-end reads: read1_file_name $tab$ - $tab$ read_group_line.\nSpaces, but not tabs are allowed in file names.\nIf read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.\nIf read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesPrefix", + "description" : "prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readFilesCommand", + "description" : "command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout\n\nFor example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readMapNumber", + "description" : "number of reads to map from the beginning of the file\n\n-1: map all reads", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readMatesLengthsIn", + "description" : "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations.", + "example" : [ + "NotEqual" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--readNameSeparator", + "description" : "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "example" : [ + "/" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--readQualityScoreBase", + "description" : "number to be subtracted from the ASCII code to get Phred quality score", + "example" : [ + 33 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read Clipping", + "arguments" : [ + { + "type" : "string", + "name" : "--clipAdapterType", + "description" : "adapter clipping type\n\n- Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp\n- CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal\n- None ... no adapter clipping, all other clip* parameters are disregarded", + "example" : [ + "Hamming" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pNbases", + "description" : "number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--clip3pAdapterSeq", + "description" : "adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.\n\n- polyA ... polyA sequence with the length equal to read length", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--clip3pAdapterMMp", + "description" : "max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip3pAfterAdapterNbases", + "description" : "number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--clip5pNbases", + "description" : "number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Limits", + "arguments" : [ + { + "type" : "long", + "name" : "--limitGenomeGenerateRAM", + "description" : "maximum available RAM (bytes) for genome generation", + "example" : [ + 31000000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitIObufferSize", + "description" : "max available buffers size (bytes) for input/output, per thread", + "example" : [ + 30000000, + 50000000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitOutSAMoneReadBytes", + "description" : "max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax", + "example" : [ + 100000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJoneRead", + "description" : "max number of junctions for one read (including all multi-mappers)", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitOutSJcollapsed", + "description" : "max number of collapsed junctions", + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--limitBAMsortRAM", + "description" : "maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitSjdbInsertNsj", + "description" : "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "example" : [ + 1000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--limitNreadsSoft", + "description" : "soft limit on the number of reads", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: general", + "arguments" : [ + { + "type" : "string", + "name" : "--outTmpKeep", + "description" : "whether to keep the temporary files after STAR runs is finished\n\n- None ... remove all temporary files\n- All ... keep all files", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outStd", + "description" : "which output will be directed to stdout (standard out)\n\n- Log ... log messages\n- SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out\n- BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted\n- BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate\n- BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM", + "example" : [ + "Log" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outReadsUnmapped", + "description" : "output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).\n\n- None ... no output\n- Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outQSconversionAdd", + "description" : "add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outMultimapperOrder", + "description" : "order of multimapping alignments in the output files\n\n- Old_2.4 ... quasi-random order used before 2.5.0\n- Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.", + "example" : [ + "Old_2.4" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output: SAM and BAM", + "arguments" : [ + { + "type" : "string", + "name" : "--outSAMtype", + "description" : "type of SAM/BAM output\n\n1st word:\n- BAM ... output BAM without sorting\n- SAM ... output SAM without sorting\n- None ... no SAM/BAM output\n2nd, 3rd:\n- Unsorted ... standard unsorted\n- SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM.", + "example" : [ + "SAM" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMmode", + "description" : "mode of SAM output\n\n- None ... no SAM output\n- Full ... full SAM output\n- NoQS ... full SAM but without quality scores", + "example" : [ + "Full" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMstrandField", + "description" : "Cufflinks-like strand field flag\n\n- None ... not used\n- intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattributes", + "description" : "a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.\n\n***Presets:\n- None ... no attributes\n- Standard ... NH HI AS nM\n- All ... NH HI AS nM NM MD jM jI MC ch\n***Alignment:\n- NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.\n- HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.\n- AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.\n- nM ... number of mismatches. For PE reads, sum over two mates.\n- NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.\n- MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.\n- jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.\n- jI ... start and end of introns for all junctions (1-based).\n- XS ... alignment strand according to --outSAMstrandField.\n- MC ... mate's CIGAR string. Standard SAM tag.\n- ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.\n- cN ... number of bases clipped from the read ends: 5' and 3'\n***Variation:\n- vA ... variant allele\n- vG ... genomic coordinate of the variant overlapped by the read.\n- vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.\n***STARsolo:\n- CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.\n- GX GN ... gene ID and gene name for unique-gene reads.\n- gx gn ... gene IDs and gene names for unique- and multi-gene reads.\n- CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.\n- sM ... assessment of CB and UMI.\n- sS ... sequence of the entire barcode (CB,UMI,adapter).\n- sQ ... quality of the entire barcode.\n***Unsupported/undocumented:\n- ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .\n- rB ... alignment block read/genomic coordinates.\n- vR ... read coordinate of the variant.", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMattrIHstart", + "description" : "start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie.", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMunmapped", + "description" : "output of unmapped reads in the SAM format\n\n1st word:\n- None ... no output\n- Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)\n2nd word:\n- KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMorder", + "description" : "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "example" : [ + "Paired" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMprimaryFlag", + "description" : "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ... only one alignment with the best score is primary\n- AllBestScore ... all alignments with the best score are primary", + "example" : [ + "OneBestScore" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMreadID", + "description" : "read ID record type\n\n- Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end\n- Number ... read number (index) in the FASTx file", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmapqUnique", + "description" : "0 to 255: the MAPQ value for unique mappers", + "example" : [ + 255 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagOR", + "description" : "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMflagAND", + "description" : "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.", + "example" : [ + 65535 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMattrRGline", + "description" : "SAM/BAM read group line. The first word contains the read group identifier and must start with \\"ID:\\", e.g. --outSAMattrRGline ID:xxx CN:yy \\"DS:z z z\\".\n\nxxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.\nComma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.\n--outSAMattrRGline ID:xxx , ID:zzz \\"DS:z z\\" , ID:yyy DS:yyyy", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderHD", + "description" : "@HD (header) line of the SAM header", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderPG", + "description" : "extra @PG (software) line of the SAM header (in addition to STAR)", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMheaderCommentFile", + "description" : "path to the file with @CO (comment) lines of the SAM header", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outSAMfilter", + "description" : "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.\n- KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMmultNmax", + "description" : "max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first\n\n- -1 ... all alignments (up to --outFilterMultimapNmax) will be output", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSAMtlen", + "description" : "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate\n- 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMcompression", + "description" : "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingThreadN", + "description" : ">=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outBAMsortingBinsN", + "description" : ">0: number of genome bins for coordinate-sorting", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "BAM processing", + "arguments" : [ + { + "type" : "string", + "name" : "--bamRemoveDuplicatesType", + "description" : "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ... no duplicate removal/marking\n- UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical\n- UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--bamRemoveDuplicatesMate2basesN", + "description" : "number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Wiggle", + "arguments" : [ + { + "type" : "string", + "name" : "--outWigType", + "description" : "type of signal output, e.g. \\"bedGraph\\" OR \\"bedGraph read1_5p\\". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .\n\n1st word:\n- None ... no signal output\n- bedGraph ... bedGraph format\n- wiggle ... wiggle format\n2nd word:\n- read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc\n- read2 ... signal from only 2nd read", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigStrand", + "description" : "strandedness of wiggle/bedGraph output\n\n- Stranded ... separate strands, str1 and str2\n- Unstranded ... collapsed strands", + "example" : [ + "Stranded" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigReferencesPrefix", + "description" : "prefix matching reference names to include in the output wiggle file, e.g. \\"chr\\", default \\"-\\" - include all references", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outWigNorm", + "description" : "type of normalization for the signal\n\n- RPM ... reads per million of mapped reads\n- None ... no normalization, \\"raw\\" counts", + "example" : [ + "RPM" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering", + "arguments" : [ + { + "type" : "string", + "name" : "--outFilterType", + "description" : "type of filtering\n\n- Normal ... standard filtering using only current alignment\n- BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab", + "example" : [ + "Normal" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapScoreRange", + "description" : "the score range below the maximum score for multimapping alignments", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMultimapNmax", + "description" : "maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.\n\nOtherwise no alignments will be output, and the read will be counted as \\"mapped to too many loci\\" in the Log.final.out .", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMismatchNmax", + "description" : "alignment will be output only if it has no more mismatches than this value.", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverLmax", + "description" : "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "example" : [ + 0.3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMismatchNoverReadLmax", + "description" : "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterScoreMin", + "description" : "alignment will be output only if its score is higher than or equal to this value.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterScoreMinOverLread", + "description" : "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outFilterMatchNmin", + "description" : "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--outFilterMatchNminOverLread", + "description" : "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronMotifs", + "description" : "filter alignment using their motifs\n\n- None ... no filtering\n- RemoveNoncanonical ... filter out alignments that contain non-canonical junctions\n- RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--outFilterIntronStrands", + "description" : "filter alignments\n\n- RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands\n- None ... no filtering", + "example" : [ + "RemoveInconsistentStrands" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output splice junctions (SJ.out.tab)", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJtype", + "description" : "type of splice junction output\n\n- Standard ... standard SJ.out.tab output\n- None ... no splice junction output", + "example" : [ + "Standard" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output Filtering: Splice Junctions", + "arguments" : [ + { + "type" : "string", + "name" : "--outSJfilterReads", + "description" : "which reads to consider for collapsed splice junctions output\n\n- All ... all reads, unique- and multi-mappers\n- Unique ... uniquely mapping reads only", + "example" : [ + "All" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterOverhangMin", + "description" : "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\ndoes not apply to annotated junctions", + "example" : [ + 30, + 12, + 12, + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountUniqueMin", + "description" : "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterCountTotalMin", + "description" : "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif\n\nJunctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied\ndoes not apply to annotated junctions", + "example" : [ + 3, + 1, + 1, + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterDistToOtherSJmin", + "description" : "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "example" : [ + 10, + 0, + 5, + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--outSJfilterIntronMaxVsReadN", + "description" : "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax\ndoes not apply to annotated junctions", + "example" : [ + 50000, + 100000, + 200000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Scoring", + "arguments" : [ + { + "type" : "integer", + "name" : "--scoreGap", + "description" : "splice junction penalty (independent on intron motif)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapNoncan", + "description" : "non-canonical junction penalty (in addition to scoreGap)", + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapGCAG", + "description" : "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "example" : [ + -4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGapATAC", + "description" : "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "example" : [ + -8 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreGenomicLengthLog2scale", + "description" : "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelOpen", + "description" : "deletion open penalty", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreDelBase", + "description" : "deletion extension penalty per base (in addition to scoreDelOpen)", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsOpen", + "description" : "insertion open penalty", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreInsBase", + "description" : "insertion extension penalty per base (in addition to scoreInsOpen)", + "example" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scoreStitchSJshift", + "description" : "maximum score reduction while searching for SJ boundaries in the stitching step", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Alignments and Seeding", + "arguments" : [ + { + "type" : "integer", + "name" : "--seedSearchStartLmax", + "description" : "defines the search start point through the read - the read is split into pieces no longer than this value", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--seedSearchStartLmaxOverLread", + "description" : "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "example" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSearchLmax", + "description" : "defines the maximum length of the seeds, if =0 seed length is not limited", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMultimapNmax", + "description" : "only pieces that map fewer than this value are utilized in the stitching procedure", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerReadNmax", + "description" : "max number of seeds per read", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedPerWindowNmax", + "description" : "max number of seeds per window", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedNoneLociPerWindow", + "description" : "max number of one seed loci per window", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedSplitMin", + "description" : "min length of the seed sequences split by Ns or mate gap", + "example" : [ + 12 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seedMapMin", + "description" : "min length of seeds to be mapped", + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMin", + "description" : "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "example" : [ + 21 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignIntronMax", + "description" : "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignMatesGapMax", + "description" : "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJoverhangMin", + "description" : "minimum overhang (i.e. block size) for spliced alignments", + "example" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJstitchMismatchNmax", + "description" : "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "example" : [ + 0, + -1, + 0, + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSJDBoverhangMin", + "description" : "minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignSplicedMateMapLmin", + "description" : "minimum mapped length for a read mate that is spliced", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alignSplicedMateMapLminOverLmate", + "description" : "alignSplicedMateMapLmin normalized to mate length", + "example" : [ + 0.66 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignWindowsPerReadNmax", + "description" : "max number of windows per read", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerWindowNmax", + "description" : "max number of transcripts per window", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--alignTranscriptsPerReadNmax", + "description" : "max number of different alignments per read to consider", + "example" : [ + 10000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsType", + "description" : "type of read ends alignment\n\n- Local ... standard local alignment with soft-clipping allowed\n- EndToEnd ... force end-to-end read alignment, do not soft-clip\n- Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment\n- Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment", + "example" : [ + "Local" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignEndsProtrude", + "description" : "allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate\n\n1st word: int: maximum number of protrusion bases allowed\n2nd word: string:\n- ConcordantPair ... report alignments with non-zero protrusion as concordant pairs\n- DiscordantPair ... report alignments with non-zero protrusion as discordant pairs", + "example" : [ + "0 ConcordantPair" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignSoftClipAtReferenceEnds", + "description" : "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ... allow\n- No ... prohibit, useful for compatibility with Cufflinks", + "example" : [ + "Yes" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--alignInsertionFlush", + "description" : "how to flush ambiguous insertion positions\n\n- None ... insertions are not flushed\n- Right ... insertions are flushed to the right", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Paired-End reads", + "arguments" : [ + { + "type" : "integer", + "name" : "--peOverlapNbasesMin", + "description" : "minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the \\"merginf of overlapping mates\\" algorithm.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--peOverlapMMp", + "description" : "maximum proportion of mismatched bases in the overlap area", + "example" : [ + 0.01 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Windows, Anchors, Binning", + "arguments" : [ + { + "type" : "integer", + "name" : "--winAnchorMultimapNmax", + "description" : "max number of loci anchors are allowed to map to", + "example" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "''' + '''multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winBinNbits", + "description" : "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winAnchorDistNbins", + "description" : "max number of bins between two anchors that allows aggregation of anchors into one window", + "example" : [ + 9 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winFlankNbins", + "description" : "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "example" : [ + 4 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--winReadCoverageRelativeMin", + "description" : "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "example" : [ + 0.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--winReadCoverageBasesMin", + "description" : "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Chimeric Alignments", + "arguments" : [ + { + "type" : "string", + "name" : "--chimOutType", + "description" : "type of chimeric output\n\n- Junctions ... Chimeric.out.junction\n- SeparateSAMold ... output old SAM into separate Chimeric.out.sam file\n- WithinBAM ... output into main aligned BAM files (Aligned.*.bam)\n- WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)\n- WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments", + "example" : [ + "Junctions" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentMin", + "description" : "minimum length of chimeric segment length, if ==0, no chimeric output", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreMin", + "description" : "minimum total (summed) score of the chimeric segments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreDropMax", + "description" : "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreSeparation", + "description" : "minimum difference (separation) between the best chimeric score and the next one", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimScoreJunctionNonGTAG", + "description" : "penalty for a non-GT/AG chimeric junction", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimJunctionOverhangMin", + "description" : "minimum overhang for a chimeric junction", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimSegmentReadGapMax", + "description" : "maximum gap in the read sequence between chimeric segments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--chimFilter", + "description" : "different filters for chimeric alignments\n\n- None ... no filtering\n- banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction", + "example" : [ + "banGenomicN" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMainSegmentMultNmax", + "description" : "maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments.", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapNmax", + "description" : "maximum number of chimeric multi-alignments\n\n- 0 ... use the old scheme for chimeric detection which only considered unique alignments", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimMultimapScoreRange", + "description" : "the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimNonchimScoreDropMin", + "description" : "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "example" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--chimOutJunctionFormat", + "description" : "formatting type for the Chimeric.out.junction file\n\n- 0 ... no comment lines/headers\n- 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Quantification of Annotations", + "arguments" : [ + { + "type" : "string", + "name" : "--quantMode", + "description" : "types of quantification requested\n\n- - ... none\n- TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file\n- GeneCounts ... count reads per gene", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--quantTranscriptomeBAMcompression", + "description" : "-2 to 10 transcriptome BAM compression level\n\n- -2 ... no BAM output\n- -1 ... default compression (6?)\n- 0 ... no compression\n- 10 ... maximum compression", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--quantTranscriptomeBan", + "description" : "prohibit various alignment type\n\n- IndelSoftclipSingleend ... prohibit indels, soft clipping and single-end alignments - compatible with RSEM\n- Singleend ... prohibit single-end alignments", + "example" : [ + "IndelSoftclipSingleend" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "2-pass Mapping", + "arguments" : [ + { + "type" : "string", + "name" : "--twopassMode", + "description" : "2-pass mapping mode.\n\n- None ... 1-pass mapping\n- Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--twopass1readsN", + "description" : "number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step.", + "example" : [ + -1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "WASP parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--waspOutputMode", + "description" : "WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .\n\n- SAMtag ... add WASP tags to the alignments that pass WASP filtering", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "STARsolo (single cell RNA-seq) parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--soloType", + "description" : "type of single-cell RNA-seq\n\n- CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.\n- CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).\n- CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]\n- SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBwhitelist", + "description" : "file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file.\n\n- None ... no whitelist: all cell barcodes are allowed", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBstart", + "description" : "cell barcode start base", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloCBlen", + "description" : "cell barcode length", + "example" : [ + 16 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIstart", + "description" : "UMI start base", + "example" : [ + 17 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloUMIlen", + "description" : "UMI length", + "example" : [ + 10 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeReadLength", + "description" : "length of the barcode read\n\n- 1 ... equal to sum of soloCBlen+soloUMIlen\n- 0 ... not defined, do not check", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloBarcodeMate", + "description" : "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed\n- 1 ... barcode sequence is a part of mate 1\n- 2 ... barcode sequence is a part of mate 2", + "example" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBposition", + "description" : "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 0_0_2_-1 3_1_3_8", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIposition", + "description" : "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat. Protocols, 2017):\n--soloCBposition 3_9_3_14", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloAdapterSequence", + "description" : "adapter sequence to anchor barcodes. Only one adapter sequence is allowed.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--soloAdapterMismatchesNmax", + "description" : "maximum number of mismatches allowed in adapter sequence.", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCBmatchWLtype", + "description" : "matching the Cell Barcodes to the WhiteList\n\n- Exact ... only exact matches allowed\n- 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match.\n- 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches.\nAllowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0\n- 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.\n- 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0\n- EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline.", + "example" : [ + "1MM_multi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeSeq", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloInputSAMattrBarcodeQual", + "description" : "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloStrand", + "description" : "strandedness of the solo libraries:\n\n- Unstranded ... no strand information\n- Forward ... read strand same as the original RNA molecule\n- Reverse ... read strand opposite to the original RNA molecule", + "example" : [ + "Forward" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloFeatures", + "description" : "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ... genes: reads match the gene transcript\n- SJ ... splice junctions: reported in SJ.out.tab\n- GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns\n- GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons\n- GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.", + "example" : [ + "Gene" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloMultiMappers", + "description" : "counting method for reads mapping to multiple genes\n\n- Unique ... count only reads that map to unique genes\n- Uniform ... uniformly distribute multi-genic UMIs to all genes\n- Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)\n- PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not.\n- EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm", + "example" : [ + "Unique" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIdedup", + "description" : "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once).\n- 1MM_Directional_UMItools ... follows the \\"directional\\" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017).\n- 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs\n- Exact ... only exactly matching UMIs are collapsed.\n- NoDedup ... no deduplication of UMIs, count all reads.\n- 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing.", + "example" : [ + "1MM_All" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloUMIfiltering", + "description" : "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0).\n- MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene.\n- MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene.\n- MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .\nOnly works with --soloUMIdedup 1MM_CR", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFileNames", + "description" : "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "example" : [ + "Solo.out/", + "features.tsv", + "barcodes.tsv", + "matrix.mtx" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellFilter", + "description" : "cell filtering type and parameters\n\n- None ... do not output filtered cells\n- TopCells ... only report top cells by UMI count, followed by the exact number of cells\n- CellRanger2.2 ... simple filtering of CellRanger 2.2.\nCan be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count\nThe harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10\n- EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y\nCan be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN\nThe harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000", + "example" : [ + "CellRanger2.2", + "3000", + "0.99", + "10" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloOutFormatFeaturesGeneField3", + "description" : "field 3 in the Gene features.tsv file. If \\"-\\", then no 3rd field is output.", + "example" : [ + "Gene Expression" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--soloCellReadStats", + "description" : "Output reads statistics for each CB\n\n- Standard ... standard output", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "../star_align/script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Align fastq files using STAR.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "docker", + "env" : [ + "STAR_VERSION 2.7.3a", + "PACKAGES gcc g++ make wget zlib1g-dev unzip" + ] + }, + { + "type" : "docker", + "run" : [ + "apt-get update && \\\\\n apt-get install -y --no-install-recommends ${PACKAGES} && \\\\\n cd /tmp && \\\\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \\\\\n unzip ${STAR_VERSION}.zip && \\\\\n cd STAR-${STAR_VERSION}/source && \\\\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\\\n cp STAR /usr/local/bin && \\\\\n cd / && \\\\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \\\\\n apt-get --purge autoremove -y ${PACKAGES} && \\\\\n apt-get clean\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/mapping/star_align_v273a/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/mapping/star_align_v273a", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import re +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'runRNGseed': $( if [ ! -z ${VIASH_PAR_RUNRNGSEED+x} ]; then echo "int(r'${VIASH_PAR_RUNRNGSEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'genomeLoad': $( if [ ! -z ${VIASH_PAR_GENOMELOAD+x} ]; then echo "r'${VIASH_PAR_GENOMELOAD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'genomeFastaFiles': $( if [ ! -z ${VIASH_PAR_GENOMEFASTAFILES+x} ]; then echo "r'${VIASH_PAR_GENOMEFASTAFILES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'genomeFileSizes': $( if [ ! -z ${VIASH_PAR_GENOMEFILESIZES+x} ]; then echo "list(map(int, r'${VIASH_PAR_GENOMEFILESIZES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'genomeTransformOutput': $( if [ ! -z ${VIASH_PAR_GENOMETRANSFORMOUTPUT+x} ]; then echo "r'${VIASH_PAR_GENOMETRANSFORMOUTPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'genomeChrSetMitochondrial': $( if [ ! -z ${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL+x} ]; then echo "r'${VIASH_PAR_GENOMECHRSETMITOCHONDRIAL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbFileChrStartEnd': $( if [ ! -z ${VIASH_PAR_SJDBFILECHRSTARTEND+x} ]; then echo "r'${VIASH_PAR_SJDBFILECHRSTARTEND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFfile': $( if [ ! -z ${VIASH_PAR_SJDBGTFFILE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFchrPrefix': $( if [ ! -z ${VIASH_PAR_SJDBGTFCHRPREFIX+x} ]; then echo "r'${VIASH_PAR_SJDBGTFCHRPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFfeatureExon': $( if [ ! -z ${VIASH_PAR_SJDBGTFFEATUREEXON+x} ]; then echo "r'${VIASH_PAR_SJDBGTFFEATUREEXON//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentTranscript': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTTRANSCRIPT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGene': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneName': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENENAME//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbGTFtagExonParentGeneType': $( if [ ! -z ${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE+x} ]; then echo "r'${VIASH_PAR_SJDBGTFTAGEXONPARENTGENETYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'sjdbOverhang': $( if [ ! -z ${VIASH_PAR_SJDBOVERHANG+x} ]; then echo "int(r'${VIASH_PAR_SJDBOVERHANG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbScore': $( if [ ! -z ${VIASH_PAR_SJDBSCORE+x} ]; then echo "int(r'${VIASH_PAR_SJDBSCORE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'sjdbInsertSave': $( if [ ! -z ${VIASH_PAR_SJDBINSERTSAVE+x} ]; then echo "r'${VIASH_PAR_SJDBINSERTSAVE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'varVCFfile': $( if [ ! -z ${VIASH_PAR_VARVCFFILE+x} ]; then echo "r'${VIASH_PAR_VARVCFFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesType': $( if [ ! -z ${VIASH_PAR_READFILESTYPE+x} ]; then echo "r'${VIASH_PAR_READFILESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesSAMattrKeep': $( if [ ! -z ${VIASH_PAR_READFILESSAMATTRKEEP+x} ]; then echo "r'${VIASH_PAR_READFILESSAMATTRKEEP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readFilesManifest': $( if [ ! -z ${VIASH_PAR_READFILESMANIFEST+x} ]; then echo "r'${VIASH_PAR_READFILESMANIFEST//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesPrefix': $( if [ ! -z ${VIASH_PAR_READFILESPREFIX+x} ]; then echo "r'${VIASH_PAR_READFILESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readFilesCommand': $( if [ ! -z ${VIASH_PAR_READFILESCOMMAND+x} ]; then echo "r'${VIASH_PAR_READFILESCOMMAND//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readMapNumber': $( if [ ! -z ${VIASH_PAR_READMAPNUMBER+x} ]; then echo "int(r'${VIASH_PAR_READMAPNUMBER//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'readMatesLengthsIn': $( if [ ! -z ${VIASH_PAR_READMATESLENGTHSIN+x} ]; then echo "r'${VIASH_PAR_READMATESLENGTHSIN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'readNameSeparator': $( if [ ! -z ${VIASH_PAR_READNAMESEPARATOR+x} ]; then echo "r'${VIASH_PAR_READNAMESEPARATOR//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'readQualityScoreBase': $( if [ ! -z ${VIASH_PAR_READQUALITYSCOREBASE+x} ]; then echo "int(r'${VIASH_PAR_READQUALITYSCOREBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'clipAdapterType': $( if [ ! -z ${VIASH_PAR_CLIPADAPTERTYPE+x} ]; then echo "r'${VIASH_PAR_CLIPADAPTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'clip3pNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAdapterSeq': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERSEQ+x} ]; then echo "r'${VIASH_PAR_CLIP3PADAPTERSEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'clip3pAdapterMMp': $( if [ ! -z ${VIASH_PAR_CLIP3PADAPTERMMP+x} ]; then echo "list(map(float, r'${VIASH_PAR_CLIP3PADAPTERMMP//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip3pAfterAdapterNbases': $( if [ ! -z ${VIASH_PAR_CLIP3PAFTERADAPTERNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP3PAFTERADAPTERNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'clip5pNbases': $( if [ ! -z ${VIASH_PAR_CLIP5PNBASES+x} ]; then echo "list(map(int, r'${VIASH_PAR_CLIP5PNBASES//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitGenomeGenerateRAM': $( if [ ! -z ${VIASH_PAR_LIMITGENOMEGENERATERAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITGENOMEGENERATERAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitIObufferSize': $( if [ ! -z ${VIASH_PAR_LIMITIOBUFFERSIZE+x} ]; then echo "list(map(int, r'${VIASH_PAR_LIMITIOBUFFERSIZE//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'limitOutSAMoneReadBytes': $( if [ ! -z ${VIASH_PAR_LIMITOUTSAMONEREADBYTES+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSAMONEREADBYTES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJoneRead': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJONEREAD+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJONEREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitOutSJcollapsed': $( if [ ! -z ${VIASH_PAR_LIMITOUTSJCOLLAPSED+x} ]; then echo "int(r'${VIASH_PAR_LIMITOUTSJCOLLAPSED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitBAMsortRAM': $( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "int(r'${VIASH_PAR_LIMITBAMSORTRAM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitSjdbInsertNsj': $( if [ ! -z ${VIASH_PAR_LIMITSJDBINSERTNSJ+x} ]; then echo "int(r'${VIASH_PAR_LIMITSJDBINSERTNSJ//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'limitNreadsSoft': $( if [ ! -z ${VIASH_PAR_LIMITNREADSSOFT+x} ]; then echo "int(r'${VIASH_PAR_LIMITNREADSSOFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outTmpKeep': $( if [ ! -z ${VIASH_PAR_OUTTMPKEEP+x} ]; then echo "r'${VIASH_PAR_OUTTMPKEEP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outStd': $( if [ ! -z ${VIASH_PAR_OUTSTD+x} ]; then echo "r'${VIASH_PAR_OUTSTD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outReadsUnmapped': $( if [ ! -z ${VIASH_PAR_OUTREADSUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTREADSUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outQSconversionAdd': $( if [ ! -z ${VIASH_PAR_OUTQSCONVERSIONADD+x} ]; then echo "int(r'${VIASH_PAR_OUTQSCONVERSIONADD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outMultimapperOrder': $( if [ ! -z ${VIASH_PAR_OUTMULTIMAPPERORDER+x} ]; then echo "r'${VIASH_PAR_OUTMULTIMAPPERORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMtype': $( if [ ! -z ${VIASH_PAR_OUTSAMTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSAMTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMmode': $( if [ ! -z ${VIASH_PAR_OUTSAMMODE+x} ]; then echo "r'${VIASH_PAR_OUTSAMMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMstrandField': $( if [ ! -z ${VIASH_PAR_OUTSAMSTRANDFIELD+x} ]; then echo "r'${VIASH_PAR_OUTSAMSTRANDFIELD//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMattributes': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIBUTES+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRIBUTES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMattrIHstart': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRIHSTART+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMATTRIHSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMunmapped': $( if [ ! -z ${VIASH_PAR_OUTSAMUNMAPPED+x} ]; then echo "r'${VIASH_PAR_OUTSAMUNMAPPED//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMorder': $( if [ ! -z ${VIASH_PAR_OUTSAMORDER+x} ]; then echo "r'${VIASH_PAR_OUTSAMORDER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMprimaryFlag': $( if [ ! -z ${VIASH_PAR_OUTSAMPRIMARYFLAG+x} ]; then echo "r'${VIASH_PAR_OUTSAMPRIMARYFLAG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMreadID': $( if [ ! -z ${VIASH_PAR_OUTSAMREADID+x} ]; then echo "r'${VIASH_PAR_OUTSAMREADID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMmapqUnique': $( if [ ! -z ${VIASH_PAR_OUTSAMMAPQUNIQUE+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMAPQUNIQUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagOR': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGOR+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMflagAND': $( if [ ! -z ${VIASH_PAR_OUTSAMFLAGAND+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMFLAGAND//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMattrRGline': $( if [ ! -z ${VIASH_PAR_OUTSAMATTRRGLINE+x} ]; then echo "r'${VIASH_PAR_OUTSAMATTRRGLINE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderHD': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERHD+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERHD//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderPG': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERPG+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERPG//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMheaderCommentFile': $( if [ ! -z ${VIASH_PAR_OUTSAMHEADERCOMMENTFILE+x} ]; then echo "r'${VIASH_PAR_OUTSAMHEADERCOMMENTFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSAMfilter': $( if [ ! -z ${VIASH_PAR_OUTSAMFILTER+x} ]; then echo "r'${VIASH_PAR_OUTSAMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outSAMmultNmax': $( if [ ! -z ${VIASH_PAR_OUTSAMMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outSAMtlen': $( if [ ! -z ${VIASH_PAR_OUTSAMTLEN+x} ]; then echo "int(r'${VIASH_PAR_OUTSAMTLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMcompression': $( if [ ! -z ${VIASH_PAR_OUTBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingThreadN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGTHREADN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGTHREADN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outBAMsortingBinsN': $( if [ ! -z ${VIASH_PAR_OUTBAMSORTINGBINSN+x} ]; then echo "int(r'${VIASH_PAR_OUTBAMSORTINGBINSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'bamRemoveDuplicatesType': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESTYPE+x} ]; then echo "r'${VIASH_PAR_BAMREMOVEDUPLICATESTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'bamRemoveDuplicatesMate2basesN': $( if [ ! -z ${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN+x} ]; then echo "int(r'${VIASH_PAR_BAMREMOVEDUPLICATESMATE2BASESN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outWigType': $( if [ ! -z ${VIASH_PAR_OUTWIGTYPE+x} ]; then echo "r'${VIASH_PAR_OUTWIGTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'outWigStrand': $( if [ ! -z ${VIASH_PAR_OUTWIGSTRAND+x} ]; then echo "r'${VIASH_PAR_OUTWIGSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigReferencesPrefix': $( if [ ! -z ${VIASH_PAR_OUTWIGREFERENCESPREFIX+x} ]; then echo "r'${VIASH_PAR_OUTWIGREFERENCESPREFIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outWigNorm': $( if [ ! -z ${VIASH_PAR_OUTWIGNORM+x} ]; then echo "r'${VIASH_PAR_OUTWIGNORM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterType': $( if [ ! -z ${VIASH_PAR_OUTFILTERTYPE+x} ]; then echo "r'${VIASH_PAR_OUTFILTERTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMultimapNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNMAX+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMismatchNoverReadLmax': $( if [ ! -z ${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMISMATCHNOVERREADLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMin': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterScoreMinOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERSCOREMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNmin': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMIN+x} ]; then echo "int(r'${VIASH_PAR_OUTFILTERMATCHNMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterMatchNminOverLread': $( if [ ! -z ${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_OUTFILTERMATCHNMINOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'outFilterIntronMotifs': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONMOTIFS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONMOTIFS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outFilterIntronStrands': $( if [ ! -z ${VIASH_PAR_OUTFILTERINTRONSTRANDS+x} ]; then echo "r'${VIASH_PAR_OUTFILTERINTRONSTRANDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJtype': $( if [ ! -z ${VIASH_PAR_OUTSJTYPE+x} ]; then echo "r'${VIASH_PAR_OUTSJTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterReads': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERREADS+x} ]; then echo "r'${VIASH_PAR_OUTSJFILTERREADS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'outSJfilterOverhangMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTEROVERHANGMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTEROVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountUniqueMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTUNIQUEMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterCountTotalMin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERCOUNTTOTALMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterDistToOtherSJmin': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERDISTTOOTHERSJMIN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'outSJfilterIntronMaxVsReadN': $( if [ ! -z ${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN+x} ]; then echo "list(map(int, r'${VIASH_PAR_OUTSJFILTERINTRONMAXVSREADN//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'scoreGap': $( if [ ! -z ${VIASH_PAR_SCOREGAP+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapNoncan': $( if [ ! -z ${VIASH_PAR_SCOREGAPNONCAN+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPNONCAN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapGCAG': $( if [ ! -z ${VIASH_PAR_SCOREGAPGCAG+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPGCAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGapATAC': $( if [ ! -z ${VIASH_PAR_SCOREGAPATAC+x} ]; then echo "int(r'${VIASH_PAR_SCOREGAPATAC//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreGenomicLengthLog2scale': $( if [ ! -z ${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE+x} ]; then echo "int(r'${VIASH_PAR_SCOREGENOMICLENGTHLOG2SCALE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelOpen': $( if [ ! -z ${VIASH_PAR_SCOREDELOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreDelBase': $( if [ ! -z ${VIASH_PAR_SCOREDELBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREDELBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsOpen': $( if [ ! -z ${VIASH_PAR_SCOREINSOPEN+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSOPEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreInsBase': $( if [ ! -z ${VIASH_PAR_SCOREINSBASE+x} ]; then echo "int(r'${VIASH_PAR_SCOREINSBASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'scoreStitchSJshift': $( if [ ! -z ${VIASH_PAR_SCORESTITCHSJSHIFT+x} ]; then echo "int(r'${VIASH_PAR_SCORESTITCHSJSHIFT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHSTARTLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchStartLmaxOverLread': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD+x} ]; then echo "float(r'${VIASH_PAR_SEEDSEARCHSTARTLMAXOVERLREAD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSearchLmax': $( if [ ! -z ${VIASH_PAR_SEEDSEARCHLMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDSEARCHLMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMultimapNmax': $( if [ ! -z ${VIASH_PAR_SEEDMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerReadNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedPerWindowNmax': $( if [ ! -z ${VIASH_PAR_SEEDPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_SEEDPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedNoneLociPerWindow': $( if [ ! -z ${VIASH_PAR_SEEDNONELOCIPERWINDOW+x} ]; then echo "int(r'${VIASH_PAR_SEEDNONELOCIPERWINDOW//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedSplitMin': $( if [ ! -z ${VIASH_PAR_SEEDSPLITMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDSPLITMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seedMapMin': $( if [ ! -z ${VIASH_PAR_SEEDMAPMIN+x} ]; then echo "int(r'${VIASH_PAR_SEEDMAPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMin': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignIntronMax': $( if [ ! -z ${VIASH_PAR_ALIGNINTRONMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNINTRONMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignMatesGapMax': $( if [ ! -z ${VIASH_PAR_ALIGNMATESGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNMATESGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSJstitchMismatchNmax': $( if [ ! -z ${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX+x} ]; then echo "list(map(int, r'${VIASH_PAR_ALIGNSJSTITCHMISMATCHNMAX//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'alignSJDBoverhangMin': $( if [ ! -z ${VIASH_PAR_ALIGNSJDBOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSJDBOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLmin': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN+x} ]; then echo "int(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignSplicedMateMapLminOverLmate': $( if [ ! -z ${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE+x} ]; then echo "float(r'${VIASH_PAR_ALIGNSPLICEDMATEMAPLMINOVERLMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignWindowsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNWINDOWSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNWINDOWSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerWindowNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERWINDOWNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignTranscriptsPerReadNmax': $( if [ ! -z ${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX+x} ]; then echo "int(r'${VIASH_PAR_ALIGNTRANSCRIPTSPERREADNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'alignEndsType': $( if [ ! -z ${VIASH_PAR_ALIGNENDSTYPE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignEndsProtrude': $( if [ ! -z ${VIASH_PAR_ALIGNENDSPROTRUDE+x} ]; then echo "r'${VIASH_PAR_ALIGNENDSPROTRUDE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignSoftClipAtReferenceEnds': $( if [ ! -z ${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS+x} ]; then echo "r'${VIASH_PAR_ALIGNSOFTCLIPATREFERENCEENDS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'alignInsertionFlush': $( if [ ! -z ${VIASH_PAR_ALIGNINSERTIONFLUSH+x} ]; then echo "r'${VIASH_PAR_ALIGNINSERTIONFLUSH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'peOverlapNbasesMin': $( if [ ! -z ${VIASH_PAR_PEOVERLAPNBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_PEOVERLAPNBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'peOverlapMMp': $( if [ ! -z ${VIASH_PAR_PEOVERLAPMMP+x} ]; then echo "float(r'${VIASH_PAR_PEOVERLAPMMP//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorMultimapNmax': $( if [ ! -z ${VIASH_PAR_WINANCHORMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winBinNbits': $( if [ ! -z ${VIASH_PAR_WINBINNBITS+x} ]; then echo "int(r'${VIASH_PAR_WINBINNBITS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winAnchorDistNbins': $( if [ ! -z ${VIASH_PAR_WINANCHORDISTNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINANCHORDISTNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winFlankNbins': $( if [ ! -z ${VIASH_PAR_WINFLANKNBINS+x} ]; then echo "int(r'${VIASH_PAR_WINFLANKNBINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageRelativeMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGERELATIVEMIN+x} ]; then echo "float(r'${VIASH_PAR_WINREADCOVERAGERELATIVEMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'winReadCoverageBasesMin': $( if [ ! -z ${VIASH_PAR_WINREADCOVERAGEBASESMIN+x} ]; then echo "int(r'${VIASH_PAR_WINREADCOVERAGEBASESMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutType': $( if [ ! -z ${VIASH_PAR_CHIMOUTTYPE+x} ]; then echo "r'${VIASH_PAR_CHIMOUTTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimSegmentMin': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreMin': $( if [ ! -z ${VIASH_PAR_CHIMSCOREMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreDropMax': $( if [ ! -z ${VIASH_PAR_CHIMSCOREDROPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREDROPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreSeparation': $( if [ ! -z ${VIASH_PAR_CHIMSCORESEPARATION+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCORESEPARATION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimScoreJunctionNonGTAG': $( if [ ! -z ${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG+x} ]; then echo "int(r'${VIASH_PAR_CHIMSCOREJUNCTIONNONGTAG//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimJunctionOverhangMin': $( if [ ! -z ${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMJUNCTIONOVERHANGMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimSegmentReadGapMax': $( if [ ! -z ${VIASH_PAR_CHIMSEGMENTREADGAPMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMSEGMENTREADGAPMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimFilter': $( if [ ! -z ${VIASH_PAR_CHIMFILTER+x} ]; then echo "r'${VIASH_PAR_CHIMFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'chimMainSegmentMultNmax': $( if [ ! -z ${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMAINSEGMENTMULTNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapNmax': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPNMAX+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimMultimapScoreRange': $( if [ ! -z ${VIASH_PAR_CHIMMULTIMAPSCORERANGE+x} ]; then echo "int(r'${VIASH_PAR_CHIMMULTIMAPSCORERANGE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimNonchimScoreDropMin': $( if [ ! -z ${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN+x} ]; then echo "int(r'${VIASH_PAR_CHIMNONCHIMSCOREDROPMIN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'chimOutJunctionFormat': $( if [ ! -z ${VIASH_PAR_CHIMOUTJUNCTIONFORMAT+x} ]; then echo "int(r'${VIASH_PAR_CHIMOUTJUNCTIONFORMAT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantMode': $( if [ ! -z ${VIASH_PAR_QUANTMODE+x} ]; then echo "r'${VIASH_PAR_QUANTMODE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'quantTranscriptomeBAMcompression': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION+x} ]; then echo "int(r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAMCOMPRESSION//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'quantTranscriptomeBan': $( if [ ! -z ${VIASH_PAR_QUANTTRANSCRIPTOMEBAN+x} ]; then echo "r'${VIASH_PAR_QUANTTRANSCRIPTOMEBAN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopassMode': $( if [ ! -z ${VIASH_PAR_TWOPASSMODE+x} ]; then echo "r'${VIASH_PAR_TWOPASSMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'twopass1readsN': $( if [ ! -z ${VIASH_PAR_TWOPASS1READSN+x} ]; then echo "int(r'${VIASH_PAR_TWOPASS1READSN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'waspOutputMode': $( if [ ! -z ${VIASH_PAR_WASPOUTPUTMODE+x} ]; then echo "r'${VIASH_PAR_WASPOUTPUTMODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloType': $( if [ ! -z ${VIASH_PAR_SOLOTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOTYPE//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBwhitelist': $( if [ ! -z ${VIASH_PAR_SOLOCBWHITELIST+x} ]; then echo "r'${VIASH_PAR_SOLOCBWHITELIST//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCBstart': $( if [ ! -z ${VIASH_PAR_SOLOCBSTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBSTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBlen': $( if [ ! -z ${VIASH_PAR_SOLOCBLEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOCBLEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIstart': $( if [ ! -z ${VIASH_PAR_SOLOUMISTART+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMISTART//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloUMIlen': $( if [ ! -z ${VIASH_PAR_SOLOUMILEN+x} ]; then echo "int(r'${VIASH_PAR_SOLOUMILEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeReadLength': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEREADLENGTH+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEREADLENGTH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloBarcodeMate': $( if [ ! -z ${VIASH_PAR_SOLOBARCODEMATE+x} ]; then echo "int(r'${VIASH_PAR_SOLOBARCODEMATE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBposition': $( if [ ! -z ${VIASH_PAR_SOLOCBPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOCBPOSITION//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIposition': $( if [ ! -z ${VIASH_PAR_SOLOUMIPOSITION+x} ]; then echo "r'${VIASH_PAR_SOLOUMIPOSITION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterSequence': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERSEQUENCE+x} ]; then echo "r'${VIASH_PAR_SOLOADAPTERSEQUENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloAdapterMismatchesNmax': $( if [ ! -z ${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX+x} ]; then echo "int(r'${VIASH_PAR_SOLOADAPTERMISMATCHESNMAX//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'soloCBmatchWLtype': $( if [ ! -z ${VIASH_PAR_SOLOCBMATCHWLTYPE+x} ]; then echo "r'${VIASH_PAR_SOLOCBMATCHWLTYPE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloInputSAMattrBarcodeSeq': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODESEQ//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloInputSAMattrBarcodeQual': $( if [ ! -z ${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL+x} ]; then echo "r'${VIASH_PAR_SOLOINPUTSAMATTRBARCODEQUAL//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloStrand': $( if [ ! -z ${VIASH_PAR_SOLOSTRAND+x} ]; then echo "r'${VIASH_PAR_SOLOSTRAND//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'soloFeatures': $( if [ ! -z ${VIASH_PAR_SOLOFEATURES+x} ]; then echo "r'${VIASH_PAR_SOLOFEATURES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloMultiMappers': $( if [ ! -z ${VIASH_PAR_SOLOMULTIMAPPERS+x} ]; then echo "r'${VIASH_PAR_SOLOMULTIMAPPERS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIdedup': $( if [ ! -z ${VIASH_PAR_SOLOUMIDEDUP+x} ]; then echo "r'${VIASH_PAR_SOLOUMIDEDUP//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloUMIfiltering': $( if [ ! -z ${VIASH_PAR_SOLOUMIFILTERING+x} ]; then echo "r'${VIASH_PAR_SOLOUMIFILTERING//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFileNames': $( if [ ! -z ${VIASH_PAR_SOLOOUTFILENAMES+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFILENAMES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellFilter': $( if [ ! -z ${VIASH_PAR_SOLOCELLFILTER+x} ]; then echo "r'${VIASH_PAR_SOLOCELLFILTER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloOutFormatFeaturesGeneField3': $( if [ ! -z ${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3+x} ]; then echo "r'${VIASH_PAR_SOLOOUTFORMATFEATURESGENEFIELD3//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'soloCellReadStats': $( if [ ! -z ${VIASH_PAR_SOLOCELLREADSTATS+x} ]; then echo "r'${VIASH_PAR_SOLOCELLREADSTATS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + +# regex for matching R[12] fastq(gz) files +# examples: +# - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz +# - tinygex_S1_L001_I1_001.fastq.gz +fastqgz_regex = r"(.+)_(R\\\\d+)(_\\\\d+)?\\\\.fastq(\\\\.gz)?" + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +# look for fastq files in a directory +def search_fastqs(path: Path) -> list[Path]: + if path.is_dir(): + print( + f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", + flush=True, + ) + value_paths = [ + file for file in path.iterdir() if re.match(fastqgz_regex, file.name) + ] + return value_paths + else: + return [path] + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \\`processPar()\\` generator needs to be adapted +to_rename = { + "input": "readFilesIn", + "reference": "genomeDir", + "output": "outFileNamePrefix", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \\`to_rename\\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory( + prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True +) as temp_dir: + print(">> Check whether input files are directories", flush=True) + new_read_files_in = [] + for path in par["readFilesIn"]: + new_read_files_in.extend(search_fastqs(path)) + par["readFilesIn"] = new_read_files_in + print("", flush=True) + + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeDir", "readFilesIn"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print("Grouping R1/R2 input files into pairs", flush=True) + input_grouped = {} + for path in par["readFilesIn"]: + key = re.search(fastqgz_regex, path.name).group(2) + if key not in input_grouped: + input_grouped[key] = [] + input_grouped[key].append(str(path)) + par["readFilesIn"] = [",".join(val) for val in input_grouped.values()] + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "alignReads" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + # make sure there is a trailing / + par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/" + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/mapping/star_align_v273a", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/mapping/star_align_v273a/nextflow.config b/target/nextflow/mapping/star_align_v273a/nextflow.config new file mode 100644 index 00000000..c8589fbb --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'mapping/star_align_v273a' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Align fastq files using STAR.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/mapping/star_align_v273a/nextflow_labels.config b/target/nextflow/mapping/star_align_v273a/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/mapping/star_align_v273a/nextflow_schema.json b/target/nextflow/mapping/star_align_v273a/nextflow_schema.json new file mode 100644 index 00000000..f7aa3394 --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/nextflow_schema.json @@ -0,0 +1,1322 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "star_align_v273a", + "description": "Align fastq files using STAR.", + "type": "object", + "$defs": { + "input/output": { + "title": "Input/Output", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The FASTQ files to be analyzed", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the reference built by star_build_reference", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"/path/to/reference\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/foo\"`. ", + "default": "$id.$key.output" + } + } + }, + "run parameters": { + "title": "Run Parameters", + "type": "object", + "description": "No description", + "properties": { + "runRNGseed": { + "type": "integer", + "description": "random number generator seed.", + "help_text": "Type: `integer`, multiple: `False`, example: `777`. " + } + } + }, + "genome parameters": { + "title": "Genome Parameters", + "type": "object", + "description": "No description", + "properties": { + "genomeLoad": { + "type": "string", + "description": "mode of shared memory usage for the genome files", + "help_text": "Type: `string`, multiple: `False`, example: `\"NoSharedMemory\"`. " + }, + "genomeFastaFiles": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "path(s) to the fasta files with the genome sequences, separated by spaces", + "help_text": "Type: `file`, multiple: `True`, direction: `input`. " + }, + "genomeFileSizes": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "genome files exact sizes in bytes", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "genomeTransformOutput": { + "type": "array", + "items": { + "type": "string" + }, + "description": "which output to transform back to original genome\n\n- SAM ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "genomeChrSetMitochondrial": { + "type": "array", + "items": { + "type": "string" + }, + "description": "names of the mitochondrial chromosomes", + "help_text": "Type: `string`, multiple: `True`, example: `[\"chrM\";\"M\";\"MT\"]`. " + } + } + }, + "splice junctions database": { + "title": "Splice Junctions Database", + "type": "object", + "description": "No description", + "properties": { + "sjdbFileChrStartEnd": { + "type": "array", + "items": { + "type": "string" + }, + "description": "path to the files with genomic coordinates (chr start end strand) for the splice junction introns", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sjdbGTFfile": { + "type": "string", + "format": "path", + "description": "path to the GTF file with annotations", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "sjdbGTFchrPrefix": { + "type": "string", + "description": "prefix for chromosome names in a GTF file (e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sjdbGTFfeatureExon": { + "type": "string", + "description": "feature type in GTF file to be used as exons for building transcripts", + "help_text": "Type: `string`, multiple: `False`, example: `\"exon\"`. " + }, + "sjdbGTFtagExonParentTranscript": { + "type": "string", + "description": "GTF attribute name for parent transcript ID (default \"transcript_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"transcript_id\"`. " + }, + "sjdbGTFtagExonParentGene": { + "type": "string", + "description": "GTF attribute name for parent gene ID (default \"gene_id\" works for GTF files)", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_id\"`. " + }, + "sjdbGTFtagExonParentGeneName": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene name", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_name\"]`. " + }, + "sjdbGTFtagExonParentGeneType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "GTF attribute name for parent gene type", + "help_text": "Type: `string`, multiple: `True`, example: `[\"gene_type\";\"gene_biotype\"]`. " + }, + "sjdbOverhang": { + "type": "integer", + "description": "length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "sjdbScore": { + "type": "integer", + "description": "extra alignment score for alignments that cross database junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `2`. " + }, + "sjdbInsertSave": { + "type": "string", + "description": "which files to save when sjdb junctions are inserted on the fly at the mapping step\n\n- Basic ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`. " + } + } + }, + "variation parameters": { + "title": "Variation parameters", + "type": "object", + "description": "No description", + "properties": { + "varVCFfile": { + "type": "string", + "description": "path to the VCF file that contains variation data", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "read parameters": { + "title": "Read Parameters", + "type": "object", + "description": "No description", + "properties": { + "readFilesType": { + "type": "string", + "description": "format of input read files\n\n- Fastx ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Fastx\"`. " + }, + "readFilesSAMattrKeep": { + "type": "array", + "items": { + "type": "string" + }, + "description": "for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL\n\n- All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"All\"]`. " + }, + "readFilesManifest": { + "type": "string", + "format": "path", + "description": "path to the \"manifest\" file with the names of read files", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "readFilesPrefix": { + "type": "string", + "description": "prefix for the read files names, i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "readFilesCommand": { + "type": "array", + "items": { + "type": "string" + }, + "description": "command line to execute for each of the input file", + "help_text": "Type: `string`, multiple: `True`. " + }, + "readMapNumber": { + "type": "integer", + "description": "number of reads to map from the beginning of the file\n\n-1: map all reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "readMatesLengthsIn": { + "type": "string", + "description": "Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same", + "help_text": "Type: `string`, multiple: `False`, example: `\"NotEqual\"`. " + }, + "readNameSeparator": { + "type": "array", + "items": { + "type": "string" + }, + "description": "character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)", + "help_text": "Type: `string`, multiple: `True`, example: `[\"/\"]`. " + }, + "readQualityScoreBase": { + "type": "integer", + "description": "number to be subtracted from the ASCII code to get Phred quality score", + "help_text": "Type: `integer`, multiple: `False`, example: `33`. " + } + } + }, + "read clipping": { + "title": "Read Clipping", + "type": "object", + "description": "No description", + "properties": { + "clipAdapterType": { + "type": "string", + "description": "adapter clipping type\n\n- Hamming ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Hamming\"`. " + }, + "clip3pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 3p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip3pAdapterSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "adapter sequences to clip from 3p of each mate", + "help_text": "Type: `string`, multiple: `True`. " + }, + "clip3pAdapterMMp": { + "type": "array", + "items": { + "type": "number" + }, + "description": "max proportion of mismatches for 3p adapter clipping for each mate", + "help_text": "Type: `double`, multiple: `True`, example: `[0.1]`. " + }, + "clip3pAfterAdapterNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number of bases to clip from 3p of each mate after the adapter clipping", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + }, + "clip5pNbases": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "number(s) of bases to clip from 5p of each mate", + "help_text": "Type: `integer`, multiple: `True`, example: `[0]`. " + } + } + }, + "limits": { + "title": "Limits", + "type": "object", + "description": "No description", + "properties": { + "limitGenomeGenerateRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for genome generation", + "help_text": "Type: `long`, multiple: `False`, example: `31000000000`. " + }, + "limitIObufferSize": { + "type": "array", + "items": { + "type": "string" + }, + "description": "max available buffers size (bytes) for input/output, per thread", + "help_text": "Type: `long`, multiple: `True`, example: `[30000000;50000000]`. " + }, + "limitOutSAMoneReadBytes": { + "type": "string", + "description": "max size of the SAM record (bytes) for one read", + "help_text": "Type: `long`, multiple: `False`, example: `100000`. " + }, + "limitOutSJoneRead": { + "type": "integer", + "description": "max number of junctions for one read (including all multi-mappers)", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "limitOutSJcollapsed": { + "type": "integer", + "description": "max number of collapsed junctions", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitBAMsortRAM": { + "type": "string", + "description": "maximum available RAM (bytes) for sorting BAM", + "help_text": "Type: `long`, multiple: `False`, example: `0`. " + }, + "limitSjdbInsertNsj": { + "type": "integer", + "description": "maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "help_text": "Type: `integer`, multiple: `False`, example: `1000000`. " + }, + "limitNreadsSoft": { + "type": "integer", + "description": "soft limit on the number of reads", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "output: general": { + "title": "Output: general", + "type": "object", + "description": "No description", + "properties": { + "outTmpKeep": { + "type": "string", + "description": "whether to keep the temporary files after STAR runs is finished\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outStd": { + "type": "string", + "description": "which output will be directed to stdout (standard out)\n\n- Log ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Log\"`. " + }, + "outReadsUnmapped": { + "type": "string", + "description": "output of unmapped and partially mapped (i.e", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outQSconversionAdd": { + "type": "integer", + "description": "add this number to the quality score (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outMultimapperOrder": { + "type": "string", + "description": "order of multimapping alignments in the output files\n\n- Old_2.4 ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Old_2.4\"`. " + } + } + }, + "output: sam and bam": { + "title": "Output: SAM and BAM", + "type": "object", + "description": "No description", + "properties": { + "outSAMtype": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of SAM/BAM output\n\n1st word:\n- BAM ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"SAM\"]`. " + }, + "outSAMmode": { + "type": "string", + "description": "mode of SAM output\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Full\"`. " + }, + "outSAMstrandField": { + "type": "string", + "description": "Cufflinks-like strand field flag\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMattributes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "a string of desired SAM attributes, in the order desired for the output SAM", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Standard\"]`. " + }, + "outSAMattrIHstart": { + "type": "integer", + "description": "start value for the IH attribute", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outSAMunmapped": { + "type": "array", + "items": { + "type": "string" + }, + "description": "output of unmapped reads in the SAM format\n\n1st word:\n- None ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMorder": { + "type": "string", + "description": "type of sorting for the SAM output\n\nPaired: one mate after the other for all paired alignments\nPairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files", + "help_text": "Type: `string`, multiple: `False`, example: `\"Paired\"`. " + }, + "outSAMprimaryFlag": { + "type": "string", + "description": "which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG\n\n- OneBestScore ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"OneBestScore\"`. " + }, + "outSAMreadID": { + "type": "string", + "description": "read ID record type\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + }, + "outSAMmapqUnique": { + "type": "integer", + "description": "0 to 255: the MAPQ value for unique mappers", + "help_text": "Type: `integer`, multiple: `False`, example: `255`. " + }, + "outSAMflagOR": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outSAMflagAND": { + "type": "integer", + "description": "0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `65535`. " + }, + "outSAMattrRGline": { + "type": "array", + "items": { + "type": "string" + }, + "description": "SAM/BAM read group line", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderHD": { + "type": "array", + "items": { + "type": "string" + }, + "description": "@HD (header) line of the SAM header", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderPG": { + "type": "array", + "items": { + "type": "string" + }, + "description": "extra @PG (software) line of the SAM header (in addition to STAR)", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMheaderCommentFile": { + "type": "string", + "description": "path to the file with @CO (comment) lines of the SAM header", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outSAMfilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "filter the output into main SAM/BAM files\n\n- KeepOnlyAddedReferences ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outSAMmultNmax": { + "type": "integer", + "description": "max number of multiple alignments for a read that will be output to the SAM/BAM files", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "outSAMtlen": { + "type": "integer", + "description": "calculation method for the TLEN field in the SAM/BAM files\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMcompression": { + "type": "integer", + "description": "-1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outBAMsortingThreadN": { + "type": "integer", + "description": ">=0: number of threads for BAM sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outBAMsortingBinsN": { + "type": "integer", + "description": ">0: number of genome bins for coordinate-sorting", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + } + } + }, + "bam processing": { + "title": "BAM processing", + "type": "object", + "description": "No description", + "properties": { + "bamRemoveDuplicatesType": { + "type": "string", + "description": "mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only\n\n- - ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "bamRemoveDuplicatesMate2basesN": { + "type": "integer", + "description": "number of bases from the 5' of mate 2 to use in collapsing (e.g", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "output wiggle": { + "title": "Output Wiggle", + "type": "object", + "description": "No description", + "properties": { + "outWigType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of signal output, e.g", + "help_text": "Type: `string`, multiple: `True`. " + }, + "outWigStrand": { + "type": "string", + "description": "strandedness of wiggle/bedGraph output\n\n- Stranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Stranded\"`. " + }, + "outWigReferencesPrefix": { + "type": "string", + "description": "prefix matching reference names to include in the output wiggle file, e.g", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outWigNorm": { + "type": "string", + "description": "type of normalization for the signal\n\n- RPM ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RPM\"`. " + } + } + }, + "output filtering": { + "title": "Output Filtering", + "type": "object", + "description": "No description", + "properties": { + "outFilterType": { + "type": "string", + "description": "type of filtering\n\n- Normal ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Normal\"`. " + }, + "outFilterMultimapScoreRange": { + "type": "integer", + "description": "the score range below the maximum score for multimapping alignments", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "outFilterMultimapNmax": { + "type": "integer", + "description": "maximum number of loci the read is allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNmax": { + "type": "integer", + "description": "alignment will be output only if it has no more mismatches than this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "outFilterMismatchNoverLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `0.3`. " + }, + "outFilterMismatchNoverReadLmax": { + "type": "number", + "description": "alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "outFilterScoreMin": { + "type": "integer", + "description": "alignment will be output only if its score is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterScoreMinOverLread": { + "type": "number", + "description": "same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterMatchNmin": { + "type": "integer", + "description": "alignment will be output only if the number of matched bases is higher than or equal to this value.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "outFilterMatchNminOverLread": { + "type": "number", + "description": "sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "outFilterIntronMotifs": { + "type": "string", + "description": "filter alignment using their motifs\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "outFilterIntronStrands": { + "type": "string", + "description": "filter alignments\n\n- RemoveInconsistentStrands ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"RemoveInconsistentStrands\"`. " + } + } + }, + "output splice junctions (sj.out.tab)": { + "title": "Output splice junctions (SJ.out.tab)", + "type": "object", + "description": "No description", + "properties": { + "outSJtype": { + "type": "string", + "description": "type of splice junction output\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Standard\"`. " + } + } + }, + "output filtering: splice junctions": { + "title": "Output Filtering: Splice Junctions", + "type": "object", + "description": "No description", + "properties": { + "outSJfilterReads": { + "type": "string", + "description": "which reads to consider for collapsed splice junctions output\n\n- All ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"All\"`. " + }, + "outSJfilterOverhangMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[30;12;12;12]`. " + }, + "outSJfilterCountUniqueMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterCountTotalMin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif", + "help_text": "Type: `integer`, multiple: `True`, example: `[3;1;1;1]`. " + }, + "outSJfilterDistToOtherSJmin": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "minimum allowed distance to other junctions' donor/acceptor\n\ndoes not apply to annotated junctions", + "help_text": "Type: `integer`, multiple: `True`, example: `[10;0;5;10]`. " + }, + "outSJfilterIntronMaxVsReadN": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum gap allowed for junctions supported by 1,2,3,,,N reads\n\ni.e", + "help_text": "Type: `integer`, multiple: `True`, example: `[50000;100000;200000]`. " + } + } + }, + "scoring": { + "title": "Scoring", + "type": "object", + "description": "No description", + "properties": { + "scoreGap": { + "type": "integer", + "description": "splice junction penalty (independent on intron motif)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreGapNoncan": { + "type": "integer", + "description": "non-canonical junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGapGCAG": { + "type": "integer", + "description": "GC/AG and CT/GC junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-4`. " + }, + "scoreGapATAC": { + "type": "integer", + "description": "AT/AC and GT/AT junction penalty (in addition to scoreGap)", + "help_text": "Type: `integer`, multiple: `False`, example: `-8`. " + }, + "scoreGenomicLengthLog2scale": { + "type": "integer", + "description": "extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "scoreDelOpen": { + "type": "integer", + "description": "deletion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreDelBase": { + "type": "integer", + "description": "deletion extension penalty per base (in addition to scoreDelOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsOpen": { + "type": "integer", + "description": "insertion open penalty", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreInsBase": { + "type": "integer", + "description": "insertion extension penalty per base (in addition to scoreInsOpen)", + "help_text": "Type: `integer`, multiple: `False`, example: `-2`. " + }, + "scoreStitchSJshift": { + "type": "integer", + "description": "maximum score reduction while searching for SJ boundaries in the stitching step", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + } + } + }, + "alignments and seeding": { + "title": "Alignments and Seeding", + "type": "object", + "description": "No description", + "properties": { + "seedSearchStartLmax": { + "type": "integer", + "description": "defines the search start point through the read - the read is split into pieces no longer than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedSearchStartLmaxOverLread": { + "type": "number", + "description": "seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)", + "help_text": "Type: `double`, multiple: `False`, example: `1.0`. " + }, + "seedSearchLmax": { + "type": "integer", + "description": "defines the maximum length of the seeds, if =0 seed length is not limited", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "seedMultimapNmax": { + "type": "integer", + "description": "only pieces that map fewer than this value are utilized in the stitching procedure", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "seedPerReadNmax": { + "type": "integer", + "description": "max number of seeds per read", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + }, + "seedPerWindowNmax": { + "type": "integer", + "description": "max number of seeds per window", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "seedNoneLociPerWindow": { + "type": "integer", + "description": "max number of one seed loci per window", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "seedSplitMin": { + "type": "integer", + "description": "min length of the seed sequences split by Ns or mate gap", + "help_text": "Type: `integer`, multiple: `False`, example: `12`. " + }, + "seedMapMin": { + "type": "integer", + "description": "min length of seeds to be mapped", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignIntronMin": { + "type": "integer", + "description": "minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion", + "help_text": "Type: `integer`, multiple: `False`, example: `21`. " + }, + "alignIntronMax": { + "type": "integer", + "description": "maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignMatesGapMax": { + "type": "integer", + "description": "maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSJoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `5`. " + }, + "alignSJstitchMismatchNmax": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "maximum number of mismatches for stitching of the splice junctions (-1: no limit).\n\n(1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.", + "help_text": "Type: `integer`, multiple: `True`, example: `[0;-1;0;0]`. " + }, + "alignSJDBoverhangMin": { + "type": "integer", + "description": "minimum overhang (i.e", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "alignSplicedMateMapLmin": { + "type": "integer", + "description": "minimum mapped length for a read mate that is spliced", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "alignSplicedMateMapLminOverLmate": { + "type": "number", + "description": "alignSplicedMateMapLmin normalized to mate length", + "help_text": "Type: `double`, multiple: `False`, example: `0.66`. " + }, + "alignWindowsPerReadNmax": { + "type": "integer", + "description": "max number of windows per read", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignTranscriptsPerWindowNmax": { + "type": "integer", + "description": "max number of transcripts per window", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "alignTranscriptsPerReadNmax": { + "type": "integer", + "description": "max number of different alignments per read to consider", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "alignEndsType": { + "type": "string", + "description": "type of read ends alignment\n\n- Local ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Local\"`. " + }, + "alignEndsProtrude": { + "type": "string", + "description": "allow protrusion of alignment ends, i.e", + "help_text": "Type: `string`, multiple: `False`, example: `\"0 ConcordantPair\"`. " + }, + "alignSoftClipAtReferenceEnds": { + "type": "string", + "description": "allow the soft-clipping of the alignments past the end of the chromosomes\n\n- Yes ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Yes\"`. " + }, + "alignInsertionFlush": { + "type": "string", + "description": "how to flush ambiguous insertion positions\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "paired-end reads": { + "title": "Paired-End reads", + "type": "object", + "description": "No description", + "properties": { + "peOverlapNbasesMin": { + "type": "integer", + "description": "minimum number of overlapping bases to trigger mates merging and realignment", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "peOverlapMMp": { + "type": "number", + "description": "maximum proportion of mismatched bases in the overlap area", + "help_text": "Type: `double`, multiple: `False`, example: `0.01`. " + } + } + }, + "windows, anchors, binning": { + "title": "Windows, Anchors, Binning", + "type": "object", + "description": "No description", + "properties": { + "winAnchorMultimapNmax": { + "type": "integer", + "description": "max number of loci anchors are allowed to map to", + "help_text": "Type: `integer`, multiple: `False`, example: `50`. " + }, + "winBinNbits": { + "type": "integer", + "description": "=log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "winAnchorDistNbins": { + "type": "integer", + "description": "max number of bins between two anchors that allows aggregation of anchors into one window", + "help_text": "Type: `integer`, multiple: `False`, example: `9`. " + }, + "winFlankNbins": { + "type": "integer", + "description": "log2(winFlank), where win Flank is the size of the left and right flanking regions for each window", + "help_text": "Type: `integer`, multiple: `False`, example: `4`. " + }, + "winReadCoverageRelativeMin": { + "type": "number", + "description": "minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.", + "help_text": "Type: `double`, multiple: `False`, example: `0.5`. " + }, + "winReadCoverageBasesMin": { + "type": "integer", + "description": "minimum number of bases covered by the seeds in a window , for STARlong algorithm only.", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "chimeric alignments": { + "title": "Chimeric Alignments", + "type": "object", + "description": "No description", + "properties": { + "chimOutType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of chimeric output\n\n- Junctions ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Junctions\"]`. " + }, + "chimSegmentMin": { + "type": "integer", + "description": "minimum length of chimeric segment length, if ==0, no chimeric output", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreMin": { + "type": "integer", + "description": "minimum total (summed) score of the chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimScoreDropMax": { + "type": "integer", + "description": "max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimScoreSeparation": { + "type": "integer", + "description": "minimum difference (separation) between the best chimeric score and the next one", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimScoreJunctionNonGTAG": { + "type": "integer", + "description": "penalty for a non-GT/AG chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + }, + "chimJunctionOverhangMin": { + "type": "integer", + "description": "minimum overhang for a chimeric junction", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimSegmentReadGapMax": { + "type": "integer", + "description": "maximum gap in the read sequence between chimeric segments", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "different filters for chimeric alignments\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"banGenomicN\"]`. " + }, + "chimMainSegmentMultNmax": { + "type": "integer", + "description": "maximum number of multi-alignments for the main chimeric segment", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "chimMultimapNmax": { + "type": "integer", + "description": "maximum number of chimeric multi-alignments\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "chimMultimapScoreRange": { + "type": "integer", + "description": "the score range for multi-mapping chimeras below the best chimeric score", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "chimNonchimScoreDropMin": { + "type": "integer", + "description": "to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value", + "help_text": "Type: `integer`, multiple: `False`, example: `20`. " + }, + "chimOutJunctionFormat": { + "type": "integer", + "description": "formatting type for the Chimeric.out.junction file\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + } + } + }, + "quantification of annotations": { + "title": "Quantification of Annotations", + "type": "object", + "description": "No description", + "properties": { + "quantMode": { + "type": "array", + "items": { + "type": "string" + }, + "description": "types of quantification requested\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "quantTranscriptomeBAMcompression": { + "type": "integer", + "description": "-2 to 10 transcriptome BAM compression level\n\n- -2 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "quantTranscriptomeBan": { + "type": "string", + "description": "prohibit various alignment type\n\n- IndelSoftclipSingleend ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"IndelSoftclipSingleend\"`. " + } + } + }, + "2-pass mapping": { + "title": "2-pass Mapping", + "type": "object", + "description": "No description", + "properties": { + "twopassMode": { + "type": "string", + "description": "2-pass mapping mode.\n\n- None ..", + "help_text": "Type: `string`, multiple: `False`. " + }, + "twopass1readsN": { + "type": "integer", + "description": "number of reads to process for the 1st step", + "help_text": "Type: `integer`, multiple: `False`, example: `-1`. " + } + } + }, + "wasp parameters": { + "title": "WASP parameters", + "type": "object", + "description": "No description", + "properties": { + "waspOutputMode": { + "type": "string", + "description": "WASP allele-specific output type", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "starsolo (single cell rna-seq) parameters": { + "title": "STARsolo (single cell RNA-seq) parameters", + "type": "object", + "description": "No description", + "properties": { + "soloType": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of single-cell RNA-seq\n\n- CB_UMI_Simple ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBwhitelist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file(s) with whitelist(s) of cell barcodes", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloCBstart": { + "type": "integer", + "description": "cell barcode start base", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBlen": { + "type": "integer", + "description": "cell barcode length", + "help_text": "Type: `integer`, multiple: `False`, example: `16`. " + }, + "soloUMIstart": { + "type": "integer", + "description": "UMI start base", + "help_text": "Type: `integer`, multiple: `False`, example: `17`. " + }, + "soloUMIlen": { + "type": "integer", + "description": "UMI length", + "help_text": "Type: `integer`, multiple: `False`, example: `10`. " + }, + "soloBarcodeReadLength": { + "type": "integer", + "description": "length of the barcode read\n\n- 1 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloBarcodeMate": { + "type": "integer", + "description": "identifies which read mate contains the barcode (CB+UMI) sequence\n\n- 0 ..", + "help_text": "Type: `integer`, multiple: `False`, example: `0`. " + }, + "soloCBposition": { + "type": "array", + "items": { + "type": "string" + }, + "description": "position of Cell Barcode(s) on the barcode read.\n\nPresently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.\nFormat for each barcode: startAnchor_startPosition_endAnchor_endPosition\nstart(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end\nstart(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base\nString for different barcodes are separated by space.\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloUMIposition": { + "type": "string", + "description": "position of the UMI on the barcode read, same as soloCBposition\n\nExample: inDrop (Zilionis et al, Nat", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterSequence": { + "type": "string", + "description": "adapter sequence to anchor barcodes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "soloAdapterMismatchesNmax": { + "type": "integer", + "description": "maximum number of mismatches allowed in adapter sequence.", + "help_text": "Type: `integer`, multiple: `False`, example: `1`. " + }, + "soloCBmatchWLtype": { + "type": "string", + "description": "matching the Cell Barcodes to the WhiteList\n\n- Exact ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"1MM_multi\"`. " + }, + "soloInputSAMattrBarcodeSeq": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .\nThis parameter is required when running STARsolo with input from SAM.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloInputSAMattrBarcodeQual": { + "type": "array", + "items": { + "type": "string" + }, + "description": "when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).\n\nFor instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .\nIf this parameter is '-' (default), the quality 'H' will be assigned to all bases.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloStrand": { + "type": "string", + "description": "strandedness of the solo libraries:\n\n- Unstranded ..", + "help_text": "Type: `string`, multiple: `False`, example: `\"Forward\"`. " + }, + "soloFeatures": { + "type": "array", + "items": { + "type": "string" + }, + "description": "genomic features for which the UMI counts per Cell Barcode are collected\n\n- Gene ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene\"]`. " + }, + "soloMultiMappers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "counting method for reads mapping to multiple genes\n\n- Unique ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Unique\"]`. " + }, + "soloUMIdedup": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI deduplication (collapsing) algorithm\n\n- 1MM_All ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1MM_All\"]`. " + }, + "soloUMIfiltering": { + "type": "array", + "items": { + "type": "string" + }, + "description": "type of UMI filtering (for reads uniquely mapping to genes)\n\n- - ..", + "help_text": "Type: `string`, multiple: `True`. " + }, + "soloOutFileNames": { + "type": "array", + "items": { + "type": "string" + }, + "description": "file names for STARsolo output:\n\nfile_name_prefix gene_names barcode_sequences cell_feature_count_matrix", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Solo.out/\";\"features.tsv\";\"barcodes.tsv\";\"matrix.mtx\"]`. " + }, + "soloCellFilter": { + "type": "array", + "items": { + "type": "string" + }, + "description": "cell filtering type and parameters\n\n- None ..", + "help_text": "Type: `string`, multiple: `True`, example: `[\"CellRanger2.2\";\"3000\";\"0.99\";\"10\"]`. " + }, + "soloOutFormatFeaturesGeneField3": { + "type": "array", + "items": { + "type": "string" + }, + "description": "field 3 in the Gene features.tsv file", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene Expression\"]`. " + }, + "soloCellReadStats": { + "type": "string", + "description": "Output reads statistics for each CB\n\n- Standard ..", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input/output" + }, + { + "$ref": "#/$defs/run parameters" + }, + { + "$ref": "#/$defs/genome parameters" + }, + { + "$ref": "#/$defs/splice junctions database" + }, + { + "$ref": "#/$defs/variation parameters" + }, + { + "$ref": "#/$defs/read parameters" + }, + { + "$ref": "#/$defs/read clipping" + }, + { + "$ref": "#/$defs/limits" + }, + { + "$ref": "#/$defs/output: general" + }, + { + "$ref": "#/$defs/output: sam and bam" + }, + { + "$ref": "#/$defs/bam processing" + }, + { + "$ref": "#/$defs/output wiggle" + }, + { + "$ref": "#/$defs/output filtering" + }, + { + "$ref": "#/$defs/output splice junctions (sj.out.tab)" + }, + { + "$ref": "#/$defs/output filtering: splice junctions" + }, + { + "$ref": "#/$defs/scoring" + }, + { + "$ref": "#/$defs/alignments and seeding" + }, + { + "$ref": "#/$defs/paired-end reads" + }, + { + "$ref": "#/$defs/windows, anchors, binning" + }, + { + "$ref": "#/$defs/chimeric alignments" + }, + { + "$ref": "#/$defs/quantification of annotations" + }, + { + "$ref": "#/$defs/2-pass mapping" + }, + { + "$ref": "#/$defs/wasp parameters" + }, + { + "$ref": "#/$defs/starsolo (single cell rna-seq) parameters" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/mapping/star_align_v273a/setup_logger.py b/target/nextflow/mapping/star_align_v273a/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/mapping/star_align_v273a/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/metadata/add_id/.config.vsh.yaml b/target/nextflow/metadata/add_id/.config.vsh.yaml new file mode 100644 index 00000000..2de2e2d9 --- /dev/null +++ b/target/nextflow/metadata/add_id/.config.vsh.yaml @@ -0,0 +1,273 @@ +name: "add_id" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_id" + description: "The input id." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_output" + description: "Name of the .obs column where to store the id." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Name of output MuData file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--make_observation_keys_unique" + description: "Join the id to the .obs index (.obs_names)." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add id of .obs. Also allows to make .obs_names (the .obs index) unique\ + \ \nby prefixing the values with an unique id per .h5mu file.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/add_id/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/metadata/add_id" + executable: "target/nextflow/metadata/add_id/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/metadata/add_id/main.nf b/target/nextflow/metadata/add_id/main.nf new file mode 100644 index 00000000..063d1308 --- /dev/null +++ b/target/nextflow/metadata/add_id/main.nf @@ -0,0 +1,3997 @@ +// add_id main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "add_id", + "namespace" : "metadata", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the input .h5mu.", + "example" : [ + "sample_path" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_id", + "description" : "The input id.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_output", + "description" : "Name of the .obs column where to store the id.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Name of output MuData file.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--make_observation_keys_unique", + "description" : "Join the id to the .obs index (.obs_names).", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/metadata/add_id/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/metadata/add_id", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from __future__ import annotations +import sys +from mudata import read_h5mu, MuData + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_id': $( if [ ! -z ${VIASH_PAR_INPUT_ID+x} ]; then echo "r'${VIASH_PAR_INPUT_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_output': $( if [ ! -z ${VIASH_PAR_OBS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OBS_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'make_observation_keys_unique': $( if [ ! -z ${VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE+x} ]; then echo "r'${VIASH_PAR_MAKE_OBSERVATION_KEYS_UNIQUE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def make_observation_keys_unique(sample_id: str, sample: MuData) -> None: + """ + Make the observation keys unique across all samples. At input, + the observation keys are unique within a sample. By adding the sample name + (unique for a sample) to each observation key, the observation key is made + unique across all samples as well. + """ + logger.info( + "Making observation keys unique across all " + "samples by appending prefix '%s' to the observation names.", + sample_id, + ) + sample.obs.index = f"{sample_id}_" + sample.obs.index + make_observation_keys_unique_per_mod(sample_id, sample) + logger.info("Done making observation keys unique.") + + +def make_observation_keys_unique_per_mod(sample_id: str, sample: MuData) -> None: + """ + Updating MuData.obs_names is not allowed (it is read-only). + So the observation keys for each modality has to be updated manually. + """ + for mod_name, mod in sample.mod.items(): + logger.info("Processing modality '%s'", mod_name) + mod.obs_names = f"{sample_id}_" + mod.obs_names + + +def main(): + logger.info("Reading input file '%s'.", par["input"]) + input_data = read_h5mu(par["input"]) + logger.info( + "Adding column '%s' to global .obs dataframe, populated with ID '%s'", + par["obs_output"], + par["input_id"], + ) + input_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding column to global .obs") + for mod_name, mod_data in input_data.mod.items(): + logger.info( + "Adding column '%s' to .obs dataframe for modality '%s', " + "populated with ID '%s'", + par["obs_output"], + mod_name, + par["input_id"], + ) + mod_data.obs[par["obs_output"]] = par["input_id"] + logger.info("Done adding per-modality columns.") + if par["make_observation_keys_unique"]: + make_observation_keys_unique(par["input_id"], input_data) + logger.info( + "Writing out data to '%s' with compression '%s'.", + par["output"], + par["output_compression"], + ) + input_data.write_h5mu(par["output"], compression=par["output_compression"]) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/metadata/add_id", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metadata/add_id/nextflow.config b/target/nextflow/metadata/add_id/nextflow.config new file mode 100644 index 00000000..5bad2308 --- /dev/null +++ b/target/nextflow/metadata/add_id/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'metadata/add_id' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/metadata/add_id/nextflow_labels.config b/target/nextflow/metadata/add_id/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/metadata/add_id/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/metadata/add_id/nextflow_schema.json b/target/nextflow/metadata/add_id/nextflow_schema.json new file mode 100644 index 00000000..83fcf941 --- /dev/null +++ b/target/nextflow/metadata/add_id/nextflow_schema.json @@ -0,0 +1,75 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "add_id", + "description": "Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input .h5mu.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"sample_path\"`. " + }, + "input_id": { + "type": "string", + "description": "The input id.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "obs_output": { + "type": "string", + "description": "Name of the .obs column where to store the id.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "output": { + "type": "string", + "format": "path", + "description": "Name of output MuData file.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "make_observation_keys_unique": { + "type": "boolean", + "description": "Join the id to the .obs index (.obs_names).", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/metadata/add_id/setup_logger.py b/target/nextflow/metadata/add_id/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/metadata/add_id/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/metadata/grep_annotation_column/.config.vsh.yaml b/target/nextflow/metadata/grep_annotation_column/.config.vsh.yaml new file mode 100644 index 00000000..4d52ab5b --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/.config.vsh.yaml @@ -0,0 +1,329 @@ +name: "grep_annotation_column" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + description: "Arguments related to the input dataset." + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the input .h5mu." + info: null + example: + - "sample_path" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_column" + description: "Column to query. If not specified, use .var_names or .obs_names,\ + \ depending on the value of --matrix" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input data to use when calculating fraction of observations that\ + \ match with the query. \nOnly used when --output_fraction_column is provided.\ + \ If not specified, .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to get the annotation matrix from.\n" + info: null + example: + - "rna" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--matrix" + description: "Matrix to fetch the column from that will be searched." + info: null + example: + - "var" + required: false + choices: + - "var" + - "obs" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Arguments related to how the output will be written." + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Location of the output MuData file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_match_column" + description: "Name of the column to write the result to." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_fraction_column" + description: "For the opposite axis, name of the column to write the fraction\ + \ of \nobservations that matches to the pattern.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Query options" + description: "Options related to the query" + arguments: + - type: "string" + name: "--regex_pattern" + description: "Regex to use to match with the input column." + info: null + example: + - "^[mM][tT]-" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform a regex lookup on a column from the annotation matrices .obs\ + \ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\ + \ (global .var or .obs).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/grep_annotation_column/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/metadata/grep_annotation_column" + executable: "target/nextflow/metadata/grep_annotation_column/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/metadata/grep_annotation_column/compress_h5mu.py b/target/nextflow/metadata/grep_annotation_column/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/metadata/grep_annotation_column/main.nf b/target/nextflow/metadata/grep_annotation_column/main.nf new file mode 100644 index 00000000..09451a21 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/main.nf @@ -0,0 +1,4136 @@ +// grep_annotation_column main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "grep_annotation_column", + "namespace" : "metadata", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "description" : "Arguments related to the input dataset.", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the input .h5mu.", + "example" : [ + "sample_path" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_column", + "description" : "Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input data to use when calculating fraction of observations that match with the query. \nOnly used when --output_fraction_column is provided. If not specified, .X is used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to get the annotation matrix from.\n", + "example" : [ + "rna" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--matrix", + "description" : "Matrix to fetch the column from that will be searched.", + "example" : [ + "var" + ], + "required" : false, + "choices" : [ + "var", + "obs" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Arguments related to how the output will be written.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Location of the output MuData file.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_match_column", + "description" : "Name of the column to write the result to.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_fraction_column", + "description" : "For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Query options", + "description" : "Options related to the query", + "arguments" : [ + { + "type" : "string", + "name" : "--regex_pattern", + "description" : "Regex to use to match with the input column.", + "example" : [ + "^[mM][tT]-" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/metadata/grep_annotation_column/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/metadata/grep_annotation_column", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +from pathlib import Path +from operator import attrgetter +from pandas import Series +import scipy as sc +import re +import numpy as np + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_column': $( if [ ! -z ${VIASH_PAR_INPUT_COLUMN+x} ]; then echo "r'${VIASH_PAR_INPUT_COLUMN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'matrix': $( if [ ! -z ${VIASH_PAR_MATRIX+x} ]; then echo "r'${VIASH_PAR_MATRIX//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_match_column': $( if [ ! -z ${VIASH_PAR_OUTPUT_MATCH_COLUMN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MATCH_COLUMN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_fraction_column': $( if [ ! -z ${VIASH_PAR_OUTPUT_FRACTION_COLUMN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_FRACTION_COLUMN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'regex_pattern': $( if [ ! -z ${VIASH_PAR_REGEX_PATTERN+x} ]; then echo "r'${VIASH_PAR_REGEX_PATTERN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def describe_array(arr, msg): + # Note: sc.stats returns a DescribeResult NamedTuple. For NamedTuples, + # the _asdict method is public facing even though it starts with an underscore. + description = sc.stats.describe(arr)._asdict() + logger.info( + "%s:\\\\nshape: %s\\\\nmean: %s\\\\nnobs: %s\\\\n" + "variance: %s\\\\nmin: %s\\\\nmax: %s\\\\ncontains na: %s\\\\ndtype: %s\\\\ncontains 0: %s", + msg, + arr.shape, + description["mean"], + description["nobs"], + description["variance"], + description["minmax"][0], + description["minmax"][1], + np.isnan(arr).any(), + arr.dtype, + (arr == 0).any(), + ) + + +def main(par): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + logger.info(f"Compiling regular expression '{par['regex_pattern']}'.") + try: + compiled_regex = re.compile(par["regex_pattern"]) + except (TypeError, re.error) as e: + raise ValueError( + f"{par['regex_pattern']} is not a valid regular expression pattern." + ) from e + else: + if compiled_regex.groups: + raise NotImplementedError( + "Using match groups is not supported by this component." + ) + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + + modality_data = mu.read_h5ad(input_file, mod=mod_name) + logger.info("Reading input file done.") + logger.info("Using annotation dataframe '%s'.", par["matrix"]) + annotation_matrix = getattr(modality_data, par["matrix"]) + default_column = {"var": attrgetter("var_names"), "obs": attrgetter("obs_names")} + if par["input_column"]: + logger.info("Input column '%s' was specified.", par["input_column"]) + try: + annotation_column = annotation_matrix[par["input_column"]] + except KeyError as e: + raise ValueError( + f"Column {par['input_column']} could not be found for modality " + f"{par['modality']}. Available columns:" + f" {','.join(annotation_matrix.columns.to_list())}" + ) from e + else: + logger.info(f"No input column specified, using '.{par['matrix']}_names'") + annotation_column = default_column[par["matrix"]](modality_data).to_series() + logger.info("Applying regex search.") + grep_result = annotation_column.str.contains(par["regex_pattern"], regex=True) + logger.info("Search results: %s", grep_result.value_counts()) + # A Series object cannot be used as an indexer for a scipy sparse array + # when the data type is a pandas boolean extension array because + # extension arrays do not define .nonzero() + # See https://github.com/pandas-dev/pandas/issues/46025 + grep_result = grep_result.to_numpy(dtype="bool", na_value=False) + + other_axis_attribute = {"var": "obs", "obs": "var"} + if par["output_fraction_column"]: + logger.info( + "Enabled writing the fraction of values that matches to the pattern." + ) + input_layer = ( + modality_data.X + if not par["input_layer"] + else modality_data.layers[par["input_layer"]] + ) + totals = np.ravel(input_layer.sum(axis=1)) + describe_array(totals, "Summary of total counts for layer") + counts_for_matches = np.ravel(input_layer[:, grep_result].sum(axis=1)) + describe_array(counts_for_matches, "Summary of counts matching grep") + with np.errstate(all="raise"): + pct_matching = np.divide( + counts_for_matches, + totals, + out=np.zeros_like(totals, dtype=np.float64), + where=(~np.isclose(totals, np.zeros_like(totals))), + ) + logger.info("Testing wether or not fractions data contains NA.") + assert ~np.isnan(pct_matching).any(), "Fractions should not contain NA." + logger.info("Fraction statistics: \\\\n%s", Series(pct_matching).describe()) + pct_matching = np.where(np.isclose(pct_matching, 0, atol=1e-6), 0, pct_matching) + pct_matching = np.where(np.isclose(pct_matching, 1, atol=1e-6), 1, pct_matching) + assert (np.logical_and(pct_matching >= 0, pct_matching <= 1)).all(), ( + "Fractions are not within bounds, please report this as a bug" + ) + output_matrix = other_axis_attribute[par["matrix"]] + logger.info( + "Writing fractions to matrix '%s', column '%s'", + output_matrix, + par["output_fraction_column"], + ) + getattr(modality_data, output_matrix)[par["output_fraction_column"]] = ( + pct_matching + ) + logger.info( + "Adding values that matched the pattern to '%s', column '%s'", + par["matrix"], + par["output_match_column"], + ) + getattr(modality_data, par["matrix"])[par["output_match_column"]] = grep_result + logger.info( + "Writing out data to '%s' with compression '%s'.", + output_file, + par["output_compression"], + ) + write_h5ad_to_h5mu_with_compression( + output_file, par["input"], mod_name, modality_data, par["output_compression"] + ) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/metadata/grep_annotation_column", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metadata/grep_annotation_column/nextflow.config b/target/nextflow/metadata/grep_annotation_column/nextflow.config new file mode 100644 index 00000000..14ceff25 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'metadata/grep_annotation_column' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/metadata/grep_annotation_column/nextflow_labels.config b/target/nextflow/metadata/grep_annotation_column/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/metadata/grep_annotation_column/nextflow_schema.json b/target/nextflow/metadata/grep_annotation_column/nextflow_schema.json new file mode 100644 index 00000000..ff552cb4 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/nextflow_schema.json @@ -0,0 +1,117 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "grep_annotation_column", + "description": "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "Arguments related to the input dataset.", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input .h5mu.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"sample_path\"`. " + }, + "input_column": { + "type": "string", + "description": "Column to query", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_layer": { + "type": "string", + "description": "Input data to use when calculating fraction of observations that match with the query", + "help_text": "Type: `string`, multiple: `False`. " + }, + "modality": { + "type": "string", + "description": "Which modality to get the annotation matrix from.\n", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"rna\"`. " + }, + "matrix": { + "type": "string", + "description": "Matrix to fetch the column from that will be searched.", + "help_text": "Type: `string`, multiple: `False`, example: `\"var\"`, choices: ``var`, `obs``. ", + "enum": [ + "var", + "obs" + ] + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Arguments related to how the output will be written.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Location of the output MuData file.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_match_column": { + "type": "string", + "description": "Name of the column to write the result to.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "output_fraction_column": { + "type": "string", + "description": "For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "query options": { + "title": "Query options", + "type": "object", + "description": "Options related to the query", + "properties": { + "regex_pattern": { + "type": "string", + "description": "Regex to use to match with the input column.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"^[mM][tT]-\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/query options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/metadata/grep_annotation_column/setup_logger.py b/target/nextflow/metadata/grep_annotation_column/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/metadata/grep_annotation_column/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/metadata/join_csv/.config.vsh.yaml b/target/nextflow/metadata/join_csv/.config.vsh.yaml new file mode 100644 index 00000000..270e84cc --- /dev/null +++ b/target/nextflow/metadata/join_csv/.config.vsh.yaml @@ -0,0 +1,304 @@ +name: "join_csv" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "MuData Input" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_key" + description: "Obs column name where the sample id can be found for each observation\ + \ to join on.\nUseful when adding metadata to concatenated samples.\nMutually\ + \ exclusive with `--var_key`.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_key" + description: "Var column name where the sample id can be found for each variable\ + \ to join on.\nMutually exclusive with `--obs_key`.\"\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "MuData Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metadata Input" + arguments: + - type: "file" + name: "--input_csv" + description: ".csv file containing metadata" + info: null + example: + - "metadata.csv" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--csv_key" + description: "column of the the csv that corresponds to the sample id." + info: null + default: + - "id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Join a csv containing metadata to the .obs or .var field of a mudata\ + \ file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/join_csv/config.vsh.yml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/metadata/join_csv" + executable: "target/nextflow/metadata/join_csv/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/metadata/join_csv/compress_h5mu.py b/target/nextflow/metadata/join_csv/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/metadata/join_csv/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/metadata/join_csv/main.nf b/target/nextflow/metadata/join_csv/main.nf new file mode 100644 index 00000000..560cabf3 --- /dev/null +++ b/target/nextflow/metadata/join_csv/main.nf @@ -0,0 +1,4013 @@ +// join_csv main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "join_csv", + "namespace" : "metadata", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "MuData Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_key", + "description" : "Obs column name where the sample id can be found for each observation to join on.\nUseful when adding metadata to concatenated samples.\nMutually exclusive with `--var_key`.\\"\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_key", + "description" : "Var column name where the sample id can be found for each variable to join on.\nMutually exclusive with `--obs_key`.\\"\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "MuData Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Metadata Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input_csv", + "description" : ".csv file containing metadata", + "example" : [ + "metadata.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--csv_key", + "description" : "column of the the csv that corresponds to the sample id.", + "default" : [ + "id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Join a csv containing metadata to the .obs or .var field of a mudata file.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/metadata/join_csv/config.vsh.yml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/metadata/join_csv", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_key': $( if [ ! -z ${VIASH_PAR_OBS_KEY+x} ]; then echo "r'${VIASH_PAR_OBS_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_key': $( if [ ! -z ${VIASH_PAR_VAR_KEY+x} ]; then echo "r'${VIASH_PAR_VAR_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_csv': $( if [ ! -z ${VIASH_PAR_INPUT_CSV+x} ]; then echo "r'${VIASH_PAR_INPUT_CSV//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'csv_key': $( if [ ! -z ${VIASH_PAR_CSV_KEY+x} ]; then echo "r'${VIASH_PAR_CSV_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +if par["obs_key"] and par["var_key"]: + raise ValueError("--obs_key can not be used in conjuction with --var_key.") +if not (par["obs_key"] or par["var_key"]): + raise ValueError("Must define either --obs_key or --var_key") + +logger.info("Read metadata csv from file") +metadata = pd.read_csv(par["input_csv"], sep=",", header=0, index_col=par["csv_key"]) +metadata.fillna("", inplace=True) + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining csv to mudata") +matrix = "var" if par["var_key"] else "obs" +matrix_sample_column_name = par["var_key"] if par["var_key"] else par["obs_key"] +original_matrix = getattr(mod_data, matrix) +sample_ids = original_matrix[matrix_sample_column_name] + +try: + new_columns = metadata.loc[sample_ids.tolist()] +except KeyError as e: + raise KeyError( + f"Not all sample IDs selected from {matrix} " + "(using the column selected with --var_key or --obs_key) were found in " + "the csv file." + ) from e +new_matrix = pd.concat( + [original_matrix.reset_index(drop=True), new_columns.reset_index(drop=True)], axis=1 +).set_axis(original_matrix.index) +setattr(mod_data, matrix, new_matrix) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/metadata/join_csv", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metadata/join_csv/nextflow.config b/target/nextflow/metadata/join_csv/nextflow.config new file mode 100644 index 00000000..dcc51884 --- /dev/null +++ b/target/nextflow/metadata/join_csv/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'metadata/join_csv' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Join a csv containing metadata to the .obs or .var field of a mudata file.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/metadata/join_csv/nextflow_labels.config b/target/nextflow/metadata/join_csv/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/metadata/join_csv/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/metadata/join_csv/nextflow_schema.json b/target/nextflow/metadata/join_csv/nextflow_schema.json new file mode 100644 index 00000000..adc8e312 --- /dev/null +++ b/target/nextflow/metadata/join_csv/nextflow_schema.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "join_csv", + "description": "Join a csv containing metadata to the .obs or .var field of a mudata file.", + "type": "object", + "$defs": { + "mudata input": { + "title": "MuData Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_key": { + "type": "string", + "description": "Obs column name where the sample id can be found for each observation to join on.\nUseful when adding metadata to concatenated samples.\nMutually exclusive with `--var_key`.\"\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_key": { + "type": "string", + "description": "Var column name where the sample id can be found for each variable to join on.\nMutually exclusive with `--obs_key`.\"\n", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "mudata output": { + "title": "MuData Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "metadata input": { + "title": "Metadata Input", + "type": "object", + "description": "No description", + "properties": { + "input_csv": { + "type": "string", + "format": "path", + "exists": true, + "description": ".csv file containing metadata", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"metadata.csv\"`. " + }, + "csv_key": { + "type": "string", + "description": "column of the the csv that corresponds to the sample id.", + "help_text": "Type: `string`, multiple: `False`, default: `\"id\"`. ", + "default": "id" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/mudata input" + }, + { + "$ref": "#/$defs/mudata output" + }, + { + "$ref": "#/$defs/metadata input" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/metadata/join_csv/setup_logger.py b/target/nextflow/metadata/join_csv/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/metadata/join_csv/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/metadata/join_uns_to_obs/.config.vsh.yaml b/target/nextflow/metadata/join_uns_to_obs/.config.vsh.yaml new file mode 100644 index 00000000..686353f5 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/.config.vsh.yaml @@ -0,0 +1,252 @@ +name: "join_uns_to_obs" +namespace: "metadata" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_key" + description: "Lookup key from .uns pointing to the input data.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object." + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Join a data frame of length 1 (1 row index value) in .uns containing\ + \ metadata to the .obs of a mudata file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/join_uns_to_obs/config.vsh.yml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/metadata/join_uns_to_obs" + executable: "target/nextflow/metadata/join_uns_to_obs/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/metadata/join_uns_to_obs/compress_h5mu.py b/target/nextflow/metadata/join_uns_to_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/metadata/join_uns_to_obs/main.nf b/target/nextflow/metadata/join_uns_to_obs/main.nf new file mode 100644 index 00000000..8b9f192f --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/main.nf @@ -0,0 +1,3925 @@ +// join_uns_to_obs main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "join_uns_to_obs", + "namespace" : "metadata", + "version" : "main", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_key", + "description" : "Lookup key from .uns pointing to the input data.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Join a data frame of length 1 (1 row index value) in .uns containing metadata to the .obs of a mudata file.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/metadata/join_uns_to_obs/config.vsh.yml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/metadata/join_uns_to_obs", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import pandas as pd +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_key': $( if [ ! -z ${VIASH_PAR_UNS_KEY+x} ]; then echo "r'${VIASH_PAR_UNS_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Read modality %s from file %s", par["modality"], par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Joining uns to obs") +# get data frame +uns_df = mod_data.uns[par["uns_key"]] + +# check for overlapping colnames +intersect_keys = uns_df.keys().intersection(mod_data.obs.keys()) +obs_drop = mod_data.obs.drop(intersect_keys, axis=1) + +# create data frame to join +uns_df_rep = uns_df.loc[uns_df.index.repeat(mod_data.n_obs)] +uns_df_rep.index = mod_data.obs_names + +# create new obs +mod_data.obs = pd.concat([obs_drop, uns_df_rep], axis=1) + +logger.info("Write output to mudata file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/metadata/join_uns_to_obs", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metadata/join_uns_to_obs/nextflow.config b/target/nextflow/metadata/join_uns_to_obs/nextflow.config new file mode 100644 index 00000000..10500cf0 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'metadata/join_uns_to_obs' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Join a data frame of length 1 (1 row index value) in .uns containing metadata to the .obs of a mudata file.' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/metadata/join_uns_to_obs/nextflow_labels.config b/target/nextflow/metadata/join_uns_to_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/metadata/join_uns_to_obs/nextflow_schema.json b/target/nextflow/metadata/join_uns_to_obs/nextflow_schema.json new file mode 100644 index 00000000..95bf0d51 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/nextflow_schema.json @@ -0,0 +1,69 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "join_uns_to_obs", + "description": "Join a data frame of length 1 (1 row index value) in .uns containing metadata to the .obs of a mudata file.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "uns_key": { + "type": "string", + "description": "Lookup key from .uns pointing to the input data.\n", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/metadata/join_uns_to_obs/setup_logger.py b/target/nextflow/metadata/join_uns_to_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/metadata/join_uns_to_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/metadata/move_obsm_to_obs/.config.vsh.yaml b/target/nextflow/metadata/move_obsm_to_obs/.config.vsh.yaml new file mode 100644 index 00000000..d6f7f9bb --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/.config.vsh.yaml @@ -0,0 +1,271 @@ +name: "move_obsm_to_obs" +namespace: "metadata" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "MuData Input" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_key" + description: "Key of a data structure to move from `.obsm` to `.obs`." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "MuData Output" + arguments: + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Move a matrix from .obsm to .obs. Newly created columns in .obs will\ + \ \nbe created from the .obsm key suffixed with an underscore and the name of the\ + \ columns\nof the specified .obsm matrix.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/metadata/move_obsm_to_obs/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/metadata/move_obsm_to_obs" + executable: "target/nextflow/metadata/move_obsm_to_obs/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/metadata/move_obsm_to_obs/compress_h5mu.py b/target/nextflow/metadata/move_obsm_to_obs/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/metadata/move_obsm_to_obs/main.nf b/target/nextflow/metadata/move_obsm_to_obs/main.nf new file mode 100644 index 00000000..845bca01 --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/main.nf @@ -0,0 +1,3986 @@ +// move_obsm_to_obs main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "move_obsm_to_obs", + "namespace" : "metadata", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "MuData Input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_key", + "description" : "Key of a data structure to move from `.obsm` to `.obs`.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "MuData Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Move a matrix from .obsm to .obs. Newly created columns in .obs will \nbe created from the .obsm key suffixed with an underscore and the name of the columns\nof the specified .obsm matrix.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/metadata/move_obsm_to_obs/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/metadata/move_obsm_to_obs", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from functools import partial +from pandas.errors import MergeError +from mudata import read_h5ad + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_key': $( if [ ! -z ${VIASH_PAR_OBSM_KEY+x} ]; then echo "r'${VIASH_PAR_OBSM_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from file %s", par["modality"], par["input"]) +try: + mod_data = read_h5ad(par["input"], mod=par["modality"]) +except KeyError: + raise ValueError(f"Modality {par['modality']} does not exist.") + +logger.info("Moving .obm key %s", par["obsm_key"]) +try: + obsm_matrix = mod_data.obsm[par["obsm_key"]].copy() +except KeyError: + raise ValueError( + f".obsm key {par['obsm_key']} was not found in " + f".obsm slot for modality {par['modality']}." + ) + + +obsm_matrix.rename( + partial("{key}_{}".format, key=par["obsm_key"]), + axis="columns", + copy=False, + inplace=True, +) + +original_n_obs = len(mod_data.obs) +try: + logger.info(f".obs names: {mod_data.obs_names}") + logger.info(f".obsm index: {obsm_matrix.index}") + new_obs = mod_data.obs.drop(obsm_matrix.columns, axis=1, errors="ignore") + new_obs = new_obs.merge( + obsm_matrix, + how="left", + validate="one_to_one", + left_index=True, + right_index=True, + ) + mod_data.obs = new_obs +except MergeError: + raise ValueError( + f"Could not join .obsm matrix at {par['obsm_key']} to .obs because there " + "are some observation that are not overlapping between the two matrices " + "(indexes should overlap). This is either a bug or your mudata file is corrupt." + ) +del mod_data.obsm[par["obsm_key"]] + +logger.info( + "Write output to %s with compression %s", par["output"], par["output_compression"] +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/metadata/move_obsm_to_obs", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/metadata/move_obsm_to_obs/nextflow.config b/target/nextflow/metadata/move_obsm_to_obs/nextflow.config new file mode 100644 index 00000000..489040aa --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'metadata/move_obsm_to_obs' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Move a matrix from .obsm to .obs. Newly created columns in .obs will \nbe created from the .obsm key suffixed with an underscore and the name of the columns\nof the specified .obsm matrix.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/metadata/move_obsm_to_obs/nextflow_labels.config b/target/nextflow/metadata/move_obsm_to_obs/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/metadata/move_obsm_to_obs/nextflow_schema.json b/target/nextflow/metadata/move_obsm_to_obs/nextflow_schema.json new file mode 100644 index 00000000..44f13831 --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/nextflow_schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "move_obsm_to_obs", + "description": "Move a matrix from .obsm to .obs. Newly created columns in .obs will \nbe created from the .obsm key suffixed with an underscore and the name of the columns\nof the specified .obsm matrix.\n", + "type": "object", + "$defs": { + "mudata input": { + "title": "MuData Input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obsm_key": { + "type": "string", + "description": "Key of a data structure to move from `.obsm` to `.obs`.", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "mudata output": { + "title": "MuData Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/mudata input" + }, + { + "$ref": "#/$defs/mudata output" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/metadata/move_obsm_to_obs/setup_logger.py b/target/nextflow/metadata/move_obsm_to_obs/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/metadata/move_obsm_to_obs/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/neighbors/bbknn/.config.vsh.yaml b/target/nextflow/neighbors/bbknn/.config.vsh.yaml new file mode 100644 index 00000000..cd807337 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/.config.vsh.yaml @@ -0,0 +1,357 @@ +name: "bbknn" +namespace: "neighbors" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "The dimensionality reduction in `.obsm` to use for neighbour detection.\ + \ Defaults to X_pca." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: ".obs column name discriminating between your batches." + info: null + default: + - "batch" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output .h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "Mandatory .uns slot to store various neighbor output objects." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors_within_batch" + description: "How many top neighbours to report for each batch; total number of\ + \ neighbours in the initial k-nearest-neighbours computation will be this number\ + \ times the number of batches." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_pcs" + description: "How many dimensions (in case of PCA, principal components) to use\ + \ in the analysis." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_trim" + description: "Trim the neighbours of each cell to these many top connectivities.\ + \ May help with population independence and improve the tidiness of clustering.\ + \ The lower the value the more independent the individual populations, at the\ + \ cost of more conserved batch effect. If `None` (default), sets the parameter\ + \ value automatically to 10 times `neighbors_within_batch` times the number\ + \ of batches. Set to 0 to skip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "BBKNN network generation\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "highmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "bbknn" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/neighbors/bbknn/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/neighbors/bbknn" + executable: "target/nextflow/neighbors/bbknn/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/neighbors/bbknn/compress_h5mu.py b/target/nextflow/neighbors/bbknn/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/neighbors/bbknn/main.nf b/target/nextflow/neighbors/bbknn/main.nf new file mode 100644 index 00000000..6ad24796 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/main.nf @@ -0,0 +1,4056 @@ +// bbknn main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bbknn", + "namespace" : "neighbors", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : "The dimensionality reduction in `.obsm` to use for neighbour detection. Defaults to X_pca.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch", + "description" : ".obs column name discriminating between your batches.", + "default" : [ + "batch" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output .h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_output", + "description" : "Mandatory .uns slot to store various neighbor output objects.", + "default" : [ + "neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_neighbors_within_batch", + "description" : "How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches.", + "default" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_pcs", + "description" : "How many dimensions (in case of PCA, principal components) to use in the analysis.", + "default" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_trim", + "description" : "Trim the neighbours of each cell to these many top connectivities. May help with population independence and improve the tidiness of clustering. The lower the value the more independent the individual populations, at the cost of more conserved batch effect. If `None` (default), sets the parameter value automatically to 10 times `neighbors_within_batch` times the number of batches. Set to 0 to skip.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "BBKNN network generation\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "highmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "build-essential" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "bbknn" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/neighbors/bbknn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/neighbors/bbknn", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import bbknn +from mudata import read_h5ad + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch': $( if [ ! -z ${VIASH_PAR_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsp_distances': $( if [ ! -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then echo "r'${VIASH_PAR_OBSP_DISTANCES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_neighbors_within_batch': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS_WITHIN_BATCH//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_pcs': $( if [ ! -z ${VIASH_PAR_N_PCS+x} ]; then echo "int(r'${VIASH_PAR_N_PCS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_trim': $( if [ ! -z ${VIASH_PAR_N_TRIM+x} ]; then echo "int(r'${VIASH_PAR_N_TRIM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +adata = read_h5ad(par["input"], mod=par["modality"]) + +# copy data +tmp_adata = adata.copy() +bbknn.bbknn( + tmp_adata, + use_rep=par["obsm_input"], + batch_key=par["obs_batch"], + neighbors_within_batch=par["n_neighbors_within_batch"], + n_pcs=par["n_pcs"], + trim=par["n_trim"], +) + +# store output +adata.obsp[par["obsp_connectivities"]] = tmp_adata.obsp["connectivities"] +adata.obsp[par["obsp_distances"]] = tmp_adata.obsp["distances"] +adata.uns[par["uns_output"]] = tmp_adata.uns["neighbors"] +adata.uns[par["uns_output"]]["distances_key"] = par["obsp_distances"] +adata.uns[par["uns_output"]]["connectivities_key"] = par["obsp_connectivities"] + +# write to file +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/neighbors/bbknn", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "highmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/neighbors/bbknn/nextflow.config b/target/nextflow/neighbors/bbknn/nextflow.config new file mode 100644 index 00000000..5458cf81 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'neighbors/bbknn' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'BBKNN network generation\n' + author = 'Dries De Maeyer, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/neighbors/bbknn/nextflow_labels.config b/target/nextflow/neighbors/bbknn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/neighbors/bbknn/nextflow_schema.json b/target/nextflow/neighbors/bbknn/nextflow_schema.json new file mode 100644 index 00000000..c5d97479 --- /dev/null +++ b/target/nextflow/neighbors/bbknn/nextflow_schema.json @@ -0,0 +1,111 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bbknn", + "description": "BBKNN network generation\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obsm_input": { + "type": "string", + "description": "The dimensionality reduction in `.obsm` to use for neighbour detection", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obs_batch": { + "type": "string", + "description": ".obs column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"batch\"`. ", + "default": "batch" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output .h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "uns_output": { + "type": "string", + "description": "Mandatory .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ", + "default": "neighbors" + }, + "obsp_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"distances\"`. ", + "default": "distances" + }, + "obsp_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ", + "default": "connectivities" + }, + "n_neighbors_within_batch": { + "type": "integer", + "description": "How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches.", + "help_text": "Type: `integer`, multiple: `False`, default: `3`. ", + "default": 3 + }, + "n_pcs": { + "type": "integer", + "description": "How many dimensions (in case of PCA, principal components) to use in the analysis.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "n_trim": { + "type": "integer", + "description": "Trim the neighbours of each cell to these many top connectivities", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/neighbors/find_neighbors/.config.vsh.yaml b/target/nextflow/neighbors/find_neighbors/.config.vsh.yaml new file mode 100644 index 00000000..9ee4b27c --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/.config.vsh.yaml @@ -0,0 +1,385 @@ +name: "find_neighbors" +namespace: "neighbors" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "Which .obsm slot to use as a starting PCA embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file containing the found neighbors." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "Mandatory .uns slot to store various neighbor output objects." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--metric" + description: "The distance metric to be used in the generation of the nearest\ + \ neighborhood network." + info: null + default: + - "euclidean" + required: false + choices: + - "cityblock" + - "cosine" + - "euclidean" + - "l1" + - "l2" + - "manhattan" + - "braycurtis" + - "canberra" + - "chebyshev" + - "correlation" + - "dice" + - "hamming" + - "jaccard" + - "kulsinski" + - "mahalanobis" + - "minkowski" + - "rogerstanimoto" + - "russellrao" + - "seuclidean" + - "sokalmichener" + - "sokalsneath" + - "sqeuclidean" + - "yule" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--num_neighbors" + description: "The size of local neighborhood (in terms of number of neighboring\ + \ data points) used for manifold approximation. Larger values result in more\ + \ global views of the manifold, while smaller values result in more local data\ + \ being preserved. In general values should be in the range 2 to 100. If knn\ + \ is True, number of nearest neighbors to be searched. If knn is False, a Gaussian\ + \ kernel width is set to the distance of the n_neighbors neighbor." + info: null + default: + - 15 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "A random seed." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Compute a neighborhood graph of observations [McInnes18].\n\nThe neighbor\ + \ search efficiency of this heavily relies on UMAP [McInnes18], which also provides\ + \ a method for estimating connectivities of data points - the connectivity of the\ + \ manifold (method=='umap'). If method=='gauss', connectivities are computed according\ + \ to [Coifman05], in the adaption of [Haghverdi16].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + - "middisk" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/neighbors/find_neighbors/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/neighbors/find_neighbors" + executable: "target/nextflow/neighbors/find_neighbors/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/neighbors/find_neighbors/compress_h5mu.py b/target/nextflow/neighbors/find_neighbors/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/neighbors/find_neighbors/main.nf b/target/nextflow/neighbors/find_neighbors/main.nf new file mode 100644 index 00000000..d94184e7 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/main.nf @@ -0,0 +1,4110 @@ +// find_neighbors main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) +// * Robrecht Cannoodt (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "find_neighbors", + "namespace" : "neighbors", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : "Which .obsm slot to use as a starting PCA embedding.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file containing the found neighbors.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_output", + "description" : "Mandatory .uns slot to store various neighbor output objects.", + "default" : [ + "neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--metric", + "description" : "The distance metric to be used in the generation of the nearest neighborhood network.", + "default" : [ + "euclidean" + ], + "required" : false, + "choices" : [ + "cityblock", + "cosine", + "euclidean", + "l1", + "l2", + "manhattan", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--num_neighbors", + "description" : "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. If knn is True, number of nearest neighbors to be searched. If knn is False, a Gaussian kernel width is set to the distance of the n_neighbors neighbor.", + "default" : [ + 15 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "A random seed.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Compute a neighborhood graph of observations [McInnes18].\n\nThe neighbor search efficiency of this heavily relies on UMAP [McInnes18], which also provides a method for estimating connectivities of data points - the connectivity of the manifold (method=='umap'). If method=='gauss', connectivities are computed according to [Coifman05], in the adaption of [Haghverdi16].\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "midmem", + "middisk" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/neighbors/find_neighbors/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/neighbors/find_neighbors", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import numpy as np + +numpy_module = sys.modules["numpy"] +numpy_module.float_ = np.float64 +sys.modules["numpy"] = numpy_module + +import mudata as mu +import scanpy as sc +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_input': $( if [ ! -z ${VIASH_PAR_OBSM_INPUT+x} ]; then echo "r'${VIASH_PAR_OBSM_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'uns_output': $( if [ ! -z ${VIASH_PAR_UNS_OUTPUT+x} ]; then echo "r'${VIASH_PAR_UNS_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsp_distances': $( if [ ! -z ${VIASH_PAR_OBSP_DISTANCES+x} ]; then echo "r'${VIASH_PAR_OBSP_DISTANCES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsp_connectivities': $( if [ ! -z ${VIASH_PAR_OBSP_CONNECTIVITIES+x} ]; then echo "r'${VIASH_PAR_OBSP_CONNECTIVITIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'metric': $( if [ ! -z ${VIASH_PAR_METRIC+x} ]; then echo "r'${VIASH_PAR_METRIC//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'num_neighbors': $( if [ ! -z ${VIASH_PAR_NUM_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_NUM_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("Computing a neighborhood graph.") +neighbors = sc.Neighbors(adata) +neighbors.compute_neighbors( + n_neighbors=par["num_neighbors"], + use_rep=par["obsm_input"], + metric=par["metric"], + random_state=par["seed"], + method="umap", +) + +adata.uns[par["uns_output"]] = { + "connectivities_key": par["obsp_connectivities"], + "distances_key": par["obsp_distances"], + "params": { + "n_neighbors": neighbors.n_neighbors, + "method": "umap", + "random_state": par["seed"], + "metric": par["metric"], + "use_rep": par["obsm_input"], + }, +} + +adata.obsp[par["obsp_distances"]] = neighbors.distances +adata.obsp[par["obsp_connectivities"]] = neighbors.connectivities + + +logger.info("Writing to %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/neighbors/find_neighbors", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "midmem", + "middisk" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/neighbors/find_neighbors/nextflow.config b/target/nextflow/neighbors/find_neighbors/nextflow.config new file mode 100644 index 00000000..a034dbc7 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'neighbors/find_neighbors' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Compute a neighborhood graph of observations [McInnes18].\n\nThe neighbor search efficiency of this heavily relies on UMAP [McInnes18], which also provides a method for estimating connectivities of data points - the connectivity of the manifold (method==\'umap\'). If method==\'gauss\', connectivities are computed according to [Coifman05], in the adaption of [Haghverdi16].\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/neighbors/find_neighbors/nextflow_labels.config b/target/nextflow/neighbors/find_neighbors/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/neighbors/find_neighbors/nextflow_schema.json b/target/nextflow/neighbors/find_neighbors/nextflow_schema.json new file mode 100644 index 00000000..1f81de26 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/nextflow_schema.json @@ -0,0 +1,131 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "find_neighbors", + "description": "Compute a neighborhood graph of observations [McInnes18].\n\nThe neighbor search efficiency of this heavily relies on UMAP [McInnes18], which also provides a method for estimating connectivities of data points - the connectivity of the manifold (method=='umap'). If method=='gauss', connectivities are computed according to [Coifman05], in the adaption of [Haghverdi16].\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obsm_input": { + "type": "string", + "description": "Which .obsm slot to use as a starting PCA embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file containing the found neighbors.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "uns_output": { + "type": "string", + "description": "Mandatory .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ", + "default": "neighbors" + }, + "obsp_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"distances\"`. ", + "default": "distances" + }, + "obsp_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ", + "default": "connectivities" + }, + "metric": { + "type": "string", + "description": "The distance metric to be used in the generation of the nearest neighborhood network.", + "help_text": "Type: `string`, multiple: `False`, default: `\"euclidean\"`, choices: ``cityblock`, `cosine`, `euclidean`, `l1`, `l2`, `manhattan`, `braycurtis`, `canberra`, `chebyshev`, `correlation`, `dice`, `hamming`, `jaccard`, `kulsinski`, `mahalanobis`, `minkowski`, `rogerstanimoto`, `russellrao`, `seuclidean`, `sokalmichener`, `sokalsneath`, `sqeuclidean`, `yule``. ", + "enum": [ + "cityblock", + "cosine", + "euclidean", + "l1", + "l2", + "manhattan", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule" + ], + "default": "euclidean" + }, + "num_neighbors": { + "type": "integer", + "description": "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation", + "help_text": "Type: `integer`, multiple: `False`, default: `15`. ", + "default": 15 + }, + "seed": { + "type": "integer", + "description": "A random seed.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/neighbors/find_neighbors/setup_logger.py b/target/nextflow/neighbors/find_neighbors/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/neighbors/find_neighbors/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/process_10xh5/filter_10xh5/.config.vsh.yaml b/target/nextflow/process_10xh5/filter_10xh5/.config.vsh.yaml new file mode 100644 index 00000000..e5a7122a --- /dev/null +++ b/target/nextflow/process_10xh5/filter_10xh5/.config.vsh.yaml @@ -0,0 +1,273 @@ +name: "filter_10xh5" +namespace: "process_10xh5" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "An h5 file from the 10x genomics website." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output h5 file." + info: null + example: + - "pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_library_size" + description: "Minimum library size." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_gene" + description: "Minimum number of cells per gene." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--keep_feature_types" + description: "Specify which feature types will never be filtered out" + info: null + example: + - "Antibody Capture" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--verbose" + description: "Increase verbosity" + info: null + direction: "input" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Filter a 10x h5 dataset.\n" +usage: "filter_10xh5 \\\n --input pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \\\n\ + \ --output pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 \\\n --min_library_size\ + \ 1000 --min_cells_per_gene 300\n" +test_resources: +- type: "r_script" + path: "run_test.R" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "eddelbuettel/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "PIP_BREAK_SYSTEM_PACKAGES=1" + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "python" + user: true + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "r" + cran: + - "testthat" + - "anndata" + - "hdf5r" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/process_10xh5/filter_10xh5/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/process_10xh5/filter_10xh5" + executable: "target/nextflow/process_10xh5/filter_10xh5/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/process_10xh5/filter_10xh5/main.nf b/target/nextflow/process_10xh5/filter_10xh5/main.nf new file mode 100644 index 00000000..b5d2367f --- /dev/null +++ b/target/nextflow/process_10xh5/filter_10xh5/main.nf @@ -0,0 +1,4012 @@ +// filter_10xh5 main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "filter_10xh5", + "namespace" : "process_10xh5", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "An h5 file from the 10x genomics website.", + "example" : [ + "pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output h5 file.", + "example" : [ + "pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_library_size", + "description" : "Minimum library size.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_per_gene", + "description" : "Minimum number of cells per gene.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--keep_feature_types", + "description" : "Specify which feature types will never be filtered out", + "example" : [ + "Antibody Capture" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--verbose", + "description" : "Increase verbosity", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Filter a 10x h5 dataset.\n", + "usage" : "filter_10xh5 \\\\\n --input pbmc_1k_protein_v3_raw_feature_bc_matrix.h5 \\\\\n --output pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5 \\\\\n --min_library_size 1000 --min_cells_per_gene 300\n", + "test_resources" : [ + { + "type" : "r_script", + "path" : "run_test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "eddelbuettel/r2u:24.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "env" : [ + "PIP_BREAK_SYSTEM_PACKAGES=1", + "RETICULATE_PYTHON=/usr/bin/python" + ] + }, + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "python3-pip", + "python3-dev", + "python-is-python3" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : true, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "r", + "cran" : [ + "testthat", + "anndata", + "hdf5r" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/process_10xh5/filter_10xh5/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/process_10xh5/filter_10xh5", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "min_library_size" = $( if [ ! -z ${VIASH_PAR_MIN_LIBRARY_SIZE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_LIBRARY_SIZE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "min_cells_per_gene" = $( if [ ! -z ${VIASH_PAR_MIN_CELLS_PER_GENE+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_PAR_MIN_CELLS_PER_GENE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "keep_feature_types" = $( if [ ! -z ${VIASH_PAR_KEEP_FEATURE_TYPES+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_KEEP_FEATURE_TYPES" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "verbose" = $( if [ ! -z ${VIASH_PAR_VERBOSE+x} ]; then echo -n "as.logical(toupper('"; echo -n "$VIASH_PAR_VERBOSE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'))"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +if (par\\$verbose) cat("Loading dependencies\\\\n") +requireNamespace("hdf5r", quietly = TRUE) + +if (par\\$verbose) cat("Opening h5 file\\\\n") +h5 <- hdf5r::H5File\\$new(par\\$input, mode = "r") + +if (par\\$verbose) cat("Reading data in memory\\\\n") +features__all_tag_keys <- h5[["matrix/features/_all_tag_keys"]][] + +features <- data.frame( + feature_type = h5[["matrix/features/feature_type"]][], + genome = h5[["matrix/features/genome"]][], + id = h5[["matrix/features/id"]][], + name = h5[["matrix/features/name"]][] +) + +mat <- Matrix::sparseMatrix( + i = h5[["matrix/indices"]][], + p = h5[["matrix/indptr"]][], + x = h5[["matrix/data"]][], + dims = h5[["matrix/shape"]][], + index1 = FALSE, + dimnames = list( + features\\$id, + h5[["matrix/barcodes"]][] + ) +) + +if (par\\$verbose) { + cat("Filtering out cells with library size < ", + par\\$min_library_size, "\\\\n", + sep = "" + ) +} +library_size <- Matrix::colSums(mat) +mat2 <- mat[, library_size >= par\\$min_library_size, drop = FALSE] + +if (par\\$verbose) { + cat("Filtering genes with num cells < ", + par\\$min_cells_per_gene, "\\\\n", + sep = "" + ) +} +num_cells <- Matrix::rowSums(mat2 > 0) +mat3 <- mat2[ + num_cells >= par\\$min_cells_per_gene | + features\\$feature_type %in% par\\$keep_feature_types, , + drop = FALSE +] +features2 <- features[match(rownames(mat3), features\\$id), , drop = FALSE] + +# helper fun +set_with_type <- function(path, value) { + orig_dtype <- h5[[path]]\\$get_type() + orig_chunk <- h5[[path]]\\$chunk_dims + if (is.na(orig_chunk)) orig_chunk <- "auto" + h5new\\$create_dataset(path, value, dtype = orig_dtype, chunk_dims = orig_chunk) +} + +# create new file +if (par\\$verbose) cat("Saving h5 file at '", par\\$output, "'\\\\n", sep = "") +h5new <- hdf5r::H5File\\$new(par\\$output, mode = "w") +zz <- h5new\\$create_group("matrix") +zz <- h5new\\$create_group("matrix/features") + +set_with_type("matrix/features/feature_type", features2\\$feature_type) +set_with_type("matrix/features/genome", features2\\$genome) +set_with_type("matrix/features/id", features2\\$id) +set_with_type("matrix/features/name", features2\\$name) +set_with_type("matrix/features/_all_tag_keys", features__all_tag_keys) +set_with_type("matrix/indices", mat3@i) +set_with_type("matrix/indptr", mat3@p) +set_with_type("matrix/data", as.integer(mat3@x)) +set_with_type("matrix/shape", dim(mat3)) +set_with_type("matrix/barcodes", colnames(mat3)) + +for (attname in hdf5r::h5attr_names(h5)) { + h5new\\$create_attr(attname, hdf5r::h5attr(h5, attname)) +} +h5new\\$close_all() +h5\\$close_all() +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/process_10xh5/filter_10xh5", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/process_10xh5/filter_10xh5/nextflow.config b/target/nextflow/process_10xh5/filter_10xh5/nextflow.config new file mode 100644 index 00000000..10e884ed --- /dev/null +++ b/target/nextflow/process_10xh5/filter_10xh5/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'process_10xh5/filter_10xh5' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Filter a 10x h5 dataset.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/process_10xh5/filter_10xh5/nextflow_labels.config b/target/nextflow/process_10xh5/filter_10xh5/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/process_10xh5/filter_10xh5/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/process_10xh5/filter_10xh5/nextflow_schema.json b/target/nextflow/process_10xh5/filter_10xh5/nextflow_schema.json new file mode 100644 index 00000000..e419eaef --- /dev/null +++ b/target/nextflow/process_10xh5/filter_10xh5/nextflow_schema.json @@ -0,0 +1,75 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "filter_10xh5", + "description": "Filter a 10x h5 dataset.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "An h5 file from the 10x genomics website.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"pbmc_1k_protein_v3_raw_feature_bc_matrix.h5\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5 file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5\"`, direction: `output`, example: `\"pbmc_1k_protein_v3_raw_feature_bc_matrix_filtered.h5\"`. ", + "default": "$id.$key.output.h5" + }, + "min_library_size": { + "type": "integer", + "description": "Minimum library size.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "min_cells_per_gene": { + "type": "integer", + "description": "Minimum number of cells per gene.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "keep_feature_types": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Specify which feature types will never be filtered out", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Antibody Capture\"]`. " + }, + "verbose": { + "type": "boolean", + "description": "Increase verbosity", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/.config.vsh.yaml b/target/nextflow/qc/calculate_atac_qc_metrics/.config.vsh.yaml new file mode 100644 index 00000000..b9b86648 --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/.config.vsh.yaml @@ -0,0 +1,332 @@ +name: "calculate_atac_qc_metrics" +namespace: "qc" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fragments_path" + description: "Path to the fragments file. If not provided and not present in the\ + \ input h5mu file,\nthe nucleosome signal and TSS enrichment score will not\ + \ be calculated.\n" + info: null + example: + - "fragments.tsv.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer at `.layers` to use for the calculation. If not provided,\ + \ `.X` is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_fragments_for_nucleosome_signal" + description: "Number of fragments to use per cell for nucleosome signal calculation.\n\ + Takes very long to calculate, for a test run lower value (e.g. 10e3) is recommended.\n\ + See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal\n\ + for more information\n" + info: null + default: + - 100000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--nuc_signal_threshold" + description: "Threshold for nucleosome signal. Cells with nucleosome signal above\ + \ this threshold\nwill be marked as low quality (\"NS_FAIL\"), otherwise they\ + \ will be marked \"NS_PASS\".\n" + info: null + default: + - 2.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_tss" + description: "Number of the transcription start sites to calculate TSS enrichment\ + \ score.\nSee https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment\n\ + for more information\n" + info: null + default: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--tss_threshold" + description: "Threshold for TSS enrichment score. Cells with TSS enrichment score\ + \ below this threshold\nwill be marked as low quality (\"TSS_FAIL\") otherwise\ + \ they will be marked as \"TSS_PASS\".\n" + info: null + default: + - 1.5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add basic ATAC quality control metrics to an .h5mu file.\n\nThe metrics\ + \ are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\ + \ slightly different names:\n\nObs metrics (name in this component -> name in scanpy):\n\ + \ - n_features_per_cell -> n_genes_by_counts\n - total_fragment_counts -> total_counts\n\ + \ \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "counts" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pkg-config" + - "libhdf5-dev" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + - "muon~=0.1.5" + - "pysam~=0.22.0" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/calculate_atac_qc_metrics/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/qc/calculate_atac_qc_metrics" + executable: "target/nextflow/qc/calculate_atac_qc_metrics/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/compress_h5mu.py b/target/nextflow/qc/calculate_atac_qc_metrics/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/main.nf b/target/nextflow/qc/calculate_atac_qc_metrics/main.nf new file mode 100644 index 00000000..bd29b02b --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/main.nf @@ -0,0 +1,4073 @@ +// calculate_atac_qc_metrics main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "calculate_atac_qc_metrics", + "namespace" : "qc", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--fragments_path", + "description" : "Path to the fragments file. If not provided and not present in the input h5mu file,\nthe nucleosome signal and TSS enrichment score will not be calculated.\n", + "example" : [ + "fragments.tsv.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "atac" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Layer at `.layers` to use for the calculation. If not provided, `.X` is used.\n", + "example" : [ + "raw_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_fragments_for_nucleosome_signal", + "description" : "Number of fragments to use per cell for nucleosome signal calculation.\nTakes very long to calculate, for a test run lower value (e.g. 10e3) is recommended.\nSee https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal\nfor more information\n", + "default" : [ + 100000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--nuc_signal_threshold", + "description" : "Threshold for nucleosome signal. Cells with nucleosome signal above this threshold\nwill be marked as low quality (\\"NS_FAIL\\"), otherwise they will be marked \\"NS_PASS\\".\n", + "default" : [ + 2.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_tss", + "description" : "Number of the transcription start sites to calculate TSS enrichment score.\nSee https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment\nfor more information\n", + "default" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--tss_threshold", + "description" : "Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold\nwill be marked as low quality (\\"TSS_FAIL\\") otherwise they will be marked as \\"TSS_PASS\\".\n", + "default" : [ + 1.5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Add basic ATAC quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nObs metrics (name in this component -> name in scanpy):\n - n_features_per_cell -> n_genes_by_counts\n - total_fragment_counts -> total_counts\n \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_atac_tiny_bcl/counts/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "pkg-config", + "libhdf5-dev", + "gcc" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy~=1.10.4", + "muon~=0.1.5", + "pysam~=0.22.0" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/qc/calculate_atac_qc_metrics/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/qc/calculate_atac_qc_metrics", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys + +import scanpy as sc +import muon as mu +from muon import atac as ac # the module containing function for scATAC data processing +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'fragments_path': $( if [ ! -z ${VIASH_PAR_FRAGMENTS_PATH+x} ]; then echo "r'${VIASH_PAR_FRAGMENTS_PATH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_fragments_for_nucleosome_signal': $( if [ ! -z ${VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL+x} ]; then echo "int(r'${VIASH_PAR_N_FRAGMENTS_FOR_NUCLEOSOME_SIGNAL//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'nuc_signal_threshold': $( if [ ! -z ${VIASH_PAR_NUC_SIGNAL_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_NUC_SIGNAL_THRESHOLD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_tss': $( if [ ! -z ${VIASH_PAR_N_TSS+x} ]; then echo "int(r'${VIASH_PAR_N_TSS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'tss_threshold': $( if [ ! -z ${VIASH_PAR_TSS_THRESHOLD+x} ]; then echo "float(r'${VIASH_PAR_TSS_THRESHOLD//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info("Reading input data") + atac = mu.read_h5ad(par["input"], mod=par["modality"]) + + logger.info("Checking if QC columns are already calculated") + for col in ("n_features_per_cell", "total_fragment_counts"): + if col in atac.obs: + logger.warning(f"{col} is already in atac.obs, dropping") + atac.obs.drop(col, axis=1, inplace=True) + + logger.info("Calculating QC metrics") + sc.pp.calculate_qc_metrics( + atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"] + ) + + logger.debug("Putting QC columns to ATAC adata") + atac.obs.rename( + columns={ + "n_genes_by_counts": "n_features_per_cell", + "total_counts": "total_fragment_counts", + }, + inplace=True, + ) + + logger.debug("Adding log-transformed total fragment counts") + # log-transform total counts and add as column + atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"]) + + if par["fragments_path"] is not None: + logger.info("Trying to locate frafments") + ac.tl.locate_fragments(atac, fragments=par["fragments_path"]) + else: + logger.info("Skipping fragment location: \\`fragments_path\\` is not set") + + # Calculate the nucleosome signal across cells + if "files" in atac.uns and "fragments" in atac.uns["files"]: + logger.info("Trying to calculate nucleosome signal") + ac.tl.nucleosome_signal( + atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs + ) + atac.obs["nuc_signal_filter"] = [ + "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS" + for ns in atac.obs["nucleosome_signal"] + ] + else: + logger.info( + "Skipping nucleosome signal calculation: fragments information is not found" + ) + + # If interval information is available, calculate TSS enrichment + if "peak_annotation" in atac.uns.keys(): + tss = ac.tl.tss_enrichment( + atac, + features=atac.uns["peak_annotation"], + n_tss=par["n_tss"], + random_state=666, + ) + + tss.obs["tss_filter"] = [ + "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS" + for score in atac.obs["tss_score"] + ] + else: + logger.info( + "Skipping TSS enrichment calculation: genes intervals are not found" + ) + + logger.info("Writing output") + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], atac, par["output_compression"] + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/qc/calculate_atac_qc_metrics", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/nextflow.config b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow.config new file mode 100644 index 00000000..260fdbfa --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'qc/calculate_atac_qc_metrics' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Add basic ATAC quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nObs metrics (name in this component -> name in scanpy):\n - n_features_per_cell -> n_genes_by_counts\n - total_fragment_counts -> total_counts\n \n' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_labels.config b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_schema.json b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_schema.json new file mode 100644 index 00000000..023fcd75 --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/nextflow_schema.json @@ -0,0 +1,109 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "calculate_atac_qc_metrics", + "description": "Add basic ATAC quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nObs metrics (name in this component -> name in scanpy):\n - n_features_per_cell -> n_genes_by_counts\n - total_fragment_counts -> total_counts\n \n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "fragments_path": { + "type": "string", + "format": "path", + "description": "Path to the fragments file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"fragments.tsv.gz\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"atac\"`. ", + "default": "atac" + }, + "layer": { + "type": "string", + "description": "Layer at `.layers` to use for the calculation", + "help_text": "Type: `string`, multiple: `False`, example: `\"raw_counts\"`. " + }, + "n_fragments_for_nucleosome_signal": { + "type": "integer", + "description": "Number of fragments to use per cell for nucleosome signal calculation.\nTakes very long to calculate, for a test run lower value (e.g", + "help_text": "Type: `integer`, multiple: `False`, default: `100000`. ", + "default": 100000 + }, + "nuc_signal_threshold": { + "type": "number", + "description": "Threshold for nucleosome signal", + "help_text": "Type: `double`, multiple: `False`, default: `2.0`. ", + "default": 2.0 + }, + "n_tss": { + "type": "integer", + "description": "Number of the transcription start sites to calculate TSS enrichment score.\nSee https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment\nfor more information\n", + "help_text": "Type: `integer`, multiple: `False`, default: `3000`. ", + "default": 3000 + }, + "tss_threshold": { + "type": "number", + "description": "Threshold for TSS enrichment score", + "help_text": "Type: `double`, multiple: `False`, default: `1.5`. ", + "default": 1.5 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/qc/calculate_atac_qc_metrics/setup_logger.py b/target/nextflow/qc/calculate_atac_qc_metrics/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/qc/calculate_atac_qc_metrics/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/qc/calculate_qc_metrics/.config.vsh.yaml b/target/nextflow/qc/calculate_qc_metrics/.config.vsh.yaml new file mode 100644 index 00000000..50129a94 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/.config.vsh.yaml @@ -0,0 +1,389 @@ +name: "calculate_qc_metrics" +namespace: "qc" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process. \n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer from modality to use as input data. If not provided the .X\ + \ attribute is used.\n" + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metrics added to .obs" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes.\n" + info: null + example: + - "ercc,highly_variable,mitochondrial" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean" + name: "--var_qc_metrics_fill_na_value" + description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\ + \ to 'True' or 'False'.\nas False.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_num_nonzero_vars" + description: "Name of column in .obs describing, for each observation, the number\ + \ of stored values\n(including explicit zeroes). In other words, the name of\ + \ the column that counts\nfor each row the number of columns that contain data.\n" + info: null + default: + - "num_nonzero_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_total_counts_vars" + description: "Name of the column for .obs describing, for each observation (row),\n\ + the sum of the stored values in the columns.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Metrics added to .var" + arguments: + - type: "string" + name: "--output_var_num_nonzero_obs" + description: "Name of column describing, for each feature, the number of stored\ + \ values\n(including explicit zeroes). In other words, the name of the column\ + \ that counts\nfor each column the number of rows that contain data.\n" + info: null + default: + - "num_nonzero_obs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_total_counts_obs" + description: "Name of the column in .var describing, for each feature (column),\n\ + the sum of the stored values in the rows.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_obs_mean" + description: "Name of the column in .obs providing the mean of the values in each\ + \ row.\n" + info: null + default: + - "obs_mean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_pct_dropout" + description: "Name of the column in .obs providing for each feature the percentage\ + \ of\nobservations the feature does not appear on (i.e. is missing). Same as\ + \ `--num_nonzero_obs`\nbut percentage based.\n" + info: null + default: + - "pct_dropout" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\ + \ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\ + \ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\ + \ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\ + \ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\ + \ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\ + \ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\ + \ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\ + \ - total_counts -> total_{expr_type}\n \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scipy" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "scanpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/calculate_qc_metrics/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/qc/calculate_qc_metrics" + executable: "target/nextflow/qc/calculate_qc_metrics/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/qc/calculate_qc_metrics/compress_h5mu.py b/target/nextflow/qc/calculate_qc_metrics/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/qc/calculate_qc_metrics/main.nf b/target/nextflow/qc/calculate_qc_metrics/main.nf new file mode 100644 index 00000000..faa56375 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/main.nf @@ -0,0 +1,4233 @@ +// calculate_qc_metrics main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "calculate_qc_metrics", + "namespace" : "qc", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process. \n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Layer from modality to use as input data. If not provided the .X attribute is used.\n", + "example" : [ + "raw_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Metrics added to .obs", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "example" : [ + "ercc,highly_variable,mitochondrial" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--var_qc_metrics_fill_na_value", + "description" : "Fill any 'NA' values found in the columns specified with --var_qc_metrics to 'True' or 'False'.\nas False.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20;50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_num_nonzero_vars", + "description" : "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n", + "default" : [ + "num_nonzero_vars" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_total_counts_vars", + "description" : "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Metrics added to .var", + "arguments" : [ + { + "type" : "string", + "name" : "--output_var_num_nonzero_obs", + "description" : "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n", + "default" : [ + "num_nonzero_obs" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_total_counts_obs", + "description" : "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_obs_mean", + "description" : "Name of the column in .obs providing the mean of the values in each row.\n", + "default" : [ + "obs_mean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_pct_dropout", + "description" : "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--num_nonzero_obs`\nbut percentage based.\n", + "default" : [ + "pct_dropout" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scipy" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/qc/calculate_qc_metrics/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/qc/calculate_qc_metrics", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from mudata import read_h5ad +from scipy.sparse import issparse, csr_array +import numpy as np + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_qc_metrics': $( if [ ! -z ${VIASH_PAR_VAR_QC_METRICS+x} ]; then echo "r'${VIASH_PAR_VAR_QC_METRICS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'var_qc_metrics_fill_na_value': $( if [ ! -z ${VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE+x} ]; then echo "r'${VIASH_PAR_VAR_QC_METRICS_FILL_NA_VALUE//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'top_n_vars': $( if [ ! -z ${VIASH_PAR_TOP_N_VARS+x} ]; then echo "list(map(int, r'${VIASH_PAR_TOP_N_VARS//\\'/\\'\\"\\'\\"r\\'}'.split(';')))"; else echo None; fi ), + 'output_obs_num_nonzero_vars': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_NUM_NONZERO_VARS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_total_counts_vars': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_TOTAL_COUNTS_VARS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_num_nonzero_obs': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_NUM_NONZERO_OBS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_total_counts_obs': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_TOTAL_COUNTS_OBS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_obs_mean': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_OBS_MEAN+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_OBS_MEAN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_pct_dropout': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_PCT_DROPOUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def count_nonzero(layer, axis): + """ + This method is the functional equivalent of the old .getnnz function from scirpy, + but that function was deprecated. So we use the nonzero function to mimic the old + behavior. + """ + axis ^= 1 + nonzero_counts = dict(zip(*np.unique(layer.nonzero()[axis], return_counts=True))) + nonzero_per_axis_item = { + row_index: nonzero_counts.get(row_index, 0) + for row_index in range(layer.shape[axis]) + } + return np.array(list(nonzero_per_axis_item.values()), dtype="int64") + + +def main(): + modality_data = read_h5ad(par["input"], mod=par["modality"]) + var = modality_data.var + layer = modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] + if not issparse(layer): + raise NotImplementedError("Expected layer to be in sparse format.") + layer = csr_array(layer) + layer.eliminate_zeros() + + var_columns_to_add = {} + + # var statistics + if par["output_var_obs_mean"]: + obs_mean = layer.mean(axis=0) + var_columns_to_add[par["output_var_obs_mean"]] = obs_mean + if par["output_var_total_counts_obs"]: + # from the np.sum documentation: + # Especially when summing a large number of lower precision floating point numbers, + # such as float32, numerical errors can become significant. In such cases it can + # be advisable to use dtype="float64" to use a higher precision for the output. + layer_with_type = layer + if np.issubdtype(layer.dtype, np.floating) and np.can_cast( + layer.dtype, np.float64, casting="safe" + ): + # 'safe' casting makes sure not to cast np.float128 or anything else to a lower precision dtype + layer_with_type = layer.astype(np.float64) + total_counts_obs = np.ravel(layer_with_type.sum(axis=0)) + var_columns_to_add[par["output_var_total_counts_obs"]] = total_counts_obs + + num_nonzero_obs = count_nonzero(layer, axis=0) + if par["output_var_num_nonzero_obs"]: + var_columns_to_add[par["output_var_num_nonzero_obs"]] = num_nonzero_obs + if par["output_var_pct_dropout"]: + var_columns_to_add[par["output_var_pct_dropout"]] = ( + 1 - num_nonzero_obs / layer.shape[0] + ) * 100 + + modality_data.var = modality_data.var.assign(**var_columns_to_add) + + # obs statistics + obs_columns_to_add = {} + total_counts_var = np.ravel(layer.sum(axis=1)) + + if par["output_obs_num_nonzero_vars"]: + num_nonzero_vars = count_nonzero(layer, axis=1) + obs_columns_to_add[par["output_obs_num_nonzero_vars"]] = num_nonzero_vars + + if par["output_obs_total_counts_vars"]: + obs_columns_to_add[par["output_obs_total_counts_vars"]] = total_counts_var + + top_metrics = {} + if par["top_n_vars"]: + par["top_n_vars"] = sorted(par["top_n_vars"]) + distributions = get_top_from_csr_matrix(layer, par["top_n_vars"]) + top_metrics = { + distribution_size: distribution * 100 + for distribution_size, distribution in zip( + par["top_n_vars"], distributions.T + ) + } + obs_columns_to_add |= { + f"pct_of_counts_in_top_{n_top}_vars": col + for n_top, col in top_metrics.items() + } + + if par["var_qc_metrics"]: + print(f"qc_metrics: {par['var_qc_metrics']}") + for qc_metric in par["var_qc_metrics"]: + if qc_metric not in var: + raise ValueError( + f"Value for --var_qc_metrics, '{qc_metric}' " + f"not found in .var for modality {par['modality']}" + ) + qc_column = var[qc_metric] + if qc_column.isna().any(): + if par["var_qc_metrics_fill_na_value"] is None: + raise ValueError( + f"The .var column '{qc_metric}', selected by '--var_qc_metrics', contains NA values. " + "It is ambiguous whether or not to include these values in the static calulation. " + "You can explicitly map the NA values to 'False' or 'True using '--var_qc_metrics_fill_na_value'" + ) + else: + qc_column = qc_column.fillna( + par["var_qc_metrics_fill_na_value"], inplace=False + ) + qc_column = qc_column.to_list() + if set(np.unique(qc_column)) - {True, False}: + raise ValueError( + f"Column {qc_metric} in .var for modality {par['modality']} " + f"must only contain boolean values" + ) + total_counts_qc_metric = np.ravel(layer[:, qc_column].sum(axis=1)) + obs_columns_to_add |= { + f"total_counts_{qc_metric}": total_counts_qc_metric, + f"pct_{qc_metric}": total_counts_qc_metric / total_counts_var * 100, + } + + modality_data.obs = modality_data.obs.assign(**obs_columns_to_add) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], + ) + + +def get_top_from_csr_matrix(array, top_n_genes): + # csr matrices stores a 3D matrix in a format such that data for individual cells + # are stored in 1 array. Another array (indptr) here stores the ranges of indices + # to select from the data-array (.e.g. data[indptr[0]:indptr[1]] for row 0) for each row. + # Another array 'indices' maps each element of data to a column + # (data and indices arrays have the same length) + top_n_genes = np.array(top_n_genes).astype(np.int64) + assert np.all(top_n_genes[:-1] <= top_n_genes[1:]), "top_n_genes must be sorted" + row_indices, data = array.indptr, array.data + number_of_rows, max_genes_to_parse = row_indices.size - 1, top_n_genes[-1] + top_data = np.zeros((number_of_rows, max_genes_to_parse), dtype=data.dtype) + # Loop over each row to create a dense matrix without the 0 counts, + # but not for the whole matrix, only store the genes up until + # the largest number of top n genes. + for row_number in range(number_of_rows): + row_start_index, row_end_index = ( + row_indices[row_number], + row_indices[row_number + 1], + ) + row_data = data[row_start_index:row_end_index] # all non-zero counts for an row + try: + # There are less genes with counts in the row than the + # maximum number of genes we would like to select + # all these genes are in the top genes, just store them + top_data[row_number, : row_end_index - row_start_index] = row_data + except ValueError: + # Store the counts for the top genes + top_data[row_number, :] = np.partition(row_data, -max_genes_to_parse)[ + -max_genes_to_parse: + ] + + # Partition works from smallest to largest, but we want largest + # so do smallest to largest first (but with reversed indices) + top_data = np.partition(top_data, max_genes_to_parse - top_n_genes) + # And then switch the order around + top_data = np.flip(top_data, axis=1) + + cumulative = top_data.cumsum(axis=1, dtype=np.float64)[:, top_n_genes - 1] + return cumulative / np.expand_dims(array.sum(axis=1), 1) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/qc/calculate_qc_metrics", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/qc/calculate_qc_metrics/nextflow.config b/target/nextflow/qc/calculate_qc_metrics/nextflow.config new file mode 100644 index 00000000..4ffcd705 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'qc/calculate_qc_metrics' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/qc/calculate_qc_metrics/nextflow_labels.config b/target/nextflow/qc/calculate_qc_metrics/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/qc/calculate_qc_metrics/nextflow_schema.json b/target/nextflow/qc/calculate_qc_metrics/nextflow_schema.json new file mode 100644 index 00000000..83f9504f --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/nextflow_schema.json @@ -0,0 +1,156 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "calculate_qc_metrics", + "description": "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Layer from modality to use as input data", + "help_text": "Type: `string`, multiple: `False`, example: `\"raw_counts\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "metrics added to .obs": { + "title": "Metrics added to .obs", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"ercc,highly_variable,mitochondrial\"]`. " + }, + "var_qc_metrics_fill_na_value": { + "type": "boolean", + "description": "Fill any 'NA' values found in the columns specified with --var_qc_metrics to 'True' or 'False'.\nas False.\n", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`. " + }, + "output_obs_num_nonzero_vars": { + "type": "string", + "description": "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_vars\"`. ", + "default": "num_nonzero_vars" + }, + "output_obs_total_counts_vars": { + "type": "string", + "description": "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + } + } + }, + "metrics added to .var": { + "title": "Metrics added to .var", + "type": "object", + "description": "No description", + "properties": { + "output_var_num_nonzero_obs": { + "type": "string", + "description": "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_obs\"`. ", + "default": "num_nonzero_obs" + }, + "output_var_total_counts_obs": { + "type": "string", + "description": "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_obs_mean": { + "type": "string", + "description": "Name of the column in .obs providing the mean of the values in each row.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"obs_mean\"`. ", + "default": "obs_mean" + }, + "output_var_pct_dropout": { + "type": "string", + "description": "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e", + "help_text": "Type: `string`, multiple: `False`, default: `\"pct_dropout\"`. ", + "default": "pct_dropout" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/metrics added to .obs" + }, + { + "$ref": "#/$defs/metrics added to .var" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/qc/calculate_qc_metrics/setup_logger.py b/target/nextflow/qc/calculate_qc_metrics/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/qc/calculate_qc_metrics/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/qc/fastqc/.config.vsh.yaml b/target/nextflow/qc/fastqc/.config.vsh.yaml new file mode 100644 index 00000000..4e777430 --- /dev/null +++ b/target/nextflow/qc/fastqc/.config.vsh.yaml @@ -0,0 +1,225 @@ +name: "fastqc" +namespace: "qc" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--mode" + alternatives: + - "-m" + description: "The mode in which the component works. Can be either files or dir." + info: null + default: + - "files" + required: false + choices: + - "files" + - "dir" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Directory containing input fastq files." + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output directory to write reports to." + info: null + example: + - "qc" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--threads" + alternatives: + - "-t" + description: "Specifies the number of files which can be processed simultaneously.\ + \ Each thread will be allocated 250MB of\nmemory.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.\ + \ This component can take one or more files (by means of shell globbing) or a complete\ + \ directory.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "fastqc" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/fastqc/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/qc/fastqc" + executable: "target/nextflow/qc/fastqc/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/qc/fastqc/main.nf b/target/nextflow/qc/fastqc/main.nf new file mode 100644 index 00000000..38bd9067 --- /dev/null +++ b/target/nextflow/qc/fastqc/main.nf @@ -0,0 +1,3853 @@ +// fastqc main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "fastqc", + "namespace" : "qc", + "version" : "main", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--mode", + "alternatives" : [ + "-m" + ], + "description" : "The mode in which the component works. Can be either files or dir.", + "default" : [ + "files" + ], + "required" : false, + "choices" : [ + "files", + "dir" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Directory containing input fastq files.", + "example" : [ + "fastq_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output directory to write reports to.", + "example" : [ + "qc" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--threads", + "alternatives" : [ + "-t" + ], + "description" : "Specifies the number of files which can be processed simultaneously. Each thread will be allocated 250MB of\nmemory.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. This component can take one or more files (by means of shell globbing) or a complete directory.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "fastqc" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/qc/fastqc/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/qc/fastqc", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_MODE+x} ]; then echo "${VIASH_PAR_MODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_mode='&'#" ; else echo "# par_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "${VIASH_PAR_THREADS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_threads='&'#" ; else echo "# par_threads="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +mkdir -p "\\$par_output" + +if [ "\\$par_mode" == "dir" ]; then + par_input="\\$par_input/*.fastq.gz" +fi + +eval fastqc \\${par_threads:+--threads \\$par_threads} -o "\\$par_output" "\\$par_input" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/qc/fastqc", + "tag" : "main" + }, + "label" : [ + "lowcpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/qc/fastqc/nextflow.config b/target/nextflow/qc/fastqc/nextflow.config new file mode 100644 index 00000000..8a648b55 --- /dev/null +++ b/target/nextflow/qc/fastqc/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'qc/fastqc' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. This component can take one or more files (by means of shell globbing) or a complete directory.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/qc/fastqc/nextflow_labels.config b/target/nextflow/qc/fastqc/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/qc/fastqc/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/qc/fastqc/nextflow_schema.json b/target/nextflow/qc/fastqc/nextflow_schema.json new file mode 100644 index 00000000..607e6f22 --- /dev/null +++ b/target/nextflow/qc/fastqc/nextflow_schema.json @@ -0,0 +1,64 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "fastqc", + "description": "Fastqc component, please see https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. This component can take one or more files (by means of shell globbing) or a complete directory.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "mode": { + "type": "string", + "description": "The mode in which the component works", + "help_text": "Type: `string`, multiple: `False`, default: `\"files\"`, choices: ``files`, `dir``. ", + "enum": [ + "files", + "dir" + ], + "default": "files" + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Directory containing input fastq files.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"fastq_dir\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output directory to write reports to.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"qc\"`. ", + "default": "$id.$key.output" + }, + "threads": { + "type": "integer", + "description": "Specifies the number of files which can be processed simultaneously", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/qc/multiqc/.config.vsh.yaml b/target/nextflow/qc/multiqc/.config.vsh.yaml new file mode 100644 index 00000000..8060bff7 --- /dev/null +++ b/target/nextflow/qc/multiqc/.config.vsh.yaml @@ -0,0 +1,211 @@ +name: "multiqc" +namespace: "qc" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Inputs for MultiQC." + info: null + example: + - "input.txt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Create report in the specified output directory." + info: null + example: + - "report" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "MultiQC aggregates results from bioinformatics analyses across many\ + \ samples into a single report.\nIt searches a given directory for analysis logs\ + \ and compiles a HTML report. It's a general use tool, perfect for summarising the\ + \ output from numerous bioinformatics tools.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "fastqc" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "multiqc" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/qc/multiqc/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/qc/multiqc" + executable: "target/nextflow/qc/multiqc/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/qc/multiqc/main.nf b/target/nextflow/qc/multiqc/main.nf new file mode 100644 index 00000000..c818afc4 --- /dev/null +++ b/target/nextflow/qc/multiqc/main.nf @@ -0,0 +1,3839 @@ +// multiqc main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "multiqc", + "namespace" : "qc", + "version" : "main", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Inputs for MultiQC.", + "example" : [ + "input.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Create report in the specified output directory.", + "example" : [ + "report" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "MultiQC aggregates results from bioinformatics analyses across many samples into a single report.\nIt searches a given directory for analysis logs and compiles a HTML report. It's a general use tool, perfect for summarising the output from numerous bioinformatics tools.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv/fastqc/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "multiqc" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/qc/multiqc/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/qc/multiqc", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import subprocess + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +# Run MultiQC +subprocess.run(["multiqc", "-o", par["output"]] + par["input"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/qc/multiqc", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/qc/multiqc/nextflow.config b/target/nextflow/qc/multiqc/nextflow.config new file mode 100644 index 00000000..9e406645 --- /dev/null +++ b/target/nextflow/qc/multiqc/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'qc/multiqc' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'MultiQC aggregates results from bioinformatics analyses across many samples into a single report.\nIt searches a given directory for analysis logs and compiles a HTML report. It\'s a general use tool, perfect for summarising the output from numerous bioinformatics tools.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/qc/multiqc/nextflow_labels.config b/target/nextflow/qc/multiqc/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/qc/multiqc/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/qc/multiqc/nextflow_schema.json b/target/nextflow/qc/multiqc/nextflow_schema.json new file mode 100644 index 00000000..51133994 --- /dev/null +++ b/target/nextflow/qc/multiqc/nextflow_schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "multiqc", + "description": "MultiQC aggregates results from bioinformatics analyses across many samples into a single report.\nIt searches a given directory for analysis logs and compiles a HTML report. It's a general use tool, perfect for summarising the output from numerous bioinformatics tools.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Inputs for MultiQC.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"input.txt\"]`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Create report in the specified output directory.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"report\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/query/cellxgene_census/.config.vsh.yaml b/target/nextflow/query/cellxgene_census/.config.vsh.yaml new file mode 100644 index 00000000..4fbc553f --- /dev/null +++ b/target/nextflow/query/cellxgene_census/.config.vsh.yaml @@ -0,0 +1,586 @@ +name: "cellxgene_census" +namespace: "query" +version: "main" +authors: +- name: "Matthias Beyens" + roles: + - "maintainer" + - "author" + info: + role: "Contributor" + links: + github: "MatthiasBeyens" + orcid: "0000-0003-3304-0706" + email: "matthias.beyens@gmail.com" + linkedin: "mbeyens" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Kai Waldrant" + roles: + - "contributor" + info: + role: "Contributor" + links: + email: "kai@data-intuitive.com" + github: "KaiWaldrant" + orcid: "0009-0003-8555-1361" + linkedin: "kaiwaldrant" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatician" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Contributor" +argument_groups: +- name: "Input database" + description: "Open CellxGene Census by version or URI." + arguments: + - type: "string" + name: "--input_uri" + description: "If specified, a URI containing the Census SOMA objects. If specified,\ + \ will take precedence over the `--census_version` argument." + info: null + example: + - "s3://bucket/path" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--census_version" + description: "Which release of CellxGene census to use. Possible values are \"\ + latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"\ + ). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + info: null + example: + - "stable" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--add_dataset_metadata" + description: "If true, the experiment metadata will be added to the cell metadata.\ + \ More specifically: `collection_id`, `collection_name`, `collection_doi`, `dataset_title`." + info: null + direction: "input" +- name: "Cell query" + description: "Arguments related to the query." + arguments: + - type: "string" + name: "--species" + description: "The organism to query, usually one of `Homo sapiens` or `Mus musculus`." + info: null + example: + - "homo_sapiens" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_value_filter" + description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a\ + \ filter query written in the SOMA `value_filter` syntax." + info: null + example: + - "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311',\ + \ 'CL:0002616'] and suspension_type == 'cell'" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Filter cells by grouping" + description: "Filter groups with fewer than X number of cells." + arguments: + - type: "string" + name: "--cell_filter_grouping" + description: "A subset of 'obs' columns by which to group the cells for filtering.\n\ + Only groups surpassing or equal to the `--cell_filter_minimum_count`\nthreshold\ + \ will be retained. Take care not to introduce a selection\nbias against cells\ + \ with more fine-grained ontology annotations.\n" + info: null + example: + - "dataset_id" + - "tissue" + - "assay" + - "disease" + - "cell_type" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--cell_filter_minimum_count" + description: "A minimum number of cells per group to retain. If `--cell_filter_grouping`\n\ + is defined, this parameter should also be provided and vice versa.\n" + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Count filtering" + description: "Arguments related to filtering cells and genes by counts." + arguments: + - type: "integer" + name: "--cell_filter_min_genes" + description: "Remove cells with less than this number of genes." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--cell_filter_min_counts" + description: "Remove cells with less than this number of counts." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gene_filter_min_cells" + description: "Remove genes expressed in less than this number of cells." + info: null + default: + - 5 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gene_filter_min_counts" + description: "Remove genes with less than this number of counts." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + description: "Output arguments." + arguments: + - type: "file" + name: "--output" + description: "Output h5mu file." + info: + label: "CellxGene dataset" + summary: "A dataset queried from the CellxGene Census platform" + description: "The format of this file is derived from the [CELLxGENE schema\ + \ v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).\n" + slots: + mod: + - name: "rna" + layers: + - type: "integer" + name: "counts" + description: "Raw counts" + required: true + obs: + - type: "string" + name: "dataset_id" + description: "Identifier for the dataset from which the cell data is derived,\ + \ useful for tracking and referencing purposes." + required: false + - type: "string" + name: "assay" + description: "Type of assay used to generate the cell data, indicating\ + \ the methodology or technique employed." + required: true + - type: "string" + name: "assay_ontology_term_id" + description: "Experimental Factor Ontology (`EFO:`) term identifier for\ + \ the assay, providing a standardized reference to the assay type." + required: true + - type: "string" + name: "cell_type" + description: "Classification of the cell type based on its characteristics\ + \ and function within the tissue or organism." + required: true + - type: "string" + name: "cell_type_ontology_term_id" + description: "Cell Ontology (`CL:`) term identifier for the cell type,\ + \ offering a standardized reference to the specific cell classification." + required: true + - type: "string" + name: "development_stage" + description: "Stage of development of the organism or tissue from which\ + \ the cell is derived, indicating its maturity or developmental phase." + required: true + - type: "string" + name: "development_stage_ontology_term_id" + description: "Ontology term identifier for the developmental stage, providing\ + \ a standardized reference to the organism's developmental phase.\n\n\ + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`),\ + \ then the Human Developmental Stages (`HsapDv:`) ontology is used.\ + \ \nIf the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`),\ + \ then the Mouse Developmental Stages (`MmusDv:`) ontology is used.\n\ + Otherwise, the Uberon (`UBERON:`) ontology is used.\n" + required: true + - type: "string" + name: "disease" + description: "Information on any disease or pathological condition associated\ + \ with the cell or donor." + required: true + - type: "string" + name: "disease_ontology_term_id" + description: "Ontology term identifier for the disease, enabling standardized\ + \ disease classification and referencing.\n\nMust be a term from the\ + \ Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461`\ + \ from the Phenotype And Trait Ontology (`PATO:`).\n" + required: true + - type: "string" + name: "donor_id" + description: "Identifier for the donor from whom the cell sample is obtained." + required: true + - type: "boolean" + name: "is_primary_data" + description: "Indicates whether the data is primary (directly obtained\ + \ from experiments) or has been computationally derived from other primary\ + \ data." + required: true + - type: "string" + name: "organism" + description: "Organism from which the cell sample is obtained." + required: true + - type: "string" + name: "organism_ontology_term_id" + description: "Ontology term identifier for the organism, providing a standardized\ + \ reference for the organism.\n\nMust be a term from the NCBI Taxonomy\ + \ Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`.\n" + required: true + - type: "string" + name: "self_reported_ethnicity" + description: "Ethnicity of the donor as self-reported, relevant for studies\ + \ considering genetic diversity and population-specific traits." + required: true + - type: "string" + name: "self_reported_ethnicity_ontology_term_id" + description: "Ontology term identifier for the self-reported ethnicity,\ + \ providing a standardized reference for ethnic classifications.\n\n\ + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`),\ + \ then the Human Ancestry Ontology (`HANCESTRO:`) is used.\n" + required: true + - type: "string" + name: "sex" + description: "Biological sex of the donor or source organism, crucial\ + \ for studies involving sex-specific traits or conditions." + required: true + - type: "string" + name: "sex_ontology_term_id" + description: "Ontology term identifier for the biological sex, ensuring\ + \ standardized classification of sex. Only `PATO:0000383`, `PATO:0000384`\ + \ and `PATO:0001340` are allowed." + required: true + - type: "string" + name: "suspension_type" + description: "Type of suspension or medium in which the cells were stored\ + \ or processed, important for understanding cell handling and conditions." + required: true + - type: "string" + name: "tissue" + description: "Specific tissue from which the cells were derived, key for\ + \ context and specificity in cell studies." + required: true + - type: "string" + name: "tissue_ontology_term_id" + description: "Ontology term identifier for the tissue, providing a standardized\ + \ reference for the tissue type.\n\nFor organoid or tissue samples,\ + \ the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be\ + \ a child term of `UBERON:0001062` (anatomical entity).\nFor cell cultures,\ + \ the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`,\ + \ `CL:0000257` or `CL:0000548`.\n" + required: true + - type: "string" + name: "tissue_general" + description: "General category or classification of the tissue, useful\ + \ for broader grouping and comparison of cell data." + required: true + - type: "string" + name: "tissue_general_ontology_term_id" + description: "Ontology term identifier for the general tissue category,\ + \ aiding in standardizing and grouping tissue types.\n\nFor organoid\ + \ or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used.\ + \ The term ids must be a child term of `UBERON:0001062` (anatomical\ + \ entity).\nFor cell cultures, the Cell Ontology (`CL:`) is used. The\ + \ term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`.\n" + required: true + - type: "integer" + name: "soma_joinid" + description: "If the dataset was retrieved from CELLxGENE census, this\ + \ is a unique identifier for the cell." + required: true + var: + - type: "string" + name: "feature_id" + description: "Unique identifier for the feature, usually a ENSEMBL gene\ + \ id." + required: true + - type: "string" + name: "feature_name" + description: "A human-readable name for the feature, usually a gene symbol." + required: true + - type: "integer" + name: "soma_joinid" + description: "If the dataset was retrieved from CELLxGENE census, this\ + \ is a unique identifier for the feature." + required: true + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_modality" + description: "Which modality to store the output in." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer_counts" + description: "Which layer to store the raw counts in. If not provided, the .X\ + \ layer will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Query cells from a CellxGene Census or custom TileDBSoma object.\nAside\ + \ from fetching the cells' RNA counts (`.X`), cell metadata\n(`.obs`) and gene metadata\ + \ (`.var`), this component also fetches\nthe dataset metadata and joins it into\ + \ the cell metadata.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "cellxgene-census" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/query/cellxgene_census/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/query/cellxgene_census" + executable: "target/nextflow/query/cellxgene_census/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/query/cellxgene_census/main.nf b/target/nextflow/query/cellxgene_census/main.nf new file mode 100644 index 00000000..16873b4a --- /dev/null +++ b/target/nextflow/query/cellxgene_census/main.nf @@ -0,0 +1,4481 @@ +// cellxgene_census main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Matthias Beyens (maintainer, author) +// * Dries De Maeyer (author) +// * Robrecht Cannoodt (author) +// * Kai Waldrant (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellxgene_census", + "namespace" : "query", + "version" : "main", + "authors" : [ + { + "name" : "Matthias Beyens", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "MatthiasBeyens", + "orcid" : "0000-0003-3304-0706", + "email" : "matthias.beyens@gmail.com", + "linkedin" : "mbeyens" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Kai Waldrant", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "kai@data-intuitive.com", + "github" : "KaiWaldrant", + "orcid" : "0009-0003-8555-1361", + "linkedin" : "kaiwaldrant" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatician" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Contributor" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input database", + "description" : "Open CellxGene Census by version or URI.", + "arguments" : [ + { + "type" : "string", + "name" : "--input_uri", + "description" : "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument.", + "example" : [ + "s3://bucket/path" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--census_version", + "description" : "Which release of CellxGene census to use. Possible values are \\"latest\\", \\"stable\\", or the date of one of the releases (e.g. \\"2023-07-25\\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html).", + "example" : [ + "stable" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--add_dataset_metadata", + "description" : "If true, the experiment metadata will be added to the cell metadata. More specifically: `collection_id`, `collection_name`, `collection_doi`, `dataset_title`.", + "direction" : "input" + } + ] + }, + { + "name" : "Cell query", + "description" : "Arguments related to the query.", + "arguments" : [ + { + "type" : "string", + "name" : "--species", + "description" : "The organism to query, usually one of `Homo sapiens` or `Mus musculus`.", + "example" : [ + "homo_sapiens" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_value_filter", + "description" : "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax.", + "example" : [ + "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filter cells by grouping", + "description" : "Filter groups with fewer than X number of cells.", + "arguments" : [ + { + "type" : "string", + "name" : "--cell_filter_grouping", + "description" : "A subset of 'obs' columns by which to group the cells for filtering.\nOnly groups surpassing or equal to the `--cell_filter_minimum_count`\nthreshold will be retained. Take care not to introduce a selection\nbias against cells with more fine-grained ontology annotations.\n", + "example" : [ + "dataset_id", + "tissue", + "assay", + "disease", + "cell_type" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--cell_filter_minimum_count", + "description" : "A minimum number of cells per group to retain. If `--cell_filter_grouping`\nis defined, this parameter should also be provided and vice versa.\n", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Count filtering", + "description" : "Arguments related to filtering cells and genes by counts.", + "arguments" : [ + { + "type" : "integer", + "name" : "--cell_filter_min_genes", + "description" : "Remove cells with less than this number of genes.", + "default" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--cell_filter_min_counts", + "description" : "Remove cells with less than this number of counts.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gene_filter_min_cells", + "description" : "Remove genes expressed in less than this number of cells.", + "default" : [ + 5 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gene_filter_min_counts", + "description" : "Remove genes with less than this number of counts.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Output arguments.", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output h5mu file.", + "info" : { + "label" : "CellxGene dataset", + "summary" : "A dataset queried from the CellxGene Census platform", + "description" : "The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).\n", + "slots" : { + "mod" : [ + { + "name" : "rna", + "layers" : [ + { + "type" : "integer", + "name" : "counts", + "description" : "Raw counts", + "required" : true + } + ], + "obs" : [ + { + "type" : "string", + "name" : "dataset_id", + "description" : "Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes.", + "required" : false + }, + { + "type" : "string", + "name" : "assay", + "description" : "Type of assay used to generate the cell data, indicating the methodology or technique employed.", + "required" : true + }, + { + "type" : "string", + "name" : "assay_ontology_term_id", + "description" : "Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type.", + "required" : true + }, + { + "type" : "string", + "name" : "cell_type", + "description" : "Classification of the cell type based on its characteristics and function within the tissue or organism.", + "required" : true + }, + { + "type" : "string", + "name" : "cell_type_ontology_term_id", + "description" : "Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification.", + "required" : true + }, + { + "type" : "string", + "name" : "development_stage", + "description" : "Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase.", + "required" : true + }, + { + "type" : "string", + "name" : "development_stage_ontology_term_id", + "description" : "Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase.\n\nIf the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. \nIf the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used.\nOtherwise, the Uberon (`UBERON:`) ontology is used.\n", + "required" : true + }, + { + "type" : "string", + "name" : "disease", + "description" : "Information on any disease or pathological condition associated with the cell or donor.", + "required" : true + }, + { + "type" : "string", + "name" : "disease_ontology_term_id", + "description" : "Ontology term identifier for the disease, enabling standardized disease classification and referencing.\n\nMust be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`).\n", + "required" : true + }, + { + "type" : "string", + "name" : "donor_id", + "description" : "Identifier for the donor from whom the cell sample is obtained.", + "required" : true + }, + { + "type" : "boolean", + "name" : "is_primary_data", + "description" : "Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data.", + "required" : true + }, + { + "type" : "string", + "name" : "organism", + "description" : "Organism from which the cell sample is obtained.", + "required" : true + }, + { + "type" : "string", + "name" : "organism_ontology_term_id", + "description" : "Ontology term identifier for the organism, providing a standardized reference for the organism.\n\nMust be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`.\n", + "required" : true + }, + { + "type" : "string", + "name" : "self_reported_ethnicity", + "description" : "Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits.", + "required" : true + }, + { + "type" : "string", + "name" : "self_reported_ethnicity_ontology_term_id", + "description" : "Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications.\n\nIf the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used.\n", + "required" : true + }, + { + "type" : "string", + "name" : "sex", + "description" : "Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions.", + "required" : true + }, + { + "type" : "string", + "name" : "sex_ontology_term_id", + "description" : "Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed.", + "required" : true + }, + { + "type" : "string", + "name" : "suspension_type", + "description" : "Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions.", + "required" : true + }, + { + "type" : "string", + "name" : "tissue", + "description" : "Specific tissue from which the cells were derived, key for context and specificity in cell studies.", + "required" : true + }, + { + "type" : "string", + "name" : "tissue_ontology_term_id", + "description" : "Ontology term identifier for the tissue, providing a standardized reference for the tissue type.\n\nFor organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity).\nFor cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`.\n", + "required" : true + }, + { + "type" : "string", + "name" : "tissue_general", + "description" : "General category or classification of the tissue, useful for broader grouping and comparison of cell data.", + "required" : true + }, + { + "type" : "string", + "name" : "tissue_general_ontology_term_id", + "description" : "Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types.\n\nFor organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity).\nFor cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`.\n", + "required" : true + }, + { + "type" : "integer", + "name" : "soma_joinid", + "description" : "If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell.", + "required" : true + } + ], + "var" : [ + { + "type" : "string", + "name" : "feature_id", + "description" : "Unique identifier for the feature, usually a ENSEMBL gene id.", + "required" : true + }, + { + "type" : "string", + "name" : "feature_name", + "description" : "A human-readable name for the feature, usually a gene symbol.", + "required" : true + }, + { + "type" : "integer", + "name" : "soma_joinid", + "description" : "If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature.", + "required" : true + } + ] + } + ] + } + }, + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_modality", + "description" : "Which modality to store the output in.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer_counts", + "description" : "Which layer to store the raw counts in. If not provided, the .X layer will be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Query cells from a CellxGene Census or custom TileDBSoma object.\nAside from fetching the cells' RNA counts (`.X`), cell metadata\n(`.obs`) and gene metadata (`.var`), this component also fetches\nthe dataset metadata and joins it into the cell metadata.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "cellxgene-census" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/query/cellxgene_census/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/query/cellxgene_census", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import cellxgene_census +import scanpy as sc +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input_uri': $( if [ ! -z ${VIASH_PAR_INPUT_URI+x} ]; then echo "r'${VIASH_PAR_INPUT_URI//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'census_version': $( if [ ! -z ${VIASH_PAR_CENSUS_VERSION+x} ]; then echo "r'${VIASH_PAR_CENSUS_VERSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'add_dataset_metadata': $( if [ ! -z ${VIASH_PAR_ADD_DATASET_METADATA+x} ]; then echo "r'${VIASH_PAR_ADD_DATASET_METADATA//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'species': $( if [ ! -z ${VIASH_PAR_SPECIES+x} ]; then echo "r'${VIASH_PAR_SPECIES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_value_filter': $( if [ ! -z ${VIASH_PAR_OBS_VALUE_FILTER+x} ]; then echo "r'${VIASH_PAR_OBS_VALUE_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cell_filter_grouping': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_GROUPING+x} ]; then echo "r'${VIASH_PAR_CELL_FILTER_GROUPING//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'cell_filter_minimum_count': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MINIMUM_COUNT+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MINIMUM_COUNT//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cell_filter_min_genes': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MIN_GENES+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MIN_GENES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'cell_filter_min_counts': $( if [ ! -z ${VIASH_PAR_CELL_FILTER_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_CELL_FILTER_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gene_filter_min_cells': $( if [ ! -z ${VIASH_PAR_GENE_FILTER_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_GENE_FILTER_MIN_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'gene_filter_min_counts': $( if [ ! -z ${VIASH_PAR_GENE_FILTER_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_GENE_FILTER_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_modality': $( if [ ! -z ${VIASH_PAR_OUTPUT_MODALITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer_counts': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER_COUNTS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER_COUNTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +from setup_logger import setup_logger + +logger = setup_logger() + + +def connect_census(uri, census_version): + """ + Connect to CellxGene Census or user-provided TileDBSoma object + """ + ver = census_version or "stable" + logger.info( + "Connecting to CellxGene Census at %s", + f"'{uri}'" if uri else f"version '{ver}'", + ) + return cellxgene_census.open_soma(uri=uri, census_version=ver) + + +def get_anndata(census_connection, par): + logger.info( + "Getting gene expression data based on \\`%s\\` query.", par["obs_value_filter"] + ) + return cellxgene_census.get_anndata( + census=census_connection, + obs_value_filter=par["obs_value_filter"], + organism=par["species"], + ) + + +def add_cellcensus_metadata_obs(census_connection, adata): + logger.info("Adding additional metadata to gene expression data.") + census_datasets = ( + census_connection["census_info"]["datasets"].read().concat().to_pandas() + ) + + adata.obs.dataset_id = adata.obs.dataset_id.astype("category") + + dataset_info = ( + census_datasets[ + census_datasets.dataset_id.isin(adata.obs.dataset_id.cat.categories) + ][ + [ + "collection_id", + "collection_name", + "collection_doi", + "dataset_id", + "dataset_title", + ] + ] + .reset_index(drop=True) + .astype("category") + ) + + adata.obs = adata.obs.merge(dataset_info, on="dataset_id", how="left") + + +def filter_min_cells_per_group(adata, par): + n_cells_before, _ = adata.shape + cell_count = adata.obs.groupby(par["cell_filter_grouping"])[ + "soma_joinid" + ].transform("count") + adata = adata[cell_count >= par["cell_filter_minimum_count"]] + n_cells_after, _ = adata.shape + logger.info( + "Removed %s cells based on %s cell_filter_minimum_count of %s cell_filter_grouping." + % ( + (n_cells_before - n_cells_after), + par["cell_filter_minimum_count"], + par["cell_filter_grouping"], + ) + ) + return adata + + +def filter_by_counts(adata, par): + logger.info("Remove cells with few counts and genes with few counts.") + n_cells_before, n_genes_before = adata.shape + # remove cells with few counts and genes with few counts + scanpy_proc = { + par["cell_filter_min_counts"]: (sc.pp.filter_cells, "min_counts"), + par["cell_filter_min_genes"]: (sc.pp.filter_cells, "min_genes"), + par["gene_filter_min_counts"]: (sc.pp.filter_genes, "min_counts"), + par["gene_filter_min_cells"]: (sc.pp.filter_genes, "min_cells"), + } + for threshold, (func, arg) in scanpy_proc.items(): + if threshold: + func(adata, **{arg: threshold}) + n_cells_after, n_genes_after = adata.shape + logger.info( + "Removed %s cells and %s genes.", + (n_cells_before - n_cells_after), + (n_genes_before - n_genes_after), + ) + + +def move_x_to_layers(adata, layer_name): + logger.info(f"Move .X to .layers['{layer_name}']") + adata.layers[layer_name] = adata.X + adata.X = None + + +def print_unique(adata, column): + unique_values = adata.obs[column].unique().astype(str) + formatted = "', '".join(unique_values[:50]) + if len(unique_values) > 50: + formatted += ", ..." + logger.info(f"Unique {column}: ['{formatted}']") + + +def print_summary(adata): + logger.info(f"Resulting dataset: {adata}") + + logger.info("Summary of dataset:") + for field in adata.obs.columns: + print_unique(adata, field) + + +def write_anndata(adata, par): + logger.info("Writing MuData object to '%s'", par["output"]) + + mdata = mu.MuData({par["output_modality"]: adata}) + + mdata.write_h5mu(par["output"], compression=par["output_compression"]) + + +def main(par, meta): + # check arguments + if (par["cell_filter_grouping"] is None) != ( + par["cell_filter_minimum_count"] is None + ): + raise NotImplementedError( + "You need to specify either both or none of the following parameters: cell_filter_grouping, cell_filter_minimum_count" + ) + + with connect_census( + uri=par["input_uri"], census_version=par["census_version"] + ) as conn: + adata = get_anndata(conn, par) + + if par["add_dataset_metadata"]: + add_cellcensus_metadata_obs(conn, adata) + + print(f"AnnData: {adata}", flush=True) + + if par["cell_filter_grouping"] is not None: + adata = filter_min_cells_per_group(adata, par) + + # remove cells with few counts and genes with few counts + filter_by_counts(adata, par) + + # logger.log(f"Filtered AnnData: {adata}") + print(f"Filtered AnnData: {adata}", flush=True) + + # use feature_id as var_names + adata.var_names = adata.var["feature_id"] + + # move .X to .layers["counts"] + if par["output_layer_counts"]: + move_x_to_layers(adata, par["output_layer_counts"]) + + # print summary + print_summary(adata) + + # write output to file + write_anndata(adata, par) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/query/cellxgene_census", + "tag" : "main" + }, + "label" : [ + "highmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/query/cellxgene_census/nextflow.config b/target/nextflow/query/cellxgene_census/nextflow.config new file mode 100644 index 00000000..6a9b1b33 --- /dev/null +++ b/target/nextflow/query/cellxgene_census/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'query/cellxgene_census' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Query cells from a CellxGene Census or custom TileDBSoma object.\nAside from fetching the cells\' RNA counts (`.X`), cell metadata\n(`.obs`) and gene metadata (`.var`), this component also fetches\nthe dataset metadata and joins it into the cell metadata.\n' + author = 'Matthias Beyens, Dries De Maeyer, Robrecht Cannoodt, Kai Waldrant' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/query/cellxgene_census/nextflow_labels.config b/target/nextflow/query/cellxgene_census/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/query/cellxgene_census/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/query/cellxgene_census/nextflow_schema.json b/target/nextflow/query/cellxgene_census/nextflow_schema.json new file mode 100644 index 00000000..a5b64615 --- /dev/null +++ b/target/nextflow/query/cellxgene_census/nextflow_schema.json @@ -0,0 +1,165 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellxgene_census", + "description": "Query cells from a CellxGene Census or custom TileDBSoma object.\nAside from fetching the cells' RNA counts (`.X`), cell metadata\n(`.obs`) and gene metadata (`.var`), this component also fetches\nthe dataset metadata and joins it into the cell metadata.\n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Output arguments.", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_modality": { + "type": "string", + "description": "Which modality to store the output in.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "output_layer_counts": { + "type": "string", + "description": "Which layer to store the raw counts in", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "input database": { + "title": "Input database", + "type": "object", + "description": "Open CellxGene Census by version or URI.", + "properties": { + "input_uri": { + "type": "string", + "description": "If specified, a URI containing the Census SOMA objects", + "help_text": "Type: `string`, multiple: `False`, example: `\"s3://bucket/path\"`. " + }, + "census_version": { + "type": "string", + "description": "Which release of CellxGene census to use", + "help_text": "Type: `string`, multiple: `False`, example: `\"stable\"`. " + }, + "add_dataset_metadata": { + "type": "boolean", + "description": "If true, the experiment metadata will be added to the cell metadata", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "cell query": { + "title": "Cell query", + "type": "object", + "description": "Arguments related to the query.", + "properties": { + "species": { + "type": "string", + "description": "The organism to query, usually one of `Homo sapiens` or `Mus musculus`.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"homo_sapiens\"`. " + }, + "obs_value_filter": { + "type": "string", + "description": "Filter for selecting the `obs` metadata (i.e", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'\"`. " + } + } + }, + "filter cells by grouping": { + "title": "Filter cells by grouping", + "type": "object", + "description": "Filter groups with fewer than X number of cells.", + "properties": { + "cell_filter_grouping": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A subset of 'obs' columns by which to group the cells for filtering.\nOnly groups surpassing or equal to the `--cell_filter_minimum_count`\nthreshold will be retained", + "help_text": "Type: `string`, multiple: `True`, example: `[\"dataset_id\";\"tissue\";\"assay\";\"disease\";\"cell_type\"]`. " + }, + "cell_filter_minimum_count": { + "type": "integer", + "description": "A minimum number of cells per group to retain", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + } + } + }, + "count filtering": { + "title": "Count filtering", + "type": "object", + "description": "Arguments related to filtering cells and genes by counts.", + "properties": { + "cell_filter_min_genes": { + "type": "integer", + "description": "Remove cells with less than this number of genes.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "cell_filter_min_counts": { + "type": "integer", + "description": "Remove cells with less than this number of counts.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + }, + "gene_filter_min_cells": { + "type": "integer", + "description": "Remove genes expressed in less than this number of cells.", + "help_text": "Type: `integer`, multiple: `False`, default: `5`. ", + "default": 5 + }, + "gene_filter_min_counts": { + "type": "integer", + "description": "Remove genes with less than this number of counts.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/input database" + }, + { + "$ref": "#/$defs/cell query" + }, + { + "$ref": "#/$defs/filter cells by grouping" + }, + { + "$ref": "#/$defs/count filtering" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/query/cellxgene_census/setup_logger.py b/target/nextflow/query/cellxgene_census/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/query/cellxgene_census/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/reference/build_bdrhap_reference/.config.vsh.yaml b/target/nextflow/reference/build_bdrhap_reference/.config.vsh.yaml new file mode 100644 index 00000000..f0901268 --- /dev/null +++ b/target/nextflow/reference/build_bdrhap_reference/.config.vsh.yaml @@ -0,0 +1,323 @@ +name: "build_bdrhap_reference" +namespace: "reference" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody\ + \ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse." + info: + config_key: "Genome_fasta" + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gtf" + description: "File path to the transcript annotation files in GTF or GTF.GZ format.\ + \ The Sequence Analysis Pipeline requires the 'gene_name' or \n'gene_id' attribute\ + \ to be set on each gene and exon feature. Gene and exon feature lines must\ + \ have the same attribute, and exons\nmust have a corresponding gene with the\ + \ same value. For TCR/BCR assays, the TCR or BCR gene segments must have the\ + \ 'gene_type' or\n'gene_biotype' attribute set, and the value should begin with\ + \ 'TR' or 'IG', respectively.\n" + info: + config_key: "Gtf" + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--extra_sequences" + description: "File path to additional sequences in FASTA format to use when building\ + \ the STAR index. (e.g. transgenes or CRISPR guide barcodes).\nGTF lines for\ + \ these sequences will be automatically generated and combined with the main\ + \ GTF.\n" + info: + config_key: "Extra_sequences" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--reference_archive" + description: "A Compressed archive containing the Reference Genome Index and annotation\ + \ GTF files. This archive is meant to be used as an\ninput in the BD Rhapsody\ + \ Sequencing Analysis Pipeline.\n" + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--mitochondrial_contigs" + description: "Names of the Mitochondrial contigs in the provided Reference Genome.\ + \ Fragments originating from contigs other than these are\nidentified as 'nuclear\ + \ fragments' in the ATACseq analysis pipeline.\n" + info: + config_key: "Mitochondrial_contigs" + default: + - "chrM" + - "chrMT" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--filtering_off" + description: "By default the input Transcript Annotation files are filtered based\ + \ on the gene_type/gene_biotype attribute. Only features \nhaving the following\ + \ attribute values are kept:\n\n - protein_coding\n - lncRNA \n - IG_LV_gene\n\ + \ - IG_V_gene\n - IG_V_pseudogene\n - IG_D_gene\n - IG_J_gene\n - IG_J_pseudogene\n\ + \ - IG_C_gene\n - IG_C_pseudogene\n - TR_V_gene\n - TR_V_pseudogene\n -\ + \ TR_D_gene\n - TR_J_gene\n - TR_J_pseudogene\n - TR_C_gene\n\n If you have\ + \ already pre-filtered the input Annotation files and/or wish to turn-off the\ + \ filtering, please set this option to True.\n" + info: + config_key: "Filtering_off" + direction: "input" + - type: "boolean_true" + name: "--wta_only_index" + description: "Build a WTA only index, otherwise builds a WTA + ATAC index." + info: + config_key: "Wta_Only" + direction: "input" + - type: "string" + name: "--extra_star_params" + description: "Additional parameters to pass to STAR when building the genome index.\ + \ Specify exactly like how you would on the command line." + info: + config_key: "Extra_STAR_params" + example: + - "--limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "make_rhap_reference_2.2.1_nodocker.cwl" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "The Reference Files Generator creates an archive containing Genome Index\n\ + and Transcriptome annotation files needed for the BD Rhapsody Sequencing\nAnalysis\ + \ Pipeline. The app takes as input one or more FASTA and GTF files\nand produces\ + \ a compressed archive in the form of a tar.gz file. The \narchive contains:\n \ + \ \n- STAR index\n- Filtered GTF file\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference.fa.gz" +- type: "file" + path: "reference.gtf.gz" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bdgenomics/rhapsody:2.2.1" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "seqkit" + interactive: false + - type: "python" + user: false + packages: + - "cwlref-runner" + - "cwl-runner" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_bdrhap_reference/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/build_bdrhap_reference" + executable: "target/nextflow/reference/build_bdrhap_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/build_bdrhap_reference/main.nf b/target/nextflow/reference/build_bdrhap_reference/main.nf new file mode 100644 index 00000000..130f9586 --- /dev/null +++ b/target/nextflow/reference/build_bdrhap_reference/main.nf @@ -0,0 +1,4136 @@ +// build_bdrhap_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "build_bdrhap_reference", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--genome_fasta", + "description" : "Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse.", + "info" : { + "config_key" : "Genome_fasta" + }, + "example" : [ + "genome_sequence.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--gtf", + "description" : "File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or \n'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons\nmust have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or\n'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively.\n", + "info" : { + "config_key" : "Gtf" + }, + "example" : [ + "transcriptome_annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--extra_sequences", + "description" : "File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes).\nGTF lines for these sequences will be automatically generated and combined with the main GTF.\n", + "info" : { + "config_key" : "Extra_sequences" + }, + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--reference_archive", + "description" : "A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an\ninput in the BD Rhapsody Sequencing Analysis Pipeline.\n", + "example" : [ + "reference.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--mitochondrial_contigs", + "description" : "Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are\nidentified as 'nuclear fragments' in the ATACseq analysis pipeline.\n", + "info" : { + "config_key" : "Mitochondrial_contigs" + }, + "default" : [ + "chrM", + "chrMT", + "M", + "MT" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--filtering_off", + "description" : "By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features \nhaving the following attribute values are kept:\n\n - protein_coding\n - lncRNA \n - IG_LV_gene\n - IG_V_gene\n - IG_V_pseudogene\n - IG_D_gene\n - IG_J_gene\n - IG_J_pseudogene\n - IG_C_gene\n - IG_C_pseudogene\n - TR_V_gene\n - TR_V_pseudogene\n - TR_D_gene\n - TR_J_gene\n - TR_J_pseudogene\n - TR_C_gene\n\n If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True.\n", + "info" : { + "config_key" : "Filtering_off" + }, + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--wta_only_index", + "description" : "Build a WTA only index, otherwise builds a WTA + ATAC index.", + "info" : { + "config_key" : "Wta_Only" + }, + "direction" : "input" + }, + { + "type" : "string", + "name" : "--extra_star_params", + "description" : "Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line.", + "info" : { + "config_key" : "Extra_STAR_params" + }, + "example" : [ + "--limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "make_rhap_reference_2.2.1_nodocker.cwl" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "The Reference Files Generator creates an archive containing Genome Index\nand Transcriptome annotation files needed for the BD Rhapsody Sequencing\nAnalysis Pipeline. The app takes as input one or more FASTA and GTF files\nand produces a compressed archive in the form of a tar.gz file. The \narchive contains:\n \n- STAR index\n- Filtered GTF file\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/reference.fa.gz" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/reference.gtf.gz" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "bdgenomics/rhapsody:2.2.1", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "seqkit" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "cwlref-runner", + "cwl-runner" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/build_bdrhap_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/build_bdrhap_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'genome_fasta': $( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "r'${VIASH_PAR_GENOME_FASTA//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'gtf': $( if [ ! -z ${VIASH_PAR_GTF+x} ]; then echo "r'${VIASH_PAR_GTF//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'extra_sequences': $( if [ ! -z ${VIASH_PAR_EXTRA_SEQUENCES+x} ]; then echo "r'${VIASH_PAR_EXTRA_SEQUENCES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reference_archive': $( if [ ! -z ${VIASH_PAR_REFERENCE_ARCHIVE+x} ]; then echo "r'${VIASH_PAR_REFERENCE_ARCHIVE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'mitochondrial_contigs': $( if [ ! -z ${VIASH_PAR_MITOCHONDRIAL_CONTIGS+x} ]; then echo "r'${VIASH_PAR_MITOCHONDRIAL_CONTIGS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'filtering_off': $( if [ ! -z ${VIASH_PAR_FILTERING_OFF+x} ]; then echo "r'${VIASH_PAR_FILTERING_OFF//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'wta_only_index': $( if [ ! -z ${VIASH_PAR_WTA_ONLY_INDEX+x} ]; then echo "r'${VIASH_PAR_WTA_ONLY_INDEX//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'extra_star_params': $( if [ ! -z ${VIASH_PAR_EXTRA_STAR_PARAMS+x} ]; then echo "r'${VIASH_PAR_EXTRA_STAR_PARAMS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["arguments"] = [ + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] + ] + + return config + + +def strip_margin(text: str) -> str: + return re.sub("(\\\\n?)[ \\\\t]*\\\\|", "\\\\\\\\1", text) + + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".gz"), ( + "Output reference_archive must end with .tar.gz." + ) + + # make paths absolute + for argument in config["arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] + else: + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + + return par + + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [ + strip_margin("""\\\\ +#!/usr/bin/env cwl-runner + +""") + ] + + config_key_value_pairs = [] + for argument in config["arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\\\\ +{config_key}: +""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\\\\ + - class: File + location: "{file}" +""") + else: + str += strip_margin(f"""\\\\ + class: File + location: "{par_value}" +""") + content_list.append(str) + else: + content_list.append( + strip_margin(f"""\\\\ +{config_key}: {par_value} +""") + ) + + ## Write config to file + return "".join(content_list) + + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file = os.path.join( + meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl" + ) + + return os.path.abspath(cwl_file) + + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"] + ) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file, + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + shutil.move( + os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"] + ) + + +if __name__ == "__main__": + main(par, meta) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/build_bdrhap_reference", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl b/target/nextflow/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/target/nextflow/reference/build_bdrhap_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/target/nextflow/reference/build_bdrhap_reference/nextflow.config b/target/nextflow/reference/build_bdrhap_reference/nextflow.config new file mode 100644 index 00000000..26fe3f83 --- /dev/null +++ b/target/nextflow/reference/build_bdrhap_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/build_bdrhap_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'The Reference Files Generator creates an archive containing Genome Index\nand Transcriptome annotation files needed for the BD Rhapsody Sequencing\nAnalysis Pipeline. The app takes as input one or more FASTA and GTF files\nand produces a compressed archive in the form of a tar.gz file. The \narchive contains:\n \n- STAR index\n- Filtered GTF file\n' + author = 'Robrecht Cannoodt, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/build_bdrhap_reference/nextflow_labels.config b/target/nextflow/reference/build_bdrhap_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/build_bdrhap_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/build_bdrhap_reference/nextflow_schema.json b/target/nextflow/reference/build_bdrhap_reference/nextflow_schema.json new file mode 100644 index 00000000..e69de29b diff --git a/target/nextflow/reference/build_cellranger_arc_reference/.config.vsh.yaml b/target/nextflow/reference/build_cellranger_arc_reference/.config.vsh.yaml new file mode 100644 index 00000000..42d035b8 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_arc_reference/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "build_cellranger_arc_reference" +namespace: "reference" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "author" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--annotation_gtf" + description: "Reference annotation." + info: null + example: + - "annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--motifs_file" + description: "Transcription factor motifs in JASPAR format. See https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references" + info: null + example: + - "JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_nuclear_contigs" + description: "Name(s) of contig(s) that do not have any chromatin structure, for\ + \ example, mitochondria or plastids. These contigs are excluded from peak calling\ + \ since the entire contig will be \"open\" due to a lack of chromatin structure.\ + \ Leave empty if there are no such contigs." + info: null + example: + - "chrM" + default: + - "chrM" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output folder" + info: null + example: + - "cellranger_reference" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--genome" + description: "Name of the genome. This will be the name of the intermediate output\ + \ folder" + info: null + example: + - "GRCh38" + default: + - "output" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--organism" + description: "Name of the organism. This is displayed in the web summary but is\ + \ otherwise not used in the analysis." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--subset_regex" + description: "Will subset the reference chromosomes using the given regex." + info: null + example: + - "(ERCC-00002|chr1)" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied\ + \ genome FASTA and gene GTF files. Creates a new folder named after the genome." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger_arc:2.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "docker" + - type: "apt" + packages: + - "git" + - "wget" + interactive: false + - type: "docker" + run: + - "TARGETARCH=\"${TARGETARCH:-$(dpkg --print-architecture)}\" && \\\nTARGETOS=\"\ + ${TARGETOS:-linux}\" && \\\nPATH=\"${PATH}:/usr/local/go/bin\" && \\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz\ + \ && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\nrm\ + \ go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git\ + \ && \\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ &&\ + \ rm -rf seqkit && rm -r /usr/local/go\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_cellranger_arc_reference/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/build_cellranger_arc_reference" + executable: "target/nextflow/reference/build_cellranger_arc_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/build_cellranger_arc_reference/main.nf b/target/nextflow/reference/build_cellranger_arc_reference/main.nf new file mode 100644 index 00000000..15324b6d --- /dev/null +++ b/target/nextflow/reference/build_cellranger_arc_reference/main.nf @@ -0,0 +1,4002 @@ +// build_cellranger_arc_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "build_cellranger_arc_reference", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--genome_fasta", + "description" : "Reference genome fasta.", + "example" : [ + "genome_sequence.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--annotation_gtf", + "description" : "Reference annotation.", + "example" : [ + "annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--motifs_file", + "description" : "Transcription factor motifs in JASPAR format. See https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references", + "example" : [ + "JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--non_nuclear_contigs", + "description" : "Name(s) of contig(s) that do not have any chromatin structure, for example, mitochondria or plastids. These contigs are excluded from peak calling since the entire contig will be \\"open\\" due to a lack of chromatin structure. Leave empty if there are no such contigs.", + "example" : [ + "chrM" + ], + "default" : [ + "chrM" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output folder", + "example" : [ + "cellranger_reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--genome", + "description" : "Name of the genome. This will be the name of the intermediate output folder", + "example" : [ + "GRCh38" + ], + "default" : [ + "output" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--organism", + "description" : "Name of the organism. This is displayed in the web summary but is otherwise not used in the analysis.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--subset_regex", + "description" : "Will subset the reference chromosomes using the given regex.", + "example" : [ + "(ERCC-00002|chr1)" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger_arc:2.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "docker" + }, + { + "type" : "apt", + "packages" : [ + "git", + "wget" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "TARGETARCH=\\"${TARGETARCH:-$(dpkg --print-architecture)}\\" && \\\\\nTARGETOS=\\"${TARGETOS:-linux}\\" && \\\\\nPATH=\\"${PATH}:/usr/local/go/bin\\" && \\\\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\\\nrm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \\\\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/build_cellranger_arc_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/build_cellranger_arc_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_ANNOTATION_GTF+x} ]; then echo "${VIASH_PAR_ANNOTATION_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_annotation_gtf='&'#" ; else echo "# par_annotation_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_MOTIFS_FILE+x} ]; then echo "${VIASH_PAR_MOTIFS_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_motifs_file='&'#" ; else echo "# par_motifs_file="; fi ) +$( if [ ! -z ${VIASH_PAR_NON_NUCLEAR_CONTIGS+x} ]; then echo "${VIASH_PAR_NON_NUCLEAR_CONTIGS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_non_nuclear_contigs='&'#" ; else echo "# par_non_nuclear_contigs="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOME+x} ]; then echo "${VIASH_PAR_GENOME}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genome='&'#" ; else echo "# par_genome="; fi ) +$( if [ ! -z ${VIASH_PAR_ORGANISM+x} ]; then echo "${VIASH_PAR_ORGANISM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_organism='&'#" ; else echo "# par_organism="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSET_REGEX+x} ]; then echo "${VIASH_PAR_SUBSET_REGEX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_subset_regex='&'#" ; else echo "# par_subset_regex="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +echo "> Getting path of fasta file" +par_genome_fasta=\\`realpath \\$par_genome_fasta\\` +echo "> Getting path of annotation file" +par_annotation_gtf=\\`realpath \\$par_annotation_gtf\\` +echo "> Getting path of output file" +par_output=\\`realpath \\$par_output\\` +echo "> Getting path of motifs file" +par_motifs_file=\\`realpath \\$par_motifs_file\\` + +# process params +extra_params=( ) + +if [ ! -z "\\$meta_cpus" ]; then + extra_params+=( "nthreads: \\\\"\\$meta_cpus"\\\\" ) +fi +if [ ! -z "\\$meta_memory_gb" ]; then + # always keep 2gb for the OS itself + memory_gb=\\`python -c "print(int('\\$meta_memory_gb') - 2)"\\` + extra_params+=( "memgb: \\\\"\\$memory_gb"\\\\" ) +fi + +echo "> Unzipping input files" +unpigz -c "\\$par_genome_fasta" > "\\$tmpdir/genome.fa" + +echo "> Building star index" +cd "\\$tmpdir" + +echo "> Building config" +config_in="\\${tmpdir}/config" + +# If non_nuclear_contigs is not set or bash thinks it is a flag, set it to an empty string +if [[ -z \\$par_non_nuclear_contigs || \\$par_non_nuclear_contigs == "--non_nuclear_contigs" ]]; then + non_nuclear_contigs="" +else + printf -v non_nuclear_contigs '"%s",' "\\${par_non_nuclear_contigs[@]}" + non_nuclear_contigs="[\\${non_nuclear_contigs%,}]" # remove trailing comma +fi + +echo """{ + \\${par_organism:+organism: \\\\"\\$par_organism\\\\"} + genome: [\\\\"\\${par_genome}\\\\"] + input_fasta: [\\\\""\\${tmpdir}/genome.fa"\\\\"] + input_gtf: [\\\\""\\${par_annotation_gtf}\\\\""] + \\${non_nuclear_contigs:+non_nuclear_contigs: "\\${non_nuclear_contigs}"} + input_motifs: \\\\""\\$par_motifs_file"\\\\" + \\$(printf "%s\\\\n" "\\${extra_params[@]}") +}""" > "\\$config_in" + +echo "> Config content:" +cat \\${config_in} + +echo "> Running cellranger" +cellranger-arc mkref --config=\\${config_in} + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "\\$par_output" -C "\\${tmpdir}/\\${par_genome}" . +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/build_cellranger_arc_reference", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/build_cellranger_arc_reference/nextflow.config b/target/nextflow/reference/build_cellranger_arc_reference/nextflow.config new file mode 100644 index 00000000..8c3e0dd4 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_arc_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/build_cellranger_arc_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/build_cellranger_arc_reference/nextflow_labels.config b/target/nextflow/reference/build_cellranger_arc_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_arc_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/build_cellranger_arc_reference/nextflow_schema.json b/target/nextflow/reference/build_cellranger_arc_reference/nextflow_schema.json new file mode 100644 index 00000000..b24fde33 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_arc_reference/nextflow_schema.json @@ -0,0 +1,89 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "build_cellranger_arc_reference", + "description": "Build a Cell Ranger-arc and -atac compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "genome_fasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference genome fasta.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"genome_sequence.fa.gz\"`. " + }, + "annotation_gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference annotation.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"annotation.gtf.gz\"`. " + }, + "motifs_file": { + "type": "string", + "format": "path", + "description": "Transcription factor motifs in JASPAR format", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"JASPAR2024_CORE_non-redundant_pfms_jaspar.txt.modified\"`. " + }, + "non_nuclear_contigs": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Name(s) of contig(s) that do not have any chromatin structure, for example, mitochondria or plastids", + "help_text": "Type: `string`, multiple: `True`, default: `[\"chrM\"]`, example: `[\"chrM\"]`. ", + "default": [ + "chrM" + ] + }, + "output": { + "type": "string", + "format": "path", + "description": "Output folder", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"cellranger_reference\"`. ", + "default": "$id.$key.output" + }, + "genome": { + "type": "string", + "description": "Name of the genome", + "help_text": "Type: `string`, multiple: `False`, required, default: `\"output\"`, example: `\"GRCh38\"`. ", + "default": "output" + }, + "organism": { + "type": "string", + "description": "Name of the organism", + "help_text": "Type: `string`, multiple: `False`. " + }, + "subset_regex": { + "type": "string", + "description": "Will subset the reference chromosomes using the given regex.", + "help_text": "Type: `string`, multiple: `False`, example: `\"(ERCC-00002|chr1)\"`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/reference/build_cellranger_reference/.config.vsh.yaml b/target/nextflow/reference/build_cellranger_reference/.config.vsh.yaml new file mode 100644 index 00000000..ca17d9a2 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_reference/.config.vsh.yaml @@ -0,0 +1,263 @@ +name: "build_cellranger_reference" +namespace: "reference" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + description: "Reference transcriptome annotation." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_version" + description: "Optional reference version string to include with reference" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Output folder" + info: null + example: + - "cellranger_reference" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Build a Cell Ranger-compatible reference folder from user-supplied genome\ + \ FASTA and gene GTF files. Creates a new folder named after the genome." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + - "wget" + interactive: false + - type: "docker" + run: + - "TARGETARCH=\"${TARGETARCH:-$(dpkg --print-architecture)}\" && \\\nTARGETOS=\"\ + ${TARGETOS:-linux}\" && \\\nPATH=\"${PATH}:/usr/local/go/bin\" && \\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz\ + \ && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\nrm\ + \ go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git\ + \ && \\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ &&\ + \ rm -rf seqkit && rm -r /usr/local/go\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_cellranger_reference/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/build_cellranger_reference" + executable: "target/nextflow/reference/build_cellranger_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/build_cellranger_reference/main.nf b/target/nextflow/reference/build_cellranger_reference/main.nf new file mode 100644 index 00000000..b451693b --- /dev/null +++ b/target/nextflow/reference/build_cellranger_reference/main.nf @@ -0,0 +1,3936 @@ +// build_cellranger_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "build_cellranger_reference", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--genome_fasta", + "description" : "Reference genome fasta.", + "example" : [ + "genome_sequence.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--transcriptome_gtf", + "description" : "Reference transcriptome annotation.", + "example" : [ + "transcriptome_annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_version", + "description" : "Optional reference version string to include with reference", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Output folder", + "example" : [ + "cellranger_reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger:9.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git", + "wget" + ], + "interactive" : false + }, + { + "type" : "docker", + "run" : [ + "TARGETARCH=\\"${TARGETARCH:-$(dpkg --print-architecture)}\\" && \\\\\nTARGETOS=\\"${TARGETOS:-linux}\\" && \\\\\nPATH=\\"${PATH}:/usr/local/go/bin\\" && \\\\\nwget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\\\nrm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \\\\\ngit clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \\\\\ncd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/build_cellranger_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/build_cellranger_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_transcriptome_gtf='&'#" ; else echo "# par_transcriptome_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_REFERENCE_VERSION+x} ]; then echo "${VIASH_PAR_REFERENCE_VERSION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_reference_version='&'#" ; else echo "# par_reference_version="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_genome_fasta=\\`realpath \\$par_genome_fasta\\` +par_transcriptome_gtf=\\`realpath \\$par_transcriptome_gtf\\` +par_output=\\`realpath \\$par_output\\` + + +echo "> Unzipping input files" +unpigz -c "\\$par_genome_fasta" > "\\$tmpdir/genome.fa" + +echo "> Building star index" +cd "\\$tmpdir" +cellranger mkref \\\\ + --fasta "\\$tmpdir/genome.fa" \\\\ + --genes "\\$par_transcriptome_gtf" \\\\ + --genome output \\\\ + \\${par_reference_version:+--ref-version \\$par_reference_version} \\\\ + \\${meta_cpus:+--nthreads \\$meta_cpus} \\\\ + \\${meta_memory_gb:+--memgb \\$((\\$meta_memory_gb-2))} # always keep 2 gb for the OS itseld + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "\\$par_output" -C "\\$tmpdir/output" . +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/build_cellranger_reference", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/build_cellranger_reference/nextflow.config b/target/nextflow/reference/build_cellranger_reference/nextflow.config new file mode 100644 index 00000000..d7f3004c --- /dev/null +++ b/target/nextflow/reference/build_cellranger_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/build_cellranger_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/build_cellranger_reference/nextflow_labels.config b/target/nextflow/reference/build_cellranger_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/build_cellranger_reference/nextflow_schema.json b/target/nextflow/reference/build_cellranger_reference/nextflow_schema.json new file mode 100644 index 00000000..148ef0e0 --- /dev/null +++ b/target/nextflow/reference/build_cellranger_reference/nextflow_schema.json @@ -0,0 +1,61 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "build_cellranger_reference", + "description": "Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "genome_fasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference genome fasta.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"genome_sequence.fa.gz\"`. " + }, + "transcriptome_gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference transcriptome annotation.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"transcriptome_annotation.gtf.gz\"`. " + }, + "reference_version": { + "type": "string", + "description": "Optional reference version string to include with reference", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output folder", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"cellranger_reference\"`. ", + "default": "$id.$key.output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/reference/build_star_reference/.config.vsh.yaml b/target/nextflow/reference/build_star_reference/.config.vsh.yaml new file mode 100644 index 00000000..93c293b5 --- /dev/null +++ b/target/nextflow/reference/build_star_reference/.config.vsh.yaml @@ -0,0 +1,257 @@ +name: "build_star_reference" +namespace: "reference" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input/Output" + arguments: + - type: "file" + name: "--genome_fasta" + alternatives: + - "--genomeFastaFiles" + description: "The fasta files to be included in the reference. Corresponds to\ + \ the --genomeFastaFiles argument in the STAR command." + info: null + example: + - "chr1.fasta" + - "chr2.fasta" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + alternatives: + - "--sjdbGTFfile" + description: "Specifies the path to the file with annotated transcripts in the\ + \ standard GTF\nformat. STAR will extract splice junctions from this file and\ + \ use them to greatly improve\naccuracy of the mapping. Corresponds to the --sjdbGTFfile\ + \ argument in the STAR command.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "--genomeDir" + description: "Path to output directory. Corresponds to the --genomeDir argument\ + \ in the STAR command." + info: null + example: + - "/path/to/foo" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Genome indexing arguments" + arguments: + - type: "integer" + name: "--genomeSAindexNbases" + description: "Length (bases) of the SA pre-indexing string. Typically between\ + \ 10 and 15.\nLonger strings will use much more memory, but allow faster searches.\ + \ For small\ngenomes, the parameter {genomeSAindexNbases must be scaled down\ + \ to\nmin(14, log2(GenomeLength)/2 - 1).\n" + info: null + default: + - 14 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Create a reference for STAR from a set of fasta files." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "docker" + env: + - "STAR_VERSION 2.7.10b" + - "PACKAGES gcc g++ make wget zlib1g-dev unzip" + - type: "docker" + run: + - "apt-get update && \\\n apt-get install -y --no-install-recommends ${PACKAGES}\ + \ && \\\n cd /tmp && \\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\ + \ && \\\n unzip ${STAR_VERSION}.zip && \\\n cd STAR-${STAR_VERSION}/source\ + \ && \\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\n cp STAR /usr/local/bin\ + \ && \\\n cd / && \\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\ + \ && \\\n apt-get --purge autoremove -y ${PACKAGES} && \\\n apt-get clean\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/build_star_reference/config.vsh.yml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/build_star_reference" + executable: "target/nextflow/reference/build_star_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/build_star_reference/main.nf b/target/nextflow/reference/build_star_reference/main.nf new file mode 100644 index 00000000..57d75fe2 --- /dev/null +++ b/target/nextflow/reference/build_star_reference/main.nf @@ -0,0 +1,4034 @@ +// build_star_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "build_star_reference", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input/Output", + "arguments" : [ + { + "type" : "file", + "name" : "--genome_fasta", + "alternatives" : [ + "--genomeFastaFiles" + ], + "description" : "The fasta files to be included in the reference. Corresponds to the --genomeFastaFiles argument in the STAR command.", + "example" : [ + "chr1.fasta", + "chr2.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--transcriptome_gtf", + "alternatives" : [ + "--sjdbGTFfile" + ], + "description" : "Specifies the path to the file with annotated transcripts in the standard GTF\nformat. STAR will extract splice junctions from this file and use them to greatly improve\naccuracy of the mapping. Corresponds to the --sjdbGTFfile argument in the STAR command.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "--genomeDir" + ], + "description" : "Path to output directory. Corresponds to the --genomeDir argument in the STAR command.", + "example" : [ + "/path/to/foo" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Genome indexing arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--genomeSAindexNbases", + "description" : "Length (bases) of the SA pre-indexing string. Typically between 10 and 15.\nLonger strings will use much more memory, but allow faster searches. For small\ngenomes, the parameter {genomeSAindexNbases must be scaled down to\nmin(14, log2(GenomeLength)/2 - 1).\n", + "default" : [ + 14 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Create a reference for STAR from a set of fasta files.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "docker", + "env" : [ + "STAR_VERSION 2.7.10b", + "PACKAGES gcc g++ make wget zlib1g-dev unzip" + ] + }, + { + "type" : "docker", + "run" : [ + "apt-get update && \\\\\n apt-get install -y --no-install-recommends ${PACKAGES} && \\\\\n cd /tmp && \\\\\n wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \\\\\n unzip ${STAR_VERSION}.zip && \\\\\n cd STAR-${STAR_VERSION}/source && \\\\\n make STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\\\n cp STAR /usr/local/bin && \\\\\n cd / && \\\\\n rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \\\\\n apt-get --purge autoremove -y ${PACKAGES} && \\\\\n apt-get clean\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/build_star_reference/config.vsh.yml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/build_star_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import tempfile +import subprocess +from pathlib import Path +import tarfile +import gzip +import shutil + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'genome_fasta': $( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "r'${VIASH_PAR_GENOME_FASTA//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'transcriptome_gtf': $( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "r'${VIASH_PAR_TRANSCRIPTOME_GTF//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'genomeSAindexNbases': $( if [ ! -z ${VIASH_PAR_GENOMESAINDEXNBASES+x} ]; then echo "int(r'${VIASH_PAR_GENOMESAINDEXNBASES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +######################## +### Helper functions ### +######################## + + +# helper function for cheching whether something is a gzip +def is_gz_file(path: Path) -> bool: + with open(path, "rb") as file: + return file.read(2) == b"\\\\x1f\\\\x8b" + + +# if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path +def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: + if par_value.is_file() and tarfile.is_tarfile(par_value): + # Remove two extensions (if they exist) + extaction_dir_name = Path(par_value.stem).stem + unpacked_path = temp_dir_path / extaction_dir_name + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) + + with tarfile.open(par_value, "r") as open_tar: + members = open_tar.getmembers() + root_dirs = [ + member + for member in members + if member.isdir() and member.name != "." and "/" not in member.name + ] + # if there is only one root_dir (and there are files in that directory) + # strip that directory name from the destination folder + if len(root_dirs) == 1: + for mem in members: + mem.path = Path(*Path(mem.path).parts[1:]) + members_to_move = [mem for mem in members if mem.path != Path(".")] + open_tar.extractall(unpacked_path, members=members_to_move) + return unpacked_path + + elif par_value.is_file() and is_gz_file(par_value): + # Remove extension (if it exists) + extaction_file_name = Path(par_value.stem) + unpacked_path = temp_dir_path / extaction_file_name + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) + + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + return unpacked_path + + else: + return par_value + + +######################## +### Main code ### +######################## + +# rename keys and convert path strings to Path +# note: only list file arguments here. if non-file arguments also need to be renamed, +# the \\`processPar()\\` generator needs to be adapted +to_rename = { + "genome_fasta": "genomeFastaFiles", + "output": "genomeDir", + "transcriptome_gtf": "sjdbGTFfile", +} + + +def process_par(orig_par, to_rename): + for key, value in orig_par.items(): + # rename the key in par based on the \\`to_rename\\` dict + if key in to_rename.keys(): + new_key = to_rename[key] + + # also turn value into a Path + if isinstance(value, list): + new_value = [Path(val) for val in value] + else: + new_value = Path(value) + else: + new_key = key + new_value = value + yield new_key, new_value + + +par = dict(process_par(par, to_rename)) + +# create output dir if need be +par["genomeDir"].mkdir(parents=True, exist_ok=True) + +with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"]) as temp_dir: + # checking for compressed files, ungzip files if need be + temp_dir_path = Path(temp_dir) + for par_name in ["genomeFastaFiles", "sjdbGTFfile"]: + par_values = par[par_name] + if par_values: + # turn value into list + is_multiple = isinstance(par_values, list) + if not is_multiple: + par_values = [par_values] + + # output list + new_values = [] + for par_value in par_values: + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) + new_value = extract_if_need_be(par_value, temp_dir_path) + new_values.append(new_value) + + # unlist if need be + if not is_multiple: + new_values = new_values[0] + + # replace value + par[par_name] = new_values + # end ungzipping + print("", flush=True) + + print(">> Constructing command", flush=True) + par["runMode"] = "genomeGenerate" + par["outTmpDir"] = temp_dir_path / "run" + if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + + cmd_args = ["STAR"] + for name, value in par.items(): + if value is not None: + if isinstance(value, list): + cmd_args.extend(["--" + name] + [str(x) for x in value]) + else: + cmd_args.extend(["--" + name, str(value)]) + print("", flush=True) + + print(">> Running STAR with command:", flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) + print("", flush=True) + + subprocess.run(cmd_args, check=True) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/build_star_reference", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/build_star_reference/nextflow.config b/target/nextflow/reference/build_star_reference/nextflow.config new file mode 100644 index 00000000..0fa4871f --- /dev/null +++ b/target/nextflow/reference/build_star_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/build_star_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Create a reference for STAR from a set of fasta files.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/build_star_reference/nextflow_labels.config b/target/nextflow/reference/build_star_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/build_star_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/build_star_reference/nextflow_schema.json b/target/nextflow/reference/build_star_reference/nextflow_schema.json new file mode 100644 index 00000000..f75c88bc --- /dev/null +++ b/target/nextflow/reference/build_star_reference/nextflow_schema.json @@ -0,0 +1,74 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "build_star_reference", + "description": "Create a reference for STAR from a set of fasta files.", + "type": "object", + "$defs": { + "input/output": { + "title": "Input/Output", + "type": "object", + "description": "No description", + "properties": { + "genome_fasta": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The fasta files to be included in the reference", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"chr1.fasta\";\"chr2.fasta\"]`. " + }, + "transcriptome_gtf": { + "type": "string", + "format": "path", + "description": "Specifies the path to the file with annotated transcripts in the standard GTF\nformat", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/foo\"`. ", + "default": "$id.$key.output" + } + } + }, + "genome indexing arguments": { + "title": "Genome indexing arguments", + "type": "object", + "description": "No description", + "properties": { + "genomeSAindexNbases": { + "type": "integer", + "description": "Length (bases) of the SA pre-indexing string", + "help_text": "Type: `integer`, multiple: `False`, default: `14`. ", + "default": 14 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input/output" + }, + { + "$ref": "#/$defs/genome indexing arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/reference/cellranger_mkgtf/.config.vsh.yaml b/target/nextflow/reference/cellranger_mkgtf/.config.vsh.yaml new file mode 100644 index 00000000..d40ab8cc --- /dev/null +++ b/target/nextflow/reference/cellranger_mkgtf/.config.vsh.yaml @@ -0,0 +1,231 @@ +name: "cellranger_mkgtf" +namespace: "reference" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_gtf" + description: "Reference GTF annotation." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_gtf" + description: "Output GTF file." + info: null + example: + - "output.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--attribute" + description: "Key-value pair in attributes field to be kept in the GTF file of\ + \ the format attribute:attribute_value." + info: null + example: + - "gene_type:transcribed_unprocessed_pseudogene" + - "gene_type:miRNA" + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Make a GTF file - filter by a specific attribute." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ghcr.io/data-intuitive/cellranger:9.0" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + run: + - "DEBIAN_FRONTEND=noninteractive apt update && \\\napt upgrade -y && apt install\ + \ -y pigz procps && rm -rf /var/lib/apt/lists/*\n" + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/cellranger_mkgtf/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/cellranger_mkgtf" + executable: "target/nextflow/reference/cellranger_mkgtf/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/cellranger_mkgtf/main.nf b/target/nextflow/reference/cellranger_mkgtf/main.nf new file mode 100644 index 00000000..326d7f78 --- /dev/null +++ b/target/nextflow/reference/cellranger_mkgtf/main.nf @@ -0,0 +1,3900 @@ +// cellranger_mkgtf main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_mkgtf", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_gtf", + "description" : "Reference GTF annotation.", + "example" : [ + "transcriptome_annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_gtf", + "description" : "Output GTF file.", + "example" : [ + "output.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--attribute", + "description" : "Key-value pair in attributes field to be kept in the GTF file of the format attribute:attribute_value.", + "example" : [ + "gene_type:transcribed_unprocessed_pseudogene", + "gene_type:miRNA" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Make a GTF file - filter by a specific attribute.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ghcr.io/data-intuitive/cellranger:9.0", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "run" : [ + "DEBIAN_FRONTEND=noninteractive apt update && \\\\\napt upgrade -y && apt install -y pigz procps && rm -rf /var/lib/apt/lists/*\n" + ] + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/cellranger_mkgtf/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/cellranger_mkgtf", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT_GTF+x} ]; then echo "${VIASH_PAR_INPUT_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input_gtf='&'#" ; else echo "# par_input_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then echo "${VIASH_PAR_OUTPUT_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output_gtf='&'#" ; else echo "# par_output_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_ATTRIBUTE+x} ]; then echo "${VIASH_PAR_ATTRIBUTE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_attribute='&'#" ; else echo "# par_attribute="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +echo $VIASH_TEMP +mkdir -p "$VIASH_TEMP" +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_input_gtf=\\`realpath \\$par_input_gtf\\` +par_output_gtf=\\`realpath \\$par_output_gtf\\` + +echo "> Unzipping input files" +unpigz -c "\\$par_input_gtf" > "\\$tmpdir/input_gtf.gtf" + +echo "\\${par_attribute}" + +echo "> Building gtf" +cd "\\$tmpdir" +# Start the cellranger mkgtf command +IFS=';' read -r -a attributes <<< "\\$par_attribute" +cmd="cellranger mkgtf \\\\"\\$tmpdir/input_gtf.gtf\\\\" \\\\"\\$tmpdir/output.gtf\\\\"" +# Append each key-value pair as a separate --attribute argument +for attribute in "\\${attributes[@]}"; do + cmd+=" --attribute=\\$attribute" +done +# Execute the command +eval \\$cmd + +echo "> Creating archive" +pigz -k "\\$tmpdir/output.gtf" +mv "\\$tmpdir/output.gtf.gz" "\\$par_output_gtf" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/cellranger_mkgtf", + "tag" : "main" + }, + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/cellranger_mkgtf/nextflow.config b/target/nextflow/reference/cellranger_mkgtf/nextflow.config new file mode 100644 index 00000000..2e2abad9 --- /dev/null +++ b/target/nextflow/reference/cellranger_mkgtf/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/cellranger_mkgtf' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Make a GTF file - filter by a specific attribute.' + author = 'Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/cellranger_mkgtf/nextflow_labels.config b/target/nextflow/reference/cellranger_mkgtf/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/cellranger_mkgtf/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/cellranger_mkgtf/nextflow_schema.json b/target/nextflow/reference/cellranger_mkgtf/nextflow_schema.json new file mode 100644 index 00000000..e73e78e3 --- /dev/null +++ b/target/nextflow/reference/cellranger_mkgtf/nextflow_schema.json @@ -0,0 +1,57 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_mkgtf", + "description": "Make a GTF file - filter by a specific attribute.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input_gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference GTF annotation.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"transcriptome_annotation.gtf.gz\"`. " + }, + "output_gtf": { + "type": "string", + "format": "path", + "description": "Output GTF file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_gtf.gz\"`, direction: `output`, example: `\"output.gtf.gz\"`. ", + "default": "$id.$key.output_gtf.gz" + }, + "attribute": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Key-value pair in attributes field to be kept in the GTF file of the format attribute:attribute_value.", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"gene_type:transcribed_unprocessed_pseudogene\";\"gene_type:miRNA\"]`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/reference/make_reference/.config.vsh.yaml b/target/nextflow/reference/make_reference/.config.vsh.yaml new file mode 100644 index 00000000..69e75288 --- /dev/null +++ b/target/nextflow/reference/make_reference/.config.vsh.yaml @@ -0,0 +1,280 @@ +name: "make_reference" +namespace: "reference" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta. Example: " + info: null + example: + - "genome_fasta.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + description: "Reference transcriptome annotation." + info: null + example: + - "transcriptome.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--ercc" + description: "ERCC sequence and annotation file." + info: null + example: + - "ercc.zip" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--subset_regex" + description: "Will subset the reference chromosomes using the given regex." + info: null + example: + - "(ERCC-00002|chr1)" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_fasta" + description: "Output genome sequence fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_gtf" + description: "Output transcriptome annotation gtf." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Preprocess and build a transcriptome reference.\n\nExample input files\ + \ are:\n - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz\n\ + \ - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz\n\ + \ - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "ubuntu:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "pigz" + - "seqkit" + - "curl" + - "wget" + - "unzip" + - "file" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/reference/make_reference/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/reference/make_reference" + executable: "target/nextflow/reference/make_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/reference/make_reference/main.nf b/target/nextflow/reference/make_reference/main.nf new file mode 100644 index 00000000..904cf865 --- /dev/null +++ b/target/nextflow/reference/make_reference/main.nf @@ -0,0 +1,3989 @@ +// make_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "make_reference", + "namespace" : "reference", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--genome_fasta", + "description" : "Reference genome fasta. Example: ", + "example" : [ + "genome_fasta.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--transcriptome_gtf", + "description" : "Reference transcriptome annotation.", + "example" : [ + "transcriptome.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--ercc", + "description" : "ERCC sequence and annotation file.", + "example" : [ + "ercc.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--subset_regex", + "description" : "Will subset the reference chromosomes using the given regex.", + "example" : [ + "(ERCC-00002|chr1)" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_fasta", + "description" : "Output genome sequence fasta.", + "example" : [ + "genome_sequence.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_gtf", + "description" : "Output transcriptome annotation gtf.", + "example" : [ + "transcriptome_annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Preprocess and build a transcriptome reference.\n\nExample input files are:\n - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz\n - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz\n - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "ubuntu:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "pigz", + "seqkit", + "curl", + "wget", + "unzip", + "file" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/reference/make_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/reference/make_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_GENOME_FASTA+x} ]; then echo "${VIASH_PAR_GENOME_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genome_fasta='&'#" ; else echo "# par_genome_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME_GTF+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_transcriptome_gtf='&'#" ; else echo "# par_transcriptome_gtf="; fi ) +$( if [ ! -z ${VIASH_PAR_ERCC+x} ]; then echo "${VIASH_PAR_ERCC}" | sed "s#'#'\\"'\\"'#g;s#.*#par_ercc='&'#" ; else echo "# par_ercc="; fi ) +$( if [ ! -z ${VIASH_PAR_SUBSET_REGEX+x} ]; then echo "${VIASH_PAR_SUBSET_REGEX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_subset_regex='&'#" ; else echo "# par_subset_regex="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_FASTA+x} ]; then echo "${VIASH_PAR_OUTPUT_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output_fasta='&'#" ; else echo "# par_output_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_GTF+x} ]; then echo "${VIASH_PAR_OUTPUT_GTF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output_gtf='&'#" ; else echo "# par_output_gtf="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +# create temporary directory +tmpdir=\\$(mktemp -d "$VIASH_TEMP/\\$meta_name-XXXXXXXX") +function clean_up { + rm -rf "\\$tmpdir" +} +trap clean_up EXIT + +echo "> Getting path of fasta file" +par_genome_fasta=\\$(realpath \\$par_genome_fasta) +echo "> Getting path of annotation file" +par_transcriptome_gtf=\\$(realpath \\$par_transcriptome_gtf) + +echo "> Processing genome sequence" +genome_fasta="\\$tmpdir/genome_sequence.fa" +# if genome is gzipped, extract. otherwise not +if file --mime-type "\\$par_genome_fasta" | grep -q gzip\\$; then + zcat "\\$par_genome_fasta" > "\\$genome_fasta" +else + cp "\\$par_genome_fasta" "\\$genome_fasta" +fi + +echo "> Processing transcriptome annotation" +transcriptome_gtf="\\$tmpdir/transcriptome_annotation.gtf" +# if transcriptome is gzipped, extract. otherwise not +if file --mime-type "\\$par_transcriptome_gtf" | grep -q gzip\\$; then + zcat "\\$par_transcriptome_gtf" > "\\$transcriptome_gtf" +else + cp "\\$par_transcriptome_gtf" "\\$transcriptome_gtf" +fi + +if [[ ! -z \\$par_ercc ]]; then + echo "> Processing ERCC sequences" + # wget "\\$par_ercc" -O "\\$tmpdir/ercc.zip" + # unzip "\\$tmpdir/ercc.zip" -d "\\$tmpdir" + unzip "\\$par_ercc" -d "\\$tmpdir" + cat "\\$tmpdir/ERCC92.fa" >> "\\$genome_fasta" + cat "\\$tmpdir/ERCC92.gtf" >> "\\$transcriptome_gtf" +fi + +# create output & filter reference if so desired +if [[ ! -z \\$par_subset_regex ]]; then + echo "> Subsetting reference with regex '\\$par_subset_regex'" + awk '{print \\$1}' "\\$genome_fasta" | seqkit grep -r -p "^\\$par_subset_regex\\\\\\$" > "\\$tmpdir/genome_sequence_filtered.fa" + genome_fasta="\\$tmpdir/genome_sequence_filtered.fa" + grep -E "^\\$par_subset_regex[^A-Za-z0-9]" "\\$transcriptome_gtf" > "\\$tmpdir/transcriptome_annotation_filtered.gtf" + transcriptome_gtf="\\$tmpdir/transcriptome_annotation_filtered.gtf" + + echo + echo "Matched tags:" + cat "\\$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq + echo +fi + +echo "> Gzipping outputs" +pigz -c "\\$genome_fasta" > "\\$par_output_fasta" +pigz -c "\\$transcriptome_gtf" > "\\$par_output_gtf" + +# to do: re enable +# echo "> Sanity check of outputs" +# readarray -t fasta_tags < <( cat "\\$genome_fasta" | grep '^>' | sed 's#^>##' | sed 's# .*##' | sort | uniq ) +# readarray -t transcriptome_tags < <( cat "\\$transcriptome_gtf" | cut -d\\$'\\\\t' -f1 | sort | uniq | grep '^[^#]' ) +# [ "\\${fasta_tags[*]}" == "\\${transcriptome_tags[*]}" ] || { echo "Warning: fasta tags differ from transcriptome tags"; exit 1; } +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/reference/make_reference", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/reference/make_reference/nextflow.config b/target/nextflow/reference/make_reference/nextflow.config new file mode 100644 index 00000000..69718774 --- /dev/null +++ b/target/nextflow/reference/make_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'reference/make_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Preprocess and build a transcriptome reference.\n\nExample input files are:\n - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz\n - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz\n - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip\n' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/reference/make_reference/nextflow_labels.config b/target/nextflow/reference/make_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/reference/make_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/reference/make_reference/nextflow_schema.json b/target/nextflow/reference/make_reference/nextflow_schema.json new file mode 100644 index 00000000..a8d86374 --- /dev/null +++ b/target/nextflow/reference/make_reference/nextflow_schema.json @@ -0,0 +1,74 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "make_reference", + "description": "Preprocess and build a transcriptome reference.\n\nExample input files are:\n - `genome_fasta`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz\n - `transcriptome_gtf`: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz\n - `ercc`: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "genome_fasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference genome fasta", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"genome_fasta.fa.gz\"`. " + }, + "transcriptome_gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference transcriptome annotation.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"transcriptome.gtf.gz\"`. " + }, + "ercc": { + "type": "string", + "format": "path", + "description": "ERCC sequence and annotation file.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"ercc.zip\"`. " + }, + "subset_regex": { + "type": "string", + "description": "Will subset the reference chromosomes using the given regex.", + "help_text": "Type: `string`, multiple: `False`, example: `\"(ERCC-00002|chr1)\"`. " + }, + "output_fasta": { + "type": "string", + "format": "path", + "description": "Output genome sequence fasta.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_fasta.gz\"`, direction: `output`, example: `\"genome_sequence.fa.gz\"`. ", + "default": "$id.$key.output_fasta.gz" + }, + "output_gtf": { + "type": "string", + "format": "path", + "description": "Output transcriptome annotation gtf.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_gtf.gz\"`, direction: `output`, example: `\"transcriptome_annotation.gtf.gz\"`. ", + "default": "$id.$key.output_gtf.gz" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/report/mermaid/.config.vsh.yaml b/target/nextflow/report/mermaid/.config.vsh.yaml new file mode 100644 index 00000000..e13f3256 --- /dev/null +++ b/target/nextflow/report/mermaid/.config.vsh.yaml @@ -0,0 +1,252 @@ +name: "mermaid" +namespace: "report" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input directory" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Generated network as output." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_format" + description: "Output format for the generated image. By default will be inferred\ + \ from the extension \nof the file specified with --output.\n" + info: null + required: false + choices: + - "svg" + - "png" + - "pdf" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--width" + description: "Width of the page" + info: null + default: + - 800 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--height" + description: "Height of the page" + info: null + default: + - 600 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--background_color" + description: "Background color for pngs/svgs (not pdfs)" + info: null + example: + - "#F0F0F0" + default: + - "white" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "puppeteer-config.json" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generates a network from mermaid code.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "node:20-bullseye" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "javascript" + npm: + - "@mermaid-js/mermaid-cli" + - type: "apt" + packages: + - "chromium" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/report/mermaid/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/report/mermaid" + executable: "target/nextflow/report/mermaid/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/report/mermaid/main.nf b/target/nextflow/report/mermaid/main.nf new file mode 100644 index 00000000..0e7a160c --- /dev/null +++ b/target/nextflow/report/mermaid/main.nf @@ -0,0 +1,3893 @@ +// mermaid main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "mermaid", + "namespace" : "report", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input directory", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Generated network as output.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_format", + "description" : "Output format for the generated image. By default will be inferred from the extension \nof the file specified with --output.\n", + "required" : false, + "choices" : [ + "svg", + "png", + "pdf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--width", + "description" : "Width of the page", + "default" : [ + 800 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--height", + "description" : "Height of the page", + "default" : [ + 600 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--background_color", + "description" : "Background color for pngs/svgs (not pdfs)", + "example" : [ + "#F0F0F0" + ], + "default" : [ + "white" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "./puppeteer-config.json" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Generates a network from mermaid code.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "node:20-bullseye", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "javascript", + "npm" : [ + "@mermaid-js/mermaid-cli" + ] + }, + { + "type" : "apt", + "packages" : [ + "chromium" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/report/mermaid/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/report/mermaid", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT_FORMAT+x} ]; then echo "${VIASH_PAR_OUTPUT_FORMAT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output_format='&'#" ; else echo "# par_output_format="; fi ) +$( if [ ! -z ${VIASH_PAR_WIDTH+x} ]; then echo "${VIASH_PAR_WIDTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_width='&'#" ; else echo "# par_width="; fi ) +$( if [ ! -z ${VIASH_PAR_HEIGHT+x} ]; then echo "${VIASH_PAR_HEIGHT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_height='&'#" ; else echo "# par_height="; fi ) +$( if [ ! -z ${VIASH_PAR_BACKGROUND_COLOR+x} ]; then echo "${VIASH_PAR_BACKGROUND_COLOR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_background_color='&'#" ; else echo "# par_background_color="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +mmdc -p "\\$meta_resources_dir/puppeteer-config.json" \\\\ + -i "\\$par_input" \\\\ + -o "\\$par_output" \\\\ + --width "\\$par_width" \\\\ + --height "\\$par_height" \\\\ + \\${par_background_color:+--backgroundColor \\$par_background_color} \\\\ + \\${output_format:+--outputFormat \\$par_output_format} +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/report/mermaid", + "tag" : "main" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/report/mermaid/nextflow.config b/target/nextflow/report/mermaid/nextflow.config new file mode 100644 index 00000000..873d337a --- /dev/null +++ b/target/nextflow/report/mermaid/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'report/mermaid' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Generates a network from mermaid code.\n' + author = 'Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/report/mermaid/nextflow_labels.config b/target/nextflow/report/mermaid/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/report/mermaid/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/report/mermaid/nextflow_schema.json b/target/nextflow/report/mermaid/nextflow_schema.json new file mode 100644 index 00000000..cd385e05 --- /dev/null +++ b/target/nextflow/report/mermaid/nextflow_schema.json @@ -0,0 +1,77 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "mermaid", + "description": "Generates a network from mermaid code.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input directory", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Generated network as output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_format": { + "type": "string", + "description": "Output format for the generated image", + "help_text": "Type: `string`, multiple: `False`, choices: ``svg`, `png`, `pdf``. ", + "enum": [ + "svg", + "png", + "pdf" + ] + }, + "width": { + "type": "integer", + "description": "Width of the page", + "help_text": "Type: `integer`, multiple: `False`, default: `800`. ", + "default": 800 + }, + "height": { + "type": "integer", + "description": "Height of the page", + "help_text": "Type: `integer`, multiple: `False`, default: `600`. ", + "default": 600 + }, + "background_color": { + "type": "string", + "description": "Background color for pngs/svgs (not pdfs)", + "help_text": "Type: `string`, multiple: `False`, default: `\"white\"`, example: `\"#F0F0F0\"`. ", + "default": "white" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/report/mermaid/puppeteer-config.json b/target/nextflow/report/mermaid/puppeteer-config.json new file mode 100644 index 00000000..7b2851c2 --- /dev/null +++ b/target/nextflow/report/mermaid/puppeteer-config.json @@ -0,0 +1,6 @@ +{ + "executablePath": "/usr/bin/chromium", + "args": [ + "--no-sandbox" + ] +} \ No newline at end of file diff --git a/target/nextflow/scgpt/binning/.config.vsh.yaml b/target/nextflow/scgpt/binning/.config.vsh.yaml new file mode 100644 index 00000000..5b397107 --- /dev/null +++ b/target/nextflow/scgpt/binning/.config.vsh.yaml @@ -0,0 +1,326 @@ +name: "binning" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Mudata layer (key from .layers) to use as input data for binning.\ + \ If not specified, .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "The name of the adata.var column containing boolean mask for vocabulary-cross\ + \ checked and/or highly variable genes.\n" + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_input_bins" + description: "The number of bins to discretize the data into. When no value is\ + \ provided, data won't be binned.\n" + info: null + default: + - 51 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output h5mu file containing the binned data. \n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_binned_counts" + description: "The name of the adata layer to write the binned data to.\n" + info: null + default: + - "binned_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Conversion of (pre-processed) expression count data into relative values\ + \ (bins) to address scale differences across sequencing batches.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_preprocessed.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midcpu" + - "midmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.11-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/binning/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/scgpt/binning" + executable: "target/nextflow/scgpt/binning/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/scgpt/binning/main.nf b/target/nextflow/scgpt/binning/main.nf new file mode 100644 index 00000000..3d86ae5c --- /dev/null +++ b/target/nextflow/scgpt/binning/main.nf @@ -0,0 +1,4119 @@ +// binning main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Elizabeth Mlynarski (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "binning", + "namespace" : "scgpt", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file.\n", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes.\n", + "default" : [ + "id_in_vocab" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_input_bins", + "description" : "The number of bins to discretize the data into. When no value is provided, data won't be binned.\n", + "default" : [ + 51 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The output h5mu file containing the binned data. \n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_binned_counts", + "description" : "The name of the adata layer to write the binned data to.\n", + "default" : [ + "binned_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "Seed for random number generation.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midcpu", + "midmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.11-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/scgpt/binning/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/scgpt/binning", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import numpy as np +from scipy.sparse import csr_matrix +import warnings + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'n_input_bins': $( if [ ! -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_INPUT_BINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obsm_binned_counts': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBSM_BINNED_COUNTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +if par["seed"]: + np.random.seed(par["seed"]) + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +logger.info("Subsetting data based on highly variable gene and/or cross-checked genes") +adata = subset_vars(adata, par["var_input"]) + +logger.info("Converting the input layer into a CSR matrix") +if not par["input_layer"] or par["input_layer"] == "X": + layer_data = adata.X +else: + layer_data = adata.layers[par["input_layer"]] +layer_data = csr_matrix(layer_data) + +if layer_data.min() < 0: + raise ValueError( + f"Assuming non-negative data, but got min value {layer_data.min()}." + ) + +n_bins = par["n_input_bins"] # NOTE: the first bin is always a spectial for zero +logger.info(f"Binning data into {par['n_input_bins']} bins.") + + +def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: + assert x.ndim == 1 and bins.ndim == 1 + + left_digits = np.digitize(x, bins) + right_difits = np.digitize(x, bins, right=True) + + rands = np.random.rand(len(x)) # uniform random numbers + + digits = rands * (right_difits - left_digits) + left_digits + digits = np.ceil(digits) + smallest_dtype = np.min_scalar_type( + digits.max().astype(np.uint) + ) # Already checked for non-negative values + digits = digits.astype(smallest_dtype) + + return digits + + +with warnings.catch_warnings(): + # Make sure warnings are displayed once. + warnings.simplefilter("once") + # layer_data.indptr.size is the number of rows in the sparse matrix + binned_rows = [] + bin_edges = [] + logger.info( + "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" + ) + for row_number in range(layer_data.indptr.size - 1): + row_start_index, row_end_index = ( + layer_data.indptr[row_number], + layer_data.indptr[row_number + 1], + ) + # These are all non-zero counts in the row + non_zero_row = layer_data.data[row_start_index:row_end_index] + if len(non_zero_row) == 0: + logger.warning( + "The input data contains all zero rows. Please make sure " + "this is expected. You can use the \\`filter_cell_by_counts\\` " + "arg to filter out all zero rows." + ) + + # Add binned_rows and bin_edges as all 0 + # np.stack will upcast the dtype later + binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8)) + bin_edges.append(np.array([0] * n_bins)) + continue + + # Binning of non-zero values + bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1)) + non_zero_digits = _digitize(non_zero_row, bins) + assert non_zero_digits.min() >= 1 + assert non_zero_digits.max() <= n_bins - 1 + binned_rows.append(non_zero_digits) + + bin_edges.append(np.concatenate([[0], bins])) + +# Create new CSR matrix +logger.info("Creating a new CSR matrix of the binned count values") +binned_counts = csr_matrix( + ( + np.concatenate(binned_rows, casting="same_kind"), + layer_data.indices, + layer_data.indptr, + ), + shape=layer_data.shape, +) + +# Set binned values and bin edges layers to adata object +input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts +input_adata.obsm["bin_edges"] = np.stack(bin_edges) + +# Write mudata output +logger.info("Writing output data") +mdata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/scgpt/binning", + "tag" : "main" + }, + "label" : [ + "midcpu", + "midmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/scgpt/binning/nextflow.config b/target/nextflow/scgpt/binning/nextflow.config new file mode 100644 index 00000000..1a504e19 --- /dev/null +++ b/target/nextflow/scgpt/binning/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'scgpt/binning' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches.\n' + author = 'Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/scgpt/binning/nextflow_labels.config b/target/nextflow/scgpt/binning/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/scgpt/binning/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/scgpt/binning/nextflow_schema.json b/target/nextflow/scgpt/binning/nextflow_schema.json new file mode 100644 index 00000000..1eb47ae4 --- /dev/null +++ b/target/nextflow/scgpt/binning/nextflow_schema.json @@ -0,0 +1,102 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "binning", + "description": "Conversion of (pre-processed) expression count data into relative values (bins) to address scale differences across sequencing batches.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Mudata layer (key from .layers) to use as input data for binning", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": "The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"id_in_vocab\"`. ", + "default": "id_in_vocab" + }, + "n_input_bins": { + "type": "integer", + "description": "The number of bins to discretize the data into", + "help_text": "Type: `integer`, multiple: `False`, default: `51`. ", + "default": 51 + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The output h5mu file containing the binned data", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obsm_binned_counts": { + "type": "string", + "description": "The name of the adata layer to write the binned data to.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"binned_counts\"`. ", + "default": "binned_counts" + }, + "seed": { + "type": "integer", + "description": "Seed for random number generation.\n", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/scgpt/binning/setup_logger.py b/target/nextflow/scgpt/binning/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/scgpt/binning/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/scgpt/binning/subset_vars.py b/target/nextflow/scgpt/binning/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/scgpt/binning/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/scgpt/cell_type_annotation/.config.vsh.yaml b/target/nextflow/scgpt/cell_type_annotation/.config.vsh.yaml new file mode 100644 index 00000000..3c7cb049 --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/.config.vsh.yaml @@ -0,0 +1,458 @@ +name: "cell_type_annotation" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +argument_groups: +- name: "Model input" + arguments: + - type: "file" + name: "--model" + description: "The model file containing checkpoints and cell type label mapper.\n" + info: null + example: + - "best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "The model configuration file. \n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Model vocabulary file directory.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pretrained checkpoints.\n" + info: null + default: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--label_mapper_key" + description: "Key in the model file containing the cell type class to label mapper\ + \ dictionary.\n" + info: null + default: + - "id_to_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Query input" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing of data that have been pre-processed\ + \ (normalized, binned, genes cross-checked and tokenized).\n" + info: null + example: + - "scgpt_preprocess_ouput.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_label" + description: "The name of the adata.obs column containing the batch labels. Required\ + \ if dsbn is set to true.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output mudata file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "The name of the adata.obs column to write predicted cell type labels\ + \ to.\n" + info: null + default: + - "scgpt_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "The name of the adata.obs column to write the probabilities of the\ + \ predicted cell type labels to.\n" + info: null + default: + - "scgpt_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The padding token used in the model.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_input_bins" + description: "The number of input bins.\n" + info: null + default: + - 51 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size. \n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--dsbn" + description: "Whether to use domain-specific batch normalization.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation. If not specified, no seed is\ + \ used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Annotate gene expression data with cell type classes through the scGPT\ + \ model.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_tokenized.h5mu" +- type: "file" + path: "args.json" +- type: "file" + path: "vocab.json" +- type: "file" + path: "best_model.pt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + packages: + - "scanpy~=1.10.4" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/cell_type_annotation/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/scgpt/cell_type_annotation" + executable: "target/nextflow/scgpt/cell_type_annotation/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/scgpt/cell_type_annotation/main.nf b/target/nextflow/scgpt/cell_type_annotation/main.nf new file mode 100644 index 00000000..2561ef92 --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/main.nf @@ -0,0 +1,4402 @@ +// cell_type_annotation main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Jakub Majercik (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cell_type_annotation", + "namespace" : "scgpt", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Model input", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "The model file containing checkpoints and cell type label mapper.\n", + "example" : [ + "best_model.pt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_config", + "description" : "The model configuration file. \n", + "example" : [ + "args.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_vocab", + "description" : "Model vocabulary file directory.\n", + "example" : [ + "vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--finetuned_checkpoints_key", + "description" : "Key in the model file containing the pretrained checkpoints.\n", + "default" : [ + "model_state_dict" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--label_mapper_key", + "description" : "Key in the model file containing the cell type class to label mapper dictionary.\n", + "default" : [ + "id_to_class" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Query input", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input h5mu file containing of data that have been pre-processed (normalized, binned, genes cross-checked and tokenized).\n", + "example" : [ + "scgpt_preprocess_ouput.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch_label", + "description" : "The name of the adata.obs column containing the batch labels. Required if dsbn is set to true.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_gene_tokens", + "description" : "The key of the .obsm array containing the gene token ids\n", + "default" : [ + "gene_id_tokens" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_tokenized_values", + "description" : "The key of the .obsm array containing the count values of the tokenized genes\n", + "default" : [ + "values_tokenized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The output mudata file.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "The name of the adata.obs column to write predicted cell type labels to.\n", + "default" : [ + "scgpt_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "The name of the adata.obs column to write the probabilities of the predicted cell type labels to.\n", + "default" : [ + "scgpt_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "The padding token used in the model.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--pad_value", + "description" : "The value of the padding.\n", + "default" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_input_bins", + "description" : "The number of input bins.\n", + "default" : [ + 51 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size. \n", + "default" : [ + 64 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--dsbn", + "description" : "Whether to use domain-specific batch normalization.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "Seed for random number generation. If not specified, no seed is used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Annotate gene expression data with cell type classes through the scGPT model.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/source/args.json" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/source/vocab.json" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/finetuned_model/best_model.pt" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:23.09-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scgpt==0.2.1" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scanpy~=1.10.4" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/scgpt/cell_type_annotation/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/scgpt/cell_type_annotation", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import json +from multiprocessing import freeze_support +import os +import mudata as mu +from typing import Dict +import warnings +import torch +import numpy as np +from torch.nn import functional +from torch.utils.data import Dataset, DataLoader +from scgpt.model import TransformerModel +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.utils import set_seed +from tqdm import tqdm + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_config': $( if [ ! -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then echo "r'${VIASH_PAR_MODEL_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'finetuned_checkpoints_key': $( if [ ! -z ${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY+x} ]; then echo "r'${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'label_mapper_key': $( if [ ! -z ${VIASH_PAR_LABEL_MAPPER_KEY+x} ]; then echo "r'${VIASH_PAR_LABEL_MAPPER_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch_label': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_predictions': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PREDICTIONS+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PREDICTIONS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_obs_probability': $( if [ ! -z ${VIASH_PAR_OUTPUT_OBS_PROBABILITY+x} ]; then echo "r'${VIASH_PAR_OUTPUT_OBS_PROBABILITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_input_bins': $( if [ ! -z ${VIASH_PAR_N_INPUT_BINS+x} ]; then echo "int(r'${VIASH_PAR_N_INPUT_BINS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'dsbn': $( if [ ! -z ${VIASH_PAR_DSBN+x} ]; then echo "r'${VIASH_PAR_DSBN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'seed': $( if [ ! -z ${VIASH_PAR_SEED+x} ]; then echo "int(r'${VIASH_PAR_SEED//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +class SeqDataset(Dataset): + def __init__(self, data: Dict[str, torch.Tensor]): + self.data = data + + def __len__(self): + return self.data["gene_ids"].shape[0] + + def __getitem__(self, idx): + return {k: v[idx] for k, v in self.data.items()} + + +def main(): + # Setting seed + if par["seed"]: + set_seed(par["seed"]) + + # Setting device + logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Read in data + logger.info("Reading in data") + mdata = mu.read(par["input"]) + input_adata = mdata.mod[par["modality"]] + adata = input_adata.copy() + + # Fetch batch ids for domain-specific batch normalization + if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." + ) + elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) + elif not par["dsbn"]: + # forward pass requires a tensor as input + batch_ids = np.zeros(adata.shape[0]) + + # Vocabulary configuration + logger.info("Loading model vocabulary") + special_tokens = [par["pad_token"], "", ""] + logger.info(f"Loading model vocab from {par['model_vocab']}") + vocab_file = par["model_vocab"] + vocab = GeneVocab.from_file(vocab_file) + [vocab.append_token(s) for s in special_tokens if s not in vocab] + vocab.set_default_index(vocab[par["pad_token"]]) + ntokens = len(vocab) + + # Model configuration + logger.info("Loading model and configurations") + model_config_file = par["model_config"] + with open(model_config_file, "r") as f: + model_configs = json.load(f) + embsize = model_configs["embsize"] + nhead = model_configs["nheads"] + d_hid = model_configs["d_hid"] + nlayers = model_configs["nlayers"] + + # Ensure the provided model has the correct architecture + logger.info("Loading model") + model_file = par["model"] + model_dict = torch.load(model_file, map_location=device) + for k, v in { + "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], + "--label_mapper_key": par["label_mapper_key"], + }.items(): + if v not in model_dict.keys(): + raise KeyError( + f"The key '{v}' provided for '{k}' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) + pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] + + # Label mapper configuration + logger.info("Loading label mapper") + label_mapper = model_dict[par["label_mapper_key"]] + cell_type_mapper = {int(k): v for k, v in label_mapper.items()} + n_cls = len(cell_type_mapper) + + # Model instatiation + logger.info("Instantiating model") + model = TransformerModel( + ntokens, + d_model=embsize, # self.encoder (GenEncoder), self.value_encoder (ContinuousValueEncoder), self.transformerencoder(TransformerEncoderLayer) + nhead=nhead, # self.transformer_encoder(TransformerEncoderLayer) + d_hid=d_hid, # self.transformer_encoder(TransformerEncoderLayer) + nlayers=nlayers, # self.transformer_encoder(TransformerEncoderLayer), self.cls_decoder + nlayers_cls=3, # self.cls_decoder + n_cls=n_cls, # self.cls_decoder + vocab=vocab, + dropout=0.2, # self.transformer_encoder + pad_token=par["pad_token"], + pad_value=par["pad_value"], + do_mvc=False, + do_dab=False, + use_batch_labels=par["dsbn"], + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", + n_input_bins=par["n_input_bins"], + cell_emb_style="cls", # required for cell-type annotation + use_fast_transformer=False, # TODO: parametrize when GPU is available + fast_transformer_backend="flash", # TODO: parametrize when GPU is available + pre_norm=False, # TODO: parametrize when GPU is available + ) + + # Load model params + logger.info(f"Loading model params from {model_file}") + try: + model.load_state_dict(pretrained_dict) + except RuntimeError: + logger.info("only load params that are in the model and match the size") + model_dict = model.state_dict() + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if k in model_dict and v.shape == model_dict[k].shape + } + for k, v in pretrained_dict.items(): + logger.info(f"Loading params {k} with shape {v.shape}") + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + + model.to(device) + + # Load tokenized gene data + logger.info("Loading data for inference") + for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + }.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + + input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] + input_values = adata.obsm[par["obsm_tokenized_values"]] + + data_pt = { + "gene_ids": input_gene_ids, + "values": input_values, + "batch_labels": torch.from_numpy(batch_ids).long(), + } + + data_loader = DataLoader( + dataset=SeqDataset(data_pt), + batch_size=par["batch_size"], + num_workers=min(os.cpu_count(), par["batch_size"] // 2), + pin_memory=True, + ) + + # Inference + logger.info("Predicting cell type classes") + model.eval() + predictions = [] + probabilities = [] + with torch.no_grad(): + for batch_data in tqdm(data_loader): + input_gene_ids = batch_data["gene_ids"].to(device) + input_values = batch_data["values"].to(device) + batch_labels = batch_data["batch_labels"].to(device) + + src_key_padding_mask = input_gene_ids.eq(vocab[par["pad_token"]]) + with torch.cuda.amp.autocast(enabled=False): + output_dict = model( + input_gene_ids, + input_values, + src_key_padding_mask=src_key_padding_mask, + batch_labels=batch_labels if par["dsbn"] else None, + CLS=True, # Return celltype classification objective output + CCE=False, + MVC=False, + ECS=False, + ) + output_values = output_dict["cls_output"] + + preds = output_values.argmax(1).cpu().numpy() + predictions.append(preds) + + probs = functional.softmax(output_values, dim=1).max(1)[0] + probabilities.append(probs.cpu().numpy()) + + predictions = np.concatenate(predictions, axis=0) + probabilities = np.concatenate(probabilities, axis=0) + + # Assign cell type labels to predicted classes + logger.info("Assigning cell type predictions and probabilities") + adata.obs["scgpt_class_pred"] = predictions + adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( + lambda x: cell_type_mapper[x] + ) + adata.obs[par["output_obs_probability"]] = probabilities + + # Write output + logger.info("Writing output data") + mdata.mod[par["modality"]] = adata + mdata.write(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + freeze_support() + warnings.filterwarnings("ignore") + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/scgpt/cell_type_annotation", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/scgpt/cell_type_annotation/nextflow.config b/target/nextflow/scgpt/cell_type_annotation/nextflow.config new file mode 100644 index 00000000..b36372ea --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'scgpt/cell_type_annotation' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Annotate gene expression data with cell type classes through the scGPT model.\n' + author = 'Dorien Roosen, Jakub Majercik' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/scgpt/cell_type_annotation/nextflow_labels.config b/target/nextflow/scgpt/cell_type_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/scgpt/cell_type_annotation/nextflow_schema.json b/target/nextflow/scgpt/cell_type_annotation/nextflow_schema.json new file mode 100644 index 00000000..2c400c2c --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/nextflow_schema.json @@ -0,0 +1,191 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cell_type_annotation", + "description": "Annotate gene expression data with cell type classes through the scGPT model.\n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The output mudata file.\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "The name of the adata.obs column to write predicted cell type labels to.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scgpt_pred\"`. ", + "default": "scgpt_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "The name of the adata.obs column to write the probabilities of the predicted cell type labels to.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scgpt_probability\"`. ", + "default": "scgpt_probability" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "The padding token used in the model.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "pad_value": { + "type": "integer", + "description": "The value of the padding.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `-2`. ", + "default": -2 + }, + "n_input_bins": { + "type": "integer", + "description": "The number of input bins.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `51`. ", + "default": 51 + }, + "batch_size": { + "type": "integer", + "description": "The batch size", + "help_text": "Type: `integer`, multiple: `False`, default: `64`. ", + "default": 64 + }, + "dsbn": { + "type": "boolean", + "description": "Whether to use domain-specific batch normalization.\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "seed": { + "type": "integer", + "description": "Seed for random number generation", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "model input": { + "title": "Model input", + "type": "object", + "description": "No description", + "properties": { + "model": { + "type": "string", + "format": "path", + "exists": true, + "description": "The model file containing checkpoints and cell type label mapper.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"best_model.pt\"`. " + }, + "model_config": { + "type": "string", + "format": "path", + "exists": true, + "description": "The model configuration file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"args.json\"`. " + }, + "model_vocab": { + "type": "string", + "format": "path", + "exists": true, + "description": "Model vocabulary file directory.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"vocab.json\"`. " + }, + "finetuned_checkpoints_key": { + "type": "string", + "description": "Key in the model file containing the pretrained checkpoints.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"model_state_dict\"`. ", + "default": "model_state_dict" + }, + "label_mapper_key": { + "type": "string", + "description": "Key in the model file containing the cell type class to label mapper dictionary.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"id_to_class\"`. ", + "default": "id_to_class" + } + } + }, + "query input": { + "title": "Query input", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input h5mu file containing of data that have been pre-processed (normalized, binned, genes cross-checked and tokenized).\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"scgpt_preprocess_ouput.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_batch_label": { + "type": "string", + "description": "The name of the adata.obs column containing the batch labels", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obsm_gene_tokens": { + "type": "string", + "description": "The key of the .obsm array containing the gene token ids\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene_id_tokens\"`. ", + "default": "gene_id_tokens" + }, + "obsm_tokenized_values": { + "type": "string", + "description": "The key of the .obsm array containing the count values of the tokenized genes\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"values_tokenized\"`. ", + "default": "values_tokenized" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/model input" + }, + { + "$ref": "#/$defs/query input" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/scgpt/cell_type_annotation/setup_logger.py b/target/nextflow/scgpt/cell_type_annotation/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/scgpt/cell_type_annotation/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/scgpt/cross_check_genes/.config.vsh.yaml b/target/nextflow/scgpt/cross_check_genes/.config.vsh.yaml new file mode 100644 index 00000000..bd1b81c4 --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/.config.vsh.yaml @@ -0,0 +1,349 @@ +name: "cross_check_genes" +namespace: "scgpt" +version: "main" +authors: +- name: "Jakub Majercik" + roles: + - "author" + info: + role: "Contributor" + links: + email: "jakub@data-intuitive.com" + github: "jakubmajercik" + linkedin: "jakubmajercik" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Bioinformatics Engineer" +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing of pre-processed data.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The modality key of the MuData object containing the RNA AnnData\ + \ object.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vocab_file" + description: "Model vocabulary file path.\n" + info: null + example: + - "resources_test/scgpt/vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The name of the adata.var column containing gene names. By default\ + \ the .var index will be used.\n" + info: null + example: + - "gene_name" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. If provided, will\ + \ only cross-check HVG filtered genes with model vocabulary." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output cross-checked anndata file.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_filter" + description: "In which .var slot to store a boolean array corresponding to which\ + \ observations should be filtered out based on HVG and model vocabulary." + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The padding token used in the model.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cross-check genes with pre-trained scGPT model.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "Kim2020_Lung_subset_preprocessed.h5mu" +- type: "file" + path: "vocab.json" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + - type: "python" + user: false + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/cross_check_genes/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/scgpt/cross_check_genes" + executable: "target/nextflow/scgpt/cross_check_genes/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/scgpt/cross_check_genes/main.nf b/target/nextflow/scgpt/cross_check_genes/main.nf new file mode 100644 index 00000000..e1ef56e2 --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/main.nf @@ -0,0 +1,4097 @@ +// cross_check_genes main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Jakub Majercik (author) +// * Dorien Roosen (maintainer, author) +// * Elizabeth Mlynarski (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cross_check_genes", + "namespace" : "scgpt", + "version" : "main", + "authors" : [ + { + "name" : "Jakub Majercik", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "jakub@data-intuitive.com", + "github" : "jakubmajercik", + "linkedin" : "jakubmajercik" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Bioinformatics Engineer" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input h5mu file containing of pre-processed data.\n", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "The modality key of the MuData object containing the RNA AnnData object.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vocab_file", + "description" : "Model vocabulary file path.\n", + "example" : [ + "resources_test/scgpt/vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The name of the adata.var column containing gene names. By default the .var index will be used.\n", + "example" : [ + "gene_name" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : ".var column containing highly variable genes. If provided, will only cross-check HVG filtered genes with model vocabulary.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The output cross-checked anndata file.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_filter", + "description" : "In which .var slot to store a boolean array corresponding to which observations should be filtered out based on HVG and model vocabulary.", + "default" : [ + "id_in_vocab" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "The padding token used in the model.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Cross-check genes with pre-trained scGPT model.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/source/vocab.json" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:23.09-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scgpt==0.2.1" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/scgpt/cross_check_genes/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/scgpt/cross_check_genes", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +from scgpt.tokenizer.gene_tokenizer import GeneVocab + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'vocab_file': $( if [ ! -z ${VIASH_PAR_VOCAB_FILE+x} ]; then echo "r'${VIASH_PAR_VOCAB_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_var_gene_names': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_var_filter': $( if [ ! -z ${VIASH_PAR_OUTPUT_VAR_FILTER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_VAR_FILTER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +# Read in data +logger.info(f"Reading {par['input']}") +mudata = mu.read_h5mu(par["input"]) +adata = mudata.mod[par["modality"]].copy() + +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +if not par["input_var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +elif par["input_var_gene_names"] not in adata.var.columns: + raise ValueError( + f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." + ) +else: + genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() + +# Cross-check genes with pre-trained model +logger.info(f"Loading model vocab from {par['vocab_file']}") +vocab_file = par["vocab_file"] +vocab = GeneVocab.from_file(vocab_file) +[vocab.append_token(s) for s in special_tokens if s not in vocab] + +if par["var_input"]: + logger.info("Filtering genes based on model vocab and HVG") + filter_with_hvg = adata.var[par["var_input"]].tolist() + gene_filter_mask = [ + 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) + ] + logger.info( + f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" + ) +else: + logger.info("Filtering genes based on model vocab") + gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] + logger.info( + f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" + ) + +logger.info(f"Writing to {par['output']}") +adata.var[par["output_var_filter"]] = gene_filter_mask +adata.var[par["output_var_filter"]] = adata.var[par["output_var_filter"]].astype("bool") +mudata.mod[par["modality"]] = adata +mudata.write_h5mu(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/scgpt/cross_check_genes", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/scgpt/cross_check_genes/nextflow.config b/target/nextflow/scgpt/cross_check_genes/nextflow.config new file mode 100644 index 00000000..f0c1d77a --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'scgpt/cross_check_genes' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Cross-check genes with pre-trained scGPT model.\n' + author = 'Jakub Majercik, Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/scgpt/cross_check_genes/nextflow_labels.config b/target/nextflow/scgpt/cross_check_genes/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/scgpt/cross_check_genes/nextflow_schema.json b/target/nextflow/scgpt/cross_check_genes/nextflow_schema.json new file mode 100644 index 00000000..f49cc773 --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/nextflow_schema.json @@ -0,0 +1,113 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cross_check_genes", + "description": "Cross-check genes with pre-trained scGPT model.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input h5mu file containing of pre-processed data.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "The modality key of the MuData object containing the RNA AnnData object.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "vocab_file": { + "type": "string", + "format": "path", + "exists": true, + "description": "Model vocabulary file path.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"resources_test/scgpt/vocab.json\"`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The name of the adata.var column containing gene names", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_name\"`. " + }, + "var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The output cross-checked anndata file.\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_var_filter": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to which observations should be filtered out based on HVG and model vocabulary.", + "help_text": "Type: `string`, multiple: `False`, default: `\"id_in_vocab\"`. ", + "default": "id_in_vocab" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "The padding token used in the model.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/scgpt/cross_check_genes/setup_logger.py b/target/nextflow/scgpt/cross_check_genes/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/scgpt/cross_check_genes/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/scgpt/embedding/.config.vsh.yaml b/target/nextflow/scgpt/embedding/.config.vsh.yaml new file mode 100644 index 00000000..aa73c6fa --- /dev/null +++ b/target/nextflow/scgpt/embedding/.config.vsh.yaml @@ -0,0 +1,428 @@ +name: "embedding" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file containing tokenized gene and count data. \n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model" + description: "Path to scGPT model file.\n" + info: null + example: + - "best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Path to scGPT model vocabulary file.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "Path to scGPT model config file.\n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + example: + - "values.pt" + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_padding_mask" + description: "The key of the .obsm array containing the padding mask.\n" + info: null + default: + - "padding_mask" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the .var column containing gene names. When no gene_name_layer\ + \ is provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_label" + description: "The name of the adata.obs column containing the batch labels. Must\ + \ be provided when 'dsbn' is set to True.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pretrained checkpoints. Only\ + \ relevant for fine-tuned models.\n" + info: null + example: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Path to output anndata file containing pre-processed data as well\ + \ as scGPT embeddings.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_embeddings" + description: "The name of the adata.obsm array to which scGPT embeddings will\ + \ be written.\n" + info: null + default: + - "X_scGPT" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "The token to be used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--dsbn" + description: "Whether to apply domain-specific batch normalization for generating\ + \ embeddings. When set to True, 'obs_batch_labels' must be set as well.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size to be used for inference\n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Generation of cell embeddings for the integration of single cell transcriptomic\ + \ count data using scGPT.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "source" +- type: "file" + path: "finetuned_model" +- type: "file" + path: "Kim2020_Lung_subset_tokenized.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + - "gpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/embedding/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/scgpt/embedding" + executable: "target/nextflow/scgpt/embedding/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/scgpt/embedding/main.nf b/target/nextflow/scgpt/embedding/main.nf new file mode 100644 index 00000000..c8b8cb86 --- /dev/null +++ b/target/nextflow/scgpt/embedding/main.nf @@ -0,0 +1,4295 @@ +// embedding main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Elizabeth Mlynarski (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "embedding", + "namespace" : "scgpt", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input h5mu file containing tokenized gene and count data. \n", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model", + "description" : "Path to scGPT model file.\n", + "example" : [ + "best_model.pt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_vocab", + "description" : "Path to scGPT model vocabulary file.\n", + "example" : [ + "vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_config", + "description" : "Path to scGPT model config file.\n", + "example" : [ + "args.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_gene_tokens", + "description" : "The key of the .obsm array containing the gene token ids\n", + "example" : [ + "values.pt" + ], + "default" : [ + "gene_id_tokens" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_tokenized_values", + "description" : "The key of the .obsm array containing the count values of the tokenized genes\n", + "default" : [ + "values_tokenized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_padding_mask", + "description" : "The key of the .obsm array containing the padding mask.\n", + "default" : [ + "padding_mask" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : "The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch_label", + "description" : "The name of the adata.obs column containing the batch labels. Must be provided when 'dsbn' is set to True.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--finetuned_checkpoints_key", + "description" : "Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models.\n", + "example" : [ + "model_state_dict" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Path to output anndata file containing pre-processed data as well as scGPT embeddings.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_embeddings", + "description" : "The name of the adata.obsm array to which scGPT embeddings will be written.\n", + "default" : [ + "X_scGPT" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "The token to be used for padding.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--pad_value", + "description" : "The value of the padding token.\n", + "default" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--dsbn", + "description" : "Whether to apply domain-specific batch normalization for generating embeddings. When set to True, 'obs_batch_labels' must be set as well.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size to be used for inference\n", + "default" : [ + 64 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/source" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/finetuned_model" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:23.09-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scgpt==0.2.1" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/scgpt/embedding/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/scgpt/embedding", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import numpy as np +import mudata as mu +import json +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.model import TransformerModel +from scgpt.utils.util import load_pretrained +import torch + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model': $( if [ ! -z ${VIASH_PAR_MODEL+x} ]; then echo "r'${VIASH_PAR_MODEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_config': $( if [ ! -z ${VIASH_PAR_MODEL_CONFIG+x} ]; then echo "r'${VIASH_PAR_MODEL_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_padding_mask': $( if [ ! -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then echo "r'${VIASH_PAR_OBSM_PADDING_MASK//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_batch_label': $( if [ ! -z ${VIASH_PAR_OBS_BATCH_LABEL+x} ]; then echo "r'${VIASH_PAR_OBS_BATCH_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'finetuned_checkpoints_key': $( if [ ! -z ${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY+x} ]; then echo "r'${VIASH_PAR_FINETUNED_CHECKPOINTS_KEY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_embeddings': $( if [ ! -z ${VIASH_PAR_OBSM_EMBEDDINGS+x} ]; then echo "r'${VIASH_PAR_OBSM_EMBEDDINGS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'dsbn': $( if [ ! -z ${VIASH_PAR_DSBN+x} ]; then echo "r'${VIASH_PAR_DSBN//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'batch_size': $( if [ ! -z ${VIASH_PAR_BATCH_SIZE+x} ]; then echo "int(r'${VIASH_PAR_BATCH_SIZE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 'cpu'}") +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +for k, v in { + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + "--obsm_padding_mask": par["obsm_padding_mask"], +}.items(): + if v not in adata.obsm.keys(): + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) + +all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] +all_values = adata.obsm[par["obsm_tokenized_values"]] +padding_mask = adata.obsm[par["obsm_padding_mask"]] + +# Fetch batch ids for domain-specific batch normalization +if par["dsbn"] and not par["obs_batch_label"]: + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." + ) +elif par["dsbn"] and par["obs_batch_label"]: + logger.info("Fetching batch id's for domain-specific batch normalization") + batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") + batch_id_labels = batch_id_cats.cat.codes.values + batch_ids = batch_id_labels.tolist() + batch_ids = np.array(batch_ids) + num_batch_types = len(set(batch_ids)) +elif not par["dsbn"] and par["obs_batch_label"]: + logger.info( + "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." + ) + +# Set padding specs +logger.info("Setting padding specs") +pad_token = par["pad_token"] +pad_value = par["pad_value"] +special_tokens = [pad_token, "", ""] + +# Fetching gene names +logger.info("Fetching gene names") +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Model files +logger.info("Loading model, vocab and configs") +model_config_file = par["model_config"] +model_file = par["model"] +vocab_file = par["model_vocab"] + +# Load vocab +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Load model configs +with open(model_config_file, "r") as f: + model_configs = json.load(f) +embsize = model_configs["embsize"] +nhead = model_configs["nheads"] +d_hid = model_configs["d_hid"] +nlayers = model_configs["nlayers"] + +# Instantiate model +logger.info("Initializing transformer model") +model = TransformerModel( + ntokens, + d_model=embsize, + nhead=nhead, + d_hid=d_hid, + nlayers=nlayers, + vocab=vocab, + dropout=0.5, # scGPT default, only relevant for fine-tuning applications + pad_token=pad_token, + pad_value=pad_value, + nlayers_cls=3, # only applicable for decoder-based operations + n_cls=1, # only applicable for decoder-based operations + do_mvc=False, # only applicable for decoder-based operations + ecs_threshold=0.8, # only applicable for decoder-based operations + do_dab=False, # only applicable for decoder-based operations + use_batch_labels=False, # only applicable for decoder-based operations + num_batch_labels=num_batch_types if par["dsbn"] else None, + domain_spec_batchnorm=par["dsbn"], + input_emb_style="continuous", # scGPT default + explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported + use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported + # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported + pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported +) + + +logger.info("Loading model") +model_file = par["model"] +model_dict = torch.load(model_file, map_location=device) + +# Ensure the provided model has the correct architecture +finetuned_checkpoints_key = par.get("finetuned_checkpoints_key") +if finetuned_checkpoints_key: + try: + model_dict = model_dict[finetuned_checkpoints_key] + except KeyError as e: + raise ValueError( + f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) from e + +# Load model +load_pretrained(model, model_dict, verbose=False) + +# Embed tokenized data +logger.info("Converting tokenized input data to embeddings") +model.to(device) +model.eval() + +cell_embeddings = model.encode_batch( + torch.from_numpy(all_gene_ids), + torch.from_numpy(all_values).float(), + src_key_padding_mask=torch.from_numpy(padding_mask), + batch_size=par["batch_size"], + batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, + output_to_cpu=True, + time_step=0, + return_np=True, +) + +cell_embeddings = cell_embeddings / np.linalg.norm( + cell_embeddings, axis=1, keepdims=True +) + +# Write output +logger.info("Writing output data") +adata.obsm[par["obsm_embeddings"]] = cell_embeddings +mdata.mod[par["modality"]] = adata +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/scgpt/embedding", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu", + "gpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/scgpt/embedding/nextflow.config b/target/nextflow/scgpt/embedding/nextflow.config new file mode 100644 index 00000000..90d5ffe4 --- /dev/null +++ b/target/nextflow/scgpt/embedding/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'scgpt/embedding' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT.\n' + author = 'Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/scgpt/embedding/nextflow_labels.config b/target/nextflow/scgpt/embedding/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/scgpt/embedding/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/scgpt/embedding/nextflow_schema.json b/target/nextflow/scgpt/embedding/nextflow_schema.json new file mode 100644 index 00000000..f0d53351 --- /dev/null +++ b/target/nextflow/scgpt/embedding/nextflow_schema.json @@ -0,0 +1,168 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "embedding", + "description": "Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input h5mu file containing tokenized gene and count data", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "model": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"best_model.pt\"`. " + }, + "model_vocab": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model vocabulary file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"vocab.json\"`. " + }, + "model_config": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model config file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"args.json\"`. " + }, + "obsm_gene_tokens": { + "type": "string", + "description": "The key of the .obsm array containing the gene token ids\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene_id_tokens\"`, example: `\"values.pt\"`. ", + "default": "gene_id_tokens" + }, + "obsm_tokenized_values": { + "type": "string", + "description": "The key of the .obsm array containing the count values of the tokenized genes\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"values_tokenized\"`. ", + "default": "values_tokenized" + }, + "obsm_padding_mask": { + "type": "string", + "description": "The key of the .obsm array containing the padding mask.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"padding_mask\"`. ", + "default": "padding_mask" + }, + "var_gene_names": { + "type": "string", + "description": "The name of the .var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_batch_label": { + "type": "string", + "description": "The name of the adata.obs column containing the batch labels", + "help_text": "Type: `string`, multiple: `False`. " + }, + "finetuned_checkpoints_key": { + "type": "string", + "description": "Key in the model file containing the pretrained checkpoints", + "help_text": "Type: `string`, multiple: `False`, example: `\"model_state_dict\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Path to output anndata file containing pre-processed data as well as scGPT embeddings.\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_embeddings": { + "type": "string", + "description": "The name of the adata.obsm array to which scGPT embeddings will be written.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scGPT\"`. ", + "default": "X_scGPT" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "The token to be used for padding.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "pad_value": { + "type": "integer", + "description": "The value of the padding token.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `-2`. ", + "default": -2 + }, + "dsbn": { + "type": "boolean", + "description": "Whether to apply domain-specific batch normalization for generating embeddings", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "batch_size": { + "type": "integer", + "description": "The batch size to be used for inference\n", + "help_text": "Type: `integer`, multiple: `False`, default: `64`. ", + "default": 64 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/scgpt/embedding/setup_logger.py b/target/nextflow/scgpt/embedding/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/scgpt/embedding/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/scgpt/pad_tokenize/.config.vsh.yaml b/target/nextflow/scgpt/pad_tokenize/.config.vsh.yaml new file mode 100644 index 00000000..f6faa032 --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/.config.vsh.yaml @@ -0,0 +1,387 @@ +name: "pad_tokenize" +namespace: "scgpt" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "The input h5mu file of pre-processed data.\n" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Path to model vocabulary file.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the .var column containing gene names. When no gene_name_layer\ + \ is provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: "The name of the adata.var column containing boolean mask for vocabulary-cross\ + \ checked and/or highly variable genes.\n" + info: null + default: + - "id_in_vocab" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obsm_binned_counts" + description: "The name of the .obsm field containing the binned counts to be padded\ + \ and tokenized.\n" + info: null + default: + - "binned_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output h5mu file containing obsm arrays for gene tokens, tokenized\ + \ data and padding mask.\n" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_gene_tokens" + description: "The key of the .obsm array containing the gene token ids\n" + info: null + example: + - "values.pt" + default: + - "gene_id_tokens" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_tokenized_values" + description: "The key of the .obsm array containing the count values of the tokenized\ + \ genes\n" + info: null + default: + - "values_tokenized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_padding_mask" + description: "The key of the .obsm array containing the padding mask.\n" + info: null + default: + - "padding_mask" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "Token used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_seq_len" + description: "The maximum sequence length of the tokenized data. Defaults to the\ + \ number of features if not provided.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "subset_vars.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Tokenize and pad a batch of data for scGPT integration zero-shot inference\ + \ or fine-tuning.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "scgpt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "nvcr.io/nvidia/pytorch:23.09-py3" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + - type: "python" + user: false + packages: + - "scgpt==0.2.1" + - "ipython~=8.5.0" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/scgpt/pad_tokenize/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/scgpt/pad_tokenize" + executable: "target/nextflow/scgpt/pad_tokenize/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/scgpt/pad_tokenize/main.nf b/target/nextflow/scgpt/pad_tokenize/main.nf new file mode 100644 index 00000000..53e837e9 --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/main.nf @@ -0,0 +1,4169 @@ +// pad_tokenize main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Elizabeth Mlynarski (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "pad_tokenize", + "namespace" : "scgpt", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The input h5mu file of pre-processed data.\n", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_vocab", + "description" : "Path to model vocabulary file.\n", + "example" : [ + "vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : "The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes.\n", + "default" : [ + "id_in_vocab" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obsm_binned_counts", + "description" : "The name of the .obsm field containing the binned counts to be padded and tokenized.\n", + "default" : [ + "binned_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask.\n", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_gene_tokens", + "description" : "The key of the .obsm array containing the gene token ids\n", + "example" : [ + "values.pt" + ], + "default" : [ + "gene_id_tokens" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_tokenized_values", + "description" : "The key of the .obsm array containing the count values of the tokenized genes\n", + "default" : [ + "values_tokenized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_padding_mask", + "description" : "The key of the .obsm array containing the padding mask.\n", + "default" : [ + "padding_mask" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "Token used for padding.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--pad_value", + "description" : "The value of the padding token.\n", + "default" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_seq_len", + "description" : "The maximum sequence length of the tokenized data. Defaults to the number of features if not provided.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/subset_vars.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/scgpt/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "nvcr.io/nvidia/pytorch:23.09-py3", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "scgpt==0.2.1", + "ipython~=8.5.0" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/scgpt/pad_tokenize/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/scgpt/pad_tokenize", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata as mu +import numpy as np +from scipy.sparse import issparse +from scgpt.tokenizer import tokenize_and_pad_batch +from scgpt.tokenizer.gene_tokenizer import GeneVocab + + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'model_vocab': $( if [ ! -z ${VIASH_PAR_MODEL_VOCAB+x} ]; then echo "r'${VIASH_PAR_MODEL_VOCAB//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_gene_names': $( if [ ! -z ${VIASH_PAR_VAR_GENE_NAMES+x} ]; then echo "r'${VIASH_PAR_VAR_GENE_NAMES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'var_input': $( if [ ! -z ${VIASH_PAR_VAR_INPUT+x} ]; then echo "r'${VIASH_PAR_VAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_obsm_binned_counts': $( if [ ! -z ${VIASH_PAR_INPUT_OBSM_BINNED_COUNTS+x} ]; then echo "r'${VIASH_PAR_INPUT_OBSM_BINNED_COUNTS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_gene_tokens': $( if [ ! -z ${VIASH_PAR_OBSM_GENE_TOKENS+x} ]; then echo "r'${VIASH_PAR_OBSM_GENE_TOKENS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_tokenized_values': $( if [ ! -z ${VIASH_PAR_OBSM_TOKENIZED_VALUES+x} ]; then echo "r'${VIASH_PAR_OBSM_TOKENIZED_VALUES//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obsm_padding_mask': $( if [ ! -z ${VIASH_PAR_OBSM_PADDING_MASK+x} ]; then echo "r'${VIASH_PAR_OBSM_PADDING_MASK//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_token': $( if [ ! -z ${VIASH_PAR_PAD_TOKEN+x} ]; then echo "r'${VIASH_PAR_PAD_TOKEN//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'pad_value': $( if [ ! -z ${VIASH_PAR_PAD_VALUE+x} ]; then echo "int(r'${VIASH_PAR_PAD_VALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'max_seq_len': $( if [ ! -z ${VIASH_PAR_MAX_SEQ_LEN+x} ]; then echo "int(r'${VIASH_PAR_MAX_SEQ_LEN//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from subset_vars import subset_vars + +logger = setup_logger() + +logger.info("Reading in data") + +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +adata = subset_vars(adata, par["var_input"]) + +# Set padding specs +pad_token = par["pad_token"] +special_tokens = [pad_token, "", ""] +pad_value = -2 + +logger.info("Fetching counts and gene names") +# Fetch counts +all_counts = ( + adata.obsm[par["input_obsm_binned_counts"]].toarray() + if issparse(adata.obsm[par["input_obsm_binned_counts"]]) + else adata.obsm[par["input_obsm_binned_counts"]] +) + +# Fetching gene names +if not par["var_gene_names"]: + genes = adata.var.index.astype(str).tolist() +else: + genes = adata.var[par["var_gene_names"]].astype(str).tolist() + +# Fetch gene names and look up tokens in vocab +logger.info("Reading in vocab and fetching gene tokens") +vocab_file = par["model_vocab"] +vocab = GeneVocab.from_file(vocab_file) +for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + +vocab.set_default_index(vocab[""]) +ntokens = len(vocab) +gene_ids = np.array(vocab(genes), dtype=int) + +# Fetch max seq len +if not par["max_seq_len"]: + max_seq_len = adata.var.shape[0] + 1 +else: + max_seq_len = par["max_seq_len"] + +# Tokenize and pad data +logger.info( + f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." +) +tokenized_data = tokenize_and_pad_batch( + all_counts, + gene_ids, + max_len=max_seq_len, + vocab=vocab, + pad_token=pad_token, + pad_value=pad_value, + append_cls=True, # append token at the beginning, + include_zero_gene=False, + return_pt=True, + mod_type=None, + vocab_mod=None, +) + +all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] +padding_mask = all_gene_ids.eq(vocab[pad_token]) + +logger.info("Writing output data") +input_adata.obsm[par["obsm_gene_tokens"]] = all_gene_ids.numpy() +input_adata.obsm[par["obsm_tokenized_values"]] = all_values.numpy() +input_adata.obsm[par["obsm_padding_mask"]] = padding_mask.numpy() + +mdata.write(par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/scgpt/pad_tokenize", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/scgpt/pad_tokenize/nextflow.config b/target/nextflow/scgpt/pad_tokenize/nextflow.config new file mode 100644 index 00000000..de2cf6a6 --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'scgpt/pad_tokenize' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning.\n' + author = 'Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/scgpt/pad_tokenize/nextflow_labels.config b/target/nextflow/scgpt/pad_tokenize/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/scgpt/pad_tokenize/nextflow_schema.json b/target/nextflow/scgpt/pad_tokenize/nextflow_schema.json new file mode 100644 index 00000000..ba78967f --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/nextflow_schema.json @@ -0,0 +1,143 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "pad_tokenize", + "description": "Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "The input h5mu file of pre-processed data.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "model_vocab": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to model vocabulary file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"vocab.json\"`. " + }, + "var_gene_names": { + "type": "string", + "description": "The name of the .var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_input": { + "type": "string", + "description": "The name of the adata.var column containing boolean mask for vocabulary-cross checked and/or highly variable genes.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"id_in_vocab\"`. ", + "default": "id_in_vocab" + }, + "input_obsm_binned_counts": { + "type": "string", + "description": "The name of the .obsm field containing the binned counts to be padded and tokenized.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"binned_counts\"`. ", + "default": "binned_counts" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask.\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_gene_tokens": { + "type": "string", + "description": "The key of the .obsm array containing the gene token ids\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"gene_id_tokens\"`, example: `\"values.pt\"`. ", + "default": "gene_id_tokens" + }, + "obsm_tokenized_values": { + "type": "string", + "description": "The key of the .obsm array containing the count values of the tokenized genes\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"values_tokenized\"`. ", + "default": "values_tokenized" + }, + "obsm_padding_mask": { + "type": "string", + "description": "The key of the .obsm array containing the padding mask.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"padding_mask\"`. ", + "default": "padding_mask" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "Token used for padding.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "pad_value": { + "type": "integer", + "description": "The value of the padding token.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `-2`. ", + "default": -2 + }, + "max_seq_len": { + "type": "integer", + "description": "The maximum sequence length of the tokenized data", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/scgpt/pad_tokenize/setup_logger.py b/target/nextflow/scgpt/pad_tokenize/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/scgpt/pad_tokenize/subset_vars.py b/target/nextflow/scgpt/pad_tokenize/subset_vars.py new file mode 100644 index 00000000..6003513a --- /dev/null +++ b/target/nextflow/scgpt/pad_tokenize/subset_vars.py @@ -0,0 +1,31 @@ +def subset_vars(adata, subset_col): + """Subset AnnData object on highly variable genes + + Parameters + ---------- + adata : AnnData + Annotated data object + subset_col : str + Name of the boolean column in `adata.var` that contains the information if features should be used or not + + Returns + ------- + AnnData + Copy of `adata` with subsetted features + """ + if subset_col not in adata.var.columns: + raise ValueError( + f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." + ) + + if adata.var[subset_col].dtype == "boolean": + assert adata.var[subset_col].isna().sum() == 0, ( + f"The .var column `{subset_col}` contains NaN values. Can not subset data." + ) + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert adata.var[subset_col].dtype == "bool", ( + f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + ) + + return adata[:, adata.var[subset_col]].copy() diff --git a/target/nextflow/transform/bpcells_regress_out/.config.vsh.yaml b/target/nextflow/transform/bpcells_regress_out/.config.vsh.yaml new file mode 100644 index 00000000..ea92459d --- /dev/null +++ b/target/nextflow/transform/bpcells_regress_out/.config.vsh.yaml @@ -0,0 +1,327 @@ +name: "bpcells_regress_out" +namespace: "transform" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "The modality to run this component on." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_keys" + description: "The .obs keys to regress on." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object to regress on.\nIf not provided, the\ + \ X attribute of the adata object will be used.\n" + info: null + example: + - "X_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "The layer of the adata object containing the regressed count data.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "X_regressed" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Regress out the effects of confounding variables using a linear least\ + \ squares regression model with BPCells.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:22.04" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "docker" + env: + - "RETICULATE_PYTHON=/usr/bin/python" + - type: "apt" + packages: + - "libhdf5-dev" + - "python3" + - "python3-pip" + - "python3-dev" + - "python-is-python3" + interactive: false + - type: "r" + cran: + - "anndata" + - "reticulate" + github: + - "bnprks/BPCells/r" + bioc_force_install: false + warnings_as_errors: true + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/bpcells_regress_out/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/bpcells_regress_out" + executable: "target/nextflow/transform/bpcells_regress_out/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/bpcells_regress_out/main.nf b/target/nextflow/transform/bpcells_regress_out/main.nf new file mode 100644 index 00000000..a90adbb3 --- /dev/null +++ b/target/nextflow/transform/bpcells_regress_out/main.nf @@ -0,0 +1,4070 @@ +// bpcells_regress_out main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Robrecht Cannoodt (contributor, author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bpcells_regress_out", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "The modality to run this component on.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_keys", + "description" : "The .obs keys to regress on.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the adata object to regress on.\nIf not provided, the X attribute of the adata object will be used.\n", + "example" : [ + "X_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "The layer of the adata object containing the regressed count data.\nIf not provided, the X attribute of the adata object will be used.\n", + "example" : [ + "X_regressed" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Regress out the effects of confounding variables using a linear least squares regression model with BPCells.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:22.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "docker", + "env" : [ + "RETICULATE_PYTHON=/usr/bin/python" + ] + }, + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "python3", + "python3-pip", + "python3-dev", + "python-is-python3" + ], + "interactive" : false + }, + { + "type" : "r", + "cran" : [ + "anndata", + "reticulate" + ], + "github" : [ + "bnprks/BPCells/r" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/bpcells_regress_out/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/bpcells_regress_out", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +cat("Loading libraries\\\\n") +library(glue) +library(BPCells) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("reticulate", quietly = TRUE) +mudata <- reticulate::import("mudata") + +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "input" = $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "modality" = $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_MODALITY" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "obs_keys" = $( if [ ! -z ${VIASH_PAR_OBS_KEYS+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_OBS_KEYS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "input_layer" = $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_INPUT_LAYER" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_layer" = $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_LAYER" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output_compression" = $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_COMPRESSION" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END + +# Read the h5mu file and make var names unique +mdata <- mudata\\$read_h5mu(par\\$input) + +# Regress out +if (!is.null(par\\$obs_keys) && length(par\\$obs_keys) > 0) { + cat("Regress out variables ", par\\$obs_keys, " on modality ", + par\\$modality, "\\\\n", + sep = "" + ) + + # Fetch modality AnnData and convert to an iterable matrix + adata <- mdata\\$mod[[par\\$modality]] + + # Fetch the input layer + mat <- + if (is.null(par\\$input_layer)) { + cat("Using .X as input layer\\\\n") + adata\\$X + } else { + cat("Using .layers ", par\\$input_layer, " as input layer\\\\n", sep = "") + adata\\$layers[[par\\$input_layer]] + } + + imat <- as(as(mat, "CsparseMatrix"), "IterableMatrix") + dimnames(imat) <- NULL + + # obs_keys is not NULL and not empty + latent_data <- as.data.frame(adata\\$obs[, par\\$obs_keys]) + + # Regress out using BPCells + regressed_data <- regress_out(imat, latent_data, prediction_axis = "col") + + # Convert iterable matrix back to C sparse matrix + rmat <- as(regressed_data, "dgCMatrix") + + # Assign regressed out data back to AnnData object + if (is.null(par\\$output_layer)) { + cat("Using .X as output layer\\\\n") + adata\\$X <- rmat + } else { + cat("Using .layers ", par\\$output_layer, " as output layer\\\\n", sep = "") + adata\\$layers[[par\\$output_layer]] <- rmat + } +} else { + cat("No obs_keys provided, skipping regression\\\\n") +} + +# Write to output h5mu file +mdata\\$write(par\\$output, compression = par\\$output_compression) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/bpcells_regress_out", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/bpcells_regress_out/nextflow.config b/target/nextflow/transform/bpcells_regress_out/nextflow.config new file mode 100644 index 00000000..a6770c59 --- /dev/null +++ b/target/nextflow/transform/bpcells_regress_out/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/bpcells_regress_out' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Regress out the effects of confounding variables using a linear least squares regression model with BPCells.\n' + author = 'Dorien Roosen, Robrecht Cannoodt, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/bpcells_regress_out/nextflow_labels.config b/target/nextflow/transform/bpcells_regress_out/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/bpcells_regress_out/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/bpcells_regress_out/nextflow_schema.json b/target/nextflow/transform/bpcells_regress_out/nextflow_schema.json new file mode 100644 index 00000000..f7bac683 --- /dev/null +++ b/target/nextflow/transform/bpcells_regress_out/nextflow_schema.json @@ -0,0 +1,82 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bpcells_regress_out", + "description": "Regress out the effects of confounding variables using a linear least squares regression model with BPCells.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "modality": { + "type": "string", + "description": "The modality to run this component on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_keys": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The .obs keys to regress on.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "input_layer": { + "type": "string", + "description": "The layer of the adata object to regress on.\nIf not provided, the X attribute of the adata object will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"X_normalized\"`. " + }, + "output_layer": { + "type": "string", + "description": "The layer of the adata object containing the regressed count data.\nIf not provided, the X attribute of the adata object will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"X_regressed\"`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/clr/.config.vsh.yaml b/target/nextflow/transform/clr/.config.vsh.yaml new file mode 100644 index 00000000..0262856f --- /dev/null +++ b/target/nextflow/transform/clr/.config.vsh.yaml @@ -0,0 +1,286 @@ +name: "clr" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--axis" + description: "Axis across which CLR is performed. If set to 0, CLR is performed\ + \ across observations (cells).\nIf set to 1, CLR is performed across features\ + \ (genes).\n" + info: null + default: + - 0 + required: false + choices: + - 0 + - 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017).\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "midcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "muon~=0.1.5" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/clr/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/clr" + executable: "target/nextflow/transform/clr/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/clr/compress_h5mu.py b/target/nextflow/transform/clr/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/clr/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/clr/main.nf b/target/nextflow/transform/clr/main.nf new file mode 100644 index 00000000..8dd1201e --- /dev/null +++ b/target/nextflow/transform/clr/main.nf @@ -0,0 +1,3978 @@ +// clr main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "clr", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. By default, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Output layer to use. By default, use X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--axis", + "description" : "Axis across which CLR is performed. If set to 0, CLR is performed across observations (cells).\nIf set to 1, CLR is performed across features (genes).\n", + "default" : [ + 0 + ], + "required" : false, + "choices" : [ + 0, + 1 + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017).\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "midcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "muon~=0.1.5" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/clr/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/clr", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from muon import prot as pt +from mudata import read_h5ad +from anndata import AnnData +from functools import partial +from operator import setitem + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'axis': $( if [ ! -z ${VIASH_PAR_AXIS+x} ]; then echo "int(r'${VIASH_PAR_AXIS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from compress_h5mu import write_h5ad_to_h5mu_with_compression + + +def main(): + input_data = read_h5ad(par["input"], mod=par["modality"]) + if par["input_layer"]: + input_data = AnnData(X=input_data.layers[par["input_layer"]]) + # CLR always normalizes the .X layer, so we have to create an AnnData file with + # the input layer at .X + normalized_counts = pt.pp.clr(input_data, axis=par["axis"], inplace=False) + if not normalized_counts: + raise RuntimeError("CLR failed to return the requested output layer") + + output_layer_setter = ( + partial(setattr, input_data, "X") + if not par["output_layer"] + else partial(setitem, input_data.layers, par["output_layer"]) + ) + output_layer_setter(normalized_counts.X) + write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + input_data, + par["output_compression"], + ) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/clr", + "tag" : "main" + }, + "label" : [ + "lowmem", + "midcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/clr/nextflow.config b/target/nextflow/transform/clr/nextflow.config new file mode 100644 index 00000000..86d0c212 --- /dev/null +++ b/target/nextflow/transform/clr/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/clr' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017).\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/clr/nextflow_labels.config b/target/nextflow/transform/clr/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/clr/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/clr/nextflow_schema.json b/target/nextflow/transform/clr/nextflow_schema.json new file mode 100644 index 00000000..cf13db6d --- /dev/null +++ b/target/nextflow/transform/clr/nextflow_schema.json @@ -0,0 +1,84 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "clr", + "description": "Perform CLR normalization on CITE-seq data (Stoeckius et al., 2017).\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot\"`. ", + "default": "prot" + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_layer": { + "type": "string", + "description": "Output layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "axis": { + "type": "integer", + "description": "Axis across which CLR is performed", + "help_text": "Type: `integer`, multiple: `False`, default: `0`, choices: ``0`, `1``. ", + "enum": [ + 0, + 1 + ], + "default": 0 + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/delete_layer/.config.vsh.yaml b/target/nextflow/transform/delete_layer/.config.vsh.yaml new file mode 100644 index 00000000..288d2fe3 --- /dev/null +++ b/target/nextflow/transform/delete_layer/.config.vsh.yaml @@ -0,0 +1,274 @@ +name: "delete_layer" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to remove" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--missing_ok" + description: "Do not raise an error if the layer does not exist for all modalities." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Delete an anndata layer from one or more modalities.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "singlecpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/delete_layer/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/delete_layer" + executable: "target/nextflow/transform/delete_layer/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/delete_layer/compress_h5mu.py b/target/nextflow/transform/delete_layer/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/delete_layer/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/delete_layer/main.nf b/target/nextflow/transform/delete_layer/main.nf new file mode 100644 index 00000000..f22aeb1f --- /dev/null +++ b/target/nextflow/transform/delete_layer/main.nf @@ -0,0 +1,3970 @@ +// delete_layer main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "delete_layer", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to remove", + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--missing_ok", + "description" : "Do not raise an error if the layer does not exist for all modalities.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Delete an anndata layer from one or more modalities.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "singlecpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/delete_layer/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/delete_layer", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from mudata import read_h5ad +from pathlib import Path + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer': $( if [ ! -z ${VIASH_PAR_LAYER+x} ]; then echo "r'${VIASH_PAR_LAYER//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'missing_ok': $( if [ ! -z ${VIASH_PAR_MISSING_OK+x} ]; then echo "r'${VIASH_PAR_MISSING_OK//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) + + logger.info("Reading input file %s, modality %s.", input_file, mod_name) + mod = read_h5ad(input_file, mod=mod_name) + for layer in par["layer"]: + if layer not in mod.layers: + if par["missing_ok"]: + continue + raise ValueError(f"Layer '{layer}' is not present in modality {mod_name}.") + logger.info("Deleting layer %s from modality %s.", layer, mod_name) + del mod.layers[layer] + + logger.info("Writing output to %s.", par["output"]) + + write_h5ad_to_h5mu_with_compression( + output_file, input_file, mod_name, mod, par["output_compression"] + ) + + logger.info("Finished.") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/delete_layer", + "tag" : "main" + }, + "label" : [ + "midmem", + "singlecpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/delete_layer/nextflow.config b/target/nextflow/transform/delete_layer/nextflow.config new file mode 100644 index 00000000..74157943 --- /dev/null +++ b/target/nextflow/transform/delete_layer/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/delete_layer' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Delete an anndata layer from one or more modalities.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/delete_layer/nextflow_labels.config b/target/nextflow/transform/delete_layer/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/delete_layer/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/delete_layer/nextflow_schema.json b/target/nextflow/transform/delete_layer/nextflow_schema.json new file mode 100644 index 00000000..c2a74fce --- /dev/null +++ b/target/nextflow/transform/delete_layer/nextflow_schema.json @@ -0,0 +1,78 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "delete_layer", + "description": "Delete an anndata layer from one or more modalities.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Input layer to remove", + "help_text": "Type: `string`, multiple: `True`, required. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "missing_ok": { + "type": "boolean", + "description": "Do not raise an error if the layer does not exist for all modalities.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/delete_layer/setup_logger.py b/target/nextflow/transform/delete_layer/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/delete_layer/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/log1p/.config.vsh.yaml b/target/nextflow/transform/log1p/.config.vsh.yaml new file mode 100644 index 00000000..058be4b9 --- /dev/null +++ b/target/nextflow/transform/log1p/.config.vsh.yaml @@ -0,0 +1,299 @@ +name: "log1p" +namespace: "transform" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. If None, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--base" + description: "Base of the logarithm. Natural logarithm is used by default.\n" + info: null + example: + - 2.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Logarithmize the data matrix. Computes X = log(X + 1), where log denotes\ + \ the natural logarithm unless a different base is given.\n" +test_resources: +- type: "python_script" + path: "run_test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/log1p/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/log1p" + executable: "target/nextflow/transform/log1p/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/log1p/compress_h5mu.py b/target/nextflow/transform/log1p/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/log1p/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/log1p/main.nf b/target/nextflow/transform/log1p/main.nf new file mode 100644 index 00000000..549f4d6c --- /dev/null +++ b/target/nextflow/transform/log1p/main.nf @@ -0,0 +1,4004 @@ +// log1p main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) +// * Robrecht Cannoodt (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "log1p", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. If None, X is normalized", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Output layer to use. By default, use X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--base", + "description" : "Base of the logarithm. Natural logarithm is used by default.\n", + "example" : [ + 2.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Logarithmize the data matrix. Computes X = log(X + 1), where log denotes the natural logarithm unless a different base is given.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "run_test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/log1p/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/log1p", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import anndata as ad +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'base': $( if [ ! -z ${VIASH_PAR_BASE+x} ]; then echo "float(r'${VIASH_PAR_BASE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from input mudata", par["modality"]) +data = mu.read_h5ad(par["input"], mod=par["modality"]) +assert data.var_names.is_unique, "Expected var_names to be unique." + +logger.info("Performing log transformation") +# Make our own copy with not a lot of data +# this avoid excessive memory usage and accidental overwrites +input_layer = data.layers[par["input_layer"]] if par["input_layer"] else data.X +data_for_scanpy = ad.AnnData(X=input_layer.copy()) +sc.pp.log1p( + data_for_scanpy, + base=par["base"], + layer=None, # use X + copy=False, +) # allow overwrites in the copy that was made + +# Scanpy will overwrite the input layer. +# So fetch input layer from the copy and use it to populate the output slot +if par["output_layer"]: + data.layers[par["output_layer"]] = data_for_scanpy.X +else: + data.X = data_for_scanpy.X +data.uns["log1p"] = data_for_scanpy.uns["log1p"].copy() + +logger.info("Writing to file %s", par["output"]) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/log1p", + "tag" : "main" + }, + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/log1p/nextflow.config b/target/nextflow/transform/log1p/nextflow.config new file mode 100644 index 00000000..06ac62aa --- /dev/null +++ b/target/nextflow/transform/log1p/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/log1p' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Logarithmize the data matrix. Computes X = log(X + 1), where log denotes the natural logarithm unless a different base is given.\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/log1p/nextflow_labels.config b/target/nextflow/transform/log1p/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/log1p/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/log1p/nextflow_schema.json b/target/nextflow/transform/log1p/nextflow_schema.json new file mode 100644 index 00000000..86612bfd --- /dev/null +++ b/target/nextflow/transform/log1p/nextflow_schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "log1p", + "description": "Logarithmize the data matrix. Computes X = log(X + 1), where log denotes the natural logarithm unless a different base is given.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_layer": { + "type": "string", + "description": "Output layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "base": { + "type": "number", + "description": "Base of the logarithm", + "help_text": "Type: `double`, multiple: `False`, example: `2.0`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/log1p/setup_logger.py b/target/nextflow/transform/log1p/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/log1p/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/move_layer/.config.vsh.yaml b/target/nextflow/transform/move_layer/.config.vsh.yaml new file mode 100644 index 00000000..55107d93 --- /dev/null +++ b/target/nextflow/transform/move_layer/.config.vsh.yaml @@ -0,0 +1,264 @@ +name: "move_layer" +namespace: "transform" +version: "main" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to move to a new output location. If specified, will\ + \ be used to select a key from .layers,\notherwise .X is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Destination location for the layer. If not provided, .X will be\ + \ used,\nOtherwise, will be the key for the .layers attribute in the output\ + \ MuData file.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Move a data matrix stored at the .layers or .X attributes in a MuData\ + \ object to another layer." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "singlecpu" + - "lowmem" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/move_layer/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/move_layer" + executable: "target/nextflow/transform/move_layer/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/move_layer/compress_h5mu.py b/target/nextflow/transform/move_layer/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/move_layer/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/move_layer/main.nf b/target/nextflow/transform/move_layer/main.nf new file mode 100644 index 00000000..71cff067 --- /dev/null +++ b/target/nextflow/transform/move_layer/main.nf @@ -0,0 +1,3944 @@ +// move_layer main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "move_layer", + "namespace" : "transform", + "version" : "main", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to move to a new output location. If specified, will be used to select a key from .layers,\notherwise .X is used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Destination location for the layer. If not provided, .X will be used,\nOtherwise, will be the key for the .layers attribute in the output MuData file.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Move a data matrix stored at the .layers or .X attributes in a MuData object to another layer.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/move_layer/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/move_layer", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from mudata import read_h5ad +from functools import partial +from operator import setitem + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s mudata from file %s", par["input"]) +mod_data = read_h5ad(par["input"], mod=par["modality"]) + + +logger.info( + "Using input layer '%s'", "X" if not par["input_layer"] else par["input_layer"] +) +if par["input_layer"]: + data_to_write = mod_data.layers[par["input_layer"]].copy() + del mod_data.layers[par["input_layer"]] +else: + data_to_write = mod_data.X + mod_data.X = None + +output_layer_setter = ( + partial(setattr, mod_data, "X") + if not par["output_layer"] + else partial(setitem, mod_data.layers, par["output_layer"]) +) +output_layer_setter(data_to_write) + +logger.info( + "Writing output to file %s with compression %s", + par["output"], + par["output_compression"], +) + +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], mod_data, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/move_layer", + "tag" : "main" + }, + "label" : [ + "singlecpu", + "lowmem" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/move_layer/nextflow.config b/target/nextflow/transform/move_layer/nextflow.config new file mode 100644 index 00000000..4ae004e8 --- /dev/null +++ b/target/nextflow/transform/move_layer/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'transform/move_layer' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Move a data matrix stored at the .layers or .X attributes in a MuData object to another layer.' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/move_layer/nextflow_labels.config b/target/nextflow/transform/move_layer/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/move_layer/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/move_layer/nextflow_schema.json b/target/nextflow/transform/move_layer/nextflow_schema.json new file mode 100644 index 00000000..2bac2cbb --- /dev/null +++ b/target/nextflow/transform/move_layer/nextflow_schema.json @@ -0,0 +1,74 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "move_layer", + "description": "Move a data matrix stored at the .layers or .X attributes in a MuData object to another layer.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to move to a new output location", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_layer": { + "type": "string", + "description": "Destination location for the layer", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/move_layer/setup_logger.py b/target/nextflow/transform/move_layer/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/move_layer/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/normalize_total/.config.vsh.yaml b/target/nextflow/transform/normalize_total/.config.vsh.yaml new file mode 100644 index 00000000..d3ed61b0 --- /dev/null +++ b/target/nextflow/transform/normalize_total/.config.vsh.yaml @@ -0,0 +1,317 @@ +name: "normalize_total" +namespace: "transform" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use. By default, use X." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--target_sum" + description: "If None, after normalization, each observation (cell) has a total\ + \ count equal to the median of total counts for observations (cells) before\ + \ normalization." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--exclude_highly_expressed" + description: "Exclude (very) highly expressed genes for the computation of the\ + \ normalization factor (size factor) for each cell. A gene is considered highly\ + \ expressed, if it has more than max_fraction of the total counts in at least\ + \ one cell. The not-excluded genes will sum up to target_sum." + info: null + direction: "input" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Normalize counts per cell.\n\nNormalize each cell by total counts over\ + \ all genes, so that every cell has the same total count after normalization. If\ + \ choosing target_sum=1e6, this is CPM normalization.\n\nIf exclude_highly_expressed=True,\ + \ very highly expressed genes are excluded from the computation of the normalization\ + \ factor (size factor) for each cell. This is meaningful as these can strongly influence\ + \ the resulting normalized values for all other genes [Weinreb17].\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/normalize_total/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/normalize_total" + executable: "target/nextflow/transform/normalize_total/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/normalize_total/compress_h5mu.py b/target/nextflow/transform/normalize_total/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/normalize_total/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/normalize_total/main.nf b/target/nextflow/transform/normalize_total/main.nf new file mode 100644 index 00000000..9132dff2 --- /dev/null +++ b/target/nextflow/transform/normalize_total/main.nf @@ -0,0 +1,4021 @@ +// normalize_total main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (maintainer) +// * Robrecht Cannoodt (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "normalize_total", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. By default, X is normalized", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Output layer to use. By default, use X.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--target_sum", + "description" : "If None, after normalization, each observation (cell) has a total count equal to the median of total counts for observations (cells) before normalization.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--exclude_highly_expressed", + "description" : "Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell. A gene is considered highly expressed, if it has more than max_fraction of the total counts in at least one cell. The not-excluded genes will sum up to target_sum.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Normalize counts per cell.\n\nNormalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization.\n\nIf exclude_highly_expressed=True, very highly expressed genes are excluded from the computation of the normalization factor (size factor) for each cell. This is meaningful as these can strongly influence the resulting normalized values for all other genes [Weinreb17].\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/normalize_total/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/normalize_total", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import scanpy as sc +import mudata as mu + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'target_sum': $( if [ ! -z ${VIASH_PAR_TARGET_SUM+x} ]; then echo "int(r'${VIASH_PAR_TARGET_SUM//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'exclude_highly_expressed': $( if [ ! -z ${VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED+x} ]; then echo "r'${VIASH_PAR_EXCLUDE_HIGHLY_EXPRESSED//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading modality %s from %s", par["modality"], par["input"]) +dat = mu.read_h5ad(par["input"], mod=par["modality"]) +assert dat.var_names.is_unique, "The var_names of the input modality must be be unique." + +logger.info(par) + +logger.info("Performing total normalization.") +if par["input_layer"] and par["input_layer"] not in dat.layers.keys(): + raise ValueError(f"Input layer {par['input_layer']} not found in {par['modality']}") +output_data = sc.pp.normalize_total( + dat, + layer=par["input_layer"], + target_sum=par["target_sum"], + copy=True if par["output_layer"] else False, +) + +if output_data: + result = ( + output_data.X + if not par["input_layer"] + else output_data.layers[par["input_layer"]] + ) + dat.layers[par["output_layer"]] = result + +logger.info( + "Writing to file to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], dat, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/normalize_total", + "tag" : "main" + }, + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/normalize_total/nextflow.config b/target/nextflow/transform/normalize_total/nextflow.config new file mode 100644 index 00000000..bc51449d --- /dev/null +++ b/target/nextflow/transform/normalize_total/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/normalize_total' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Normalize counts per cell.\n\nNormalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization.\n\nIf exclude_highly_expressed=True, very highly expressed genes are excluded from the computation of the normalization factor (size factor) for each cell. This is meaningful as these can strongly influence the resulting normalized values for all other genes [Weinreb17].\n' + author = 'Dries De Maeyer, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/normalize_total/nextflow_labels.config b/target/nextflow/transform/normalize_total/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/normalize_total/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/normalize_total/nextflow_schema.json b/target/nextflow/transform/normalize_total/nextflow_schema.json new file mode 100644 index 00000000..25926141 --- /dev/null +++ b/target/nextflow/transform/normalize_total/nextflow_schema.json @@ -0,0 +1,85 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "normalize_total", + "description": "Normalize counts per cell.\n\nNormalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization.\n\nIf exclude_highly_expressed=True, very highly expressed genes are excluded from the computation of the normalization factor (size factor) for each cell. This is meaningful as these can strongly influence the resulting normalized values for all other genes [Weinreb17].\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "output_layer": { + "type": "string", + "description": "Output layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "target_sum": { + "type": "integer", + "description": "If None, after normalization, each observation (cell) has a total count equal to the median of total counts for observations (cells) before normalization.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "exclude_highly_expressed": { + "type": "boolean", + "description": "Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/normalize_total/setup_logger.py b/target/nextflow/transform/normalize_total/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/normalize_total/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/regress_out/.config.vsh.yaml b/target/nextflow/transform/regress_out/.config.vsh.yaml new file mode 100644 index 00000000..76fc0492 --- /dev/null +++ b/target/nextflow/transform/regress_out/.config.vsh.yaml @@ -0,0 +1,295 @@ +name: "regress_out" +namespace: "transform" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + - "contributor" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality (one or more) to run this component on." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_keys" + description: "Which .obs keys to regress on." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the adata object to regress on.\nIf not provided, the\ + \ X attribute of the adata object will be used.\n" + info: null + example: + - "X_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "The layer of the adata object containing the regressed count data.\n\ + If not provided, the X attribute of the adata object will be used.\n" + info: null + example: + - "X_regressed" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Regress out (mostly) unwanted sources of variation.\nUses simple linear\ + \ regression. This is inspired by Seurat's regressOut function in R [Satija15].\ + \ \nNote that this function tends to overcorrect in certain circumstances as described\ + \ in issue theislab/scanpy#526.\nSee https://github.com/theislab/scanpy/issues/526.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/regress_out/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/regress_out" + executable: "target/nextflow/transform/regress_out/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/regress_out/main.nf b/target/nextflow/transform/regress_out/main.nf new file mode 100644 index 00000000..a42854a8 --- /dev/null +++ b/target/nextflow/transform/regress_out/main.nf @@ -0,0 +1,3987 @@ +// regress_out main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer, contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "regress_out", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer", + "contributor" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality (one or more) to run this component on.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_keys", + "description" : "Which .obs keys to regress on.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the adata object to regress on.\nIf not provided, the X attribute of the adata object will be used.\n", + "example" : [ + "X_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "The layer of the adata object containing the regressed count data.\nIf not provided, the X attribute of the adata object will be used.\n", + "example" : [ + "X_regressed" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Regress out (mostly) unwanted sources of variation.\nUses simple linear regression. This is inspired by Seurat's regressOut function in R [Satija15]. \nNote that this function tends to overcorrect in certain circumstances as described in issue theislab/scanpy#526.\nSee https://github.com/theislab/scanpy/issues/526.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/regress_out/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/regress_out", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import scanpy as sc +import mudata as mu +import anndata as ad +import multiprocessing +import sys + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'obs_keys': $( if [ ! -z ${VIASH_PAR_OBS_KEYS+x} ]; then echo "r'${VIASH_PAR_OBS_KEYS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + +logger.info("Reading input mudata") +mdata = mu.read_h5mu(par["input"]) +mdata.var_names_make_unique() + +if par["obs_keys"] is not None and len(par["obs_keys"]) > 0: + mod = par["modality"] + data = mdata.mod[mod] + + # Copy required data from input data to new AnnData object to allow providing input and output layers + input_layer = data.X if not par["input_layer"] else data.layers[par["input_layer"]] + obs = data.obs.loc[:, par["obs_keys"]] + sc_data = ad.AnnData(X=input_layer.copy(), obs=obs) + + logger.info("Regress out variables on modality %s", mod) + sc.pp.regress_out( + sc_data, keys=par["obs_keys"], n_jobs=multiprocessing.cpu_count() - 1 + ) + + # Copy regressed data back to original input data + if par["output_layer"]: + data.layers[par["output_layer"]] = sc_data.X + else: + data.X = sc_data.X + +logger.info("Writing to file") +mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/regress_out", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/regress_out/nextflow.config b/target/nextflow/transform/regress_out/nextflow.config new file mode 100644 index 00000000..8f388f9c --- /dev/null +++ b/target/nextflow/transform/regress_out/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/regress_out' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Regress out (mostly) unwanted sources of variation.\nUses simple linear regression. This is inspired by Seurat\'s regressOut function in R [Satija15]. \nNote that this function tends to overcorrect in certain circumstances as described in issue theislab/scanpy#526.\nSee https://github.com/theislab/scanpy/issues/526.\n' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/regress_out/nextflow_labels.config b/target/nextflow/transform/regress_out/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/regress_out/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/regress_out/nextflow_schema.json b/target/nextflow/transform/regress_out/nextflow_schema.json new file mode 100644 index 00000000..a6d30a45 --- /dev/null +++ b/target/nextflow/transform/regress_out/nextflow_schema.json @@ -0,0 +1,82 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "regress_out", + "description": "Regress out (mostly) unwanted sources of variation.\nUses simple linear regression. This is inspired by Seurat's regressOut function in R [Satija15]. \nNote that this function tends to overcorrect in certain circumstances as described in issue theislab/scanpy#526.\nSee https://github.com/theislab/scanpy/issues/526.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "modality": { + "type": "string", + "description": "Which modality (one or more) to run this component on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "obs_keys": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Which .obs keys to regress on.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "input_layer": { + "type": "string", + "description": "The layer of the adata object to regress on.\nIf not provided, the X attribute of the adata object will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"X_normalized\"`. " + }, + "output_layer": { + "type": "string", + "description": "The layer of the adata object containing the regressed count data.\nIf not provided, the X attribute of the adata object will be used.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"X_regressed\"`. " + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/regress_out/setup_logger.py b/target/nextflow/transform/regress_out/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/regress_out/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/scale/.config.vsh.yaml b/target/nextflow/transform/scale/.config.vsh.yaml new file mode 100644 index 00000000..246959cf --- /dev/null +++ b/target/nextflow/transform/scale/.config.vsh.yaml @@ -0,0 +1,293 @@ +name: "scale" +namespace: "transform" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "List of modalities to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer with data to scale. Uses .X by default" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer where scaled data will be stored. If not specified,\ + \ .X will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_value" + description: "Clip (truncate) to this value after scaling. Does not clip by default." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--zero_center" + description: "If set, omit zero-centering variables, which allows to handle sparse\ + \ input efficiently." + info: null + direction: "input" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + default: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Scale data to unit variance and zero mean.\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "lowmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/scale/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/scale" + executable: "target/nextflow/transform/scale/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/scale/compress_h5mu.py b/target/nextflow/transform/scale/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/scale/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/scale/main.nf b/target/nextflow/transform/scale/main.nf new file mode 100644 index 00000000..67191acb --- /dev/null +++ b/target/nextflow/transform/scale/main.nf @@ -0,0 +1,3999 @@ +// scale main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scale", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "List of modalities to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer with data to scale. Uses .X by default", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Output layer where scaled data will be stored. If not specified, .X will be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_value", + "description" : "Clip (truncate) to this value after scaling. Does not clip by default.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_false", + "name" : "--zero_center", + "description" : "If set, omit zero-centering variables, which allows to handle sparse input efficiently.", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "default" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Scale data to unit variance and zero mean.\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/scale/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/scale", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +from mudata import read_h5ad +import scanpy +from functools import partial +from operator import setitem + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'max_value': $( if [ ! -z ${VIASH_PAR_MAX_VALUE+x} ]; then echo "float(r'${VIASH_PAR_MAX_VALUE//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'zero_center': $( if [ ! -z ${VIASH_PAR_ZERO_CENTER+x} ]; then echo "r'${VIASH_PAR_ZERO_CENTER//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + + +def main(): + logger.info( + "Reading modality %s from .h5mu file: %s", par["modality"], par["input"] + ) + data = read_h5ad(par["input"], mod=par["modality"]) + logger.info("Scaling modality %s.", par["modality"]) + scanpy_output = scanpy.pp.scale( + data, + layer=par["input_layer"], + zero_center=par["zero_center"], + max_value=par["max_value"], + copy=True, + ) + output_layer_setter = ( + partial(setattr, data, "X") + if not par["output_layer"] + else partial(setitem, data.layers, par["output_layer"]) + ) + output_layer_setter( + scanpy_output.X + if not par["input_layer"] + else scanpy_output.layers[par["input_layer"]] + ) + logger.info( + "Writing to %s with compression %s", par["output"], par["output_compression"] + ) + write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], data, par["output_compression"] + ) + logger.info("Finished") + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/scale", + "tag" : "main" + }, + "label" : [ + "lowmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/scale/nextflow.config b/target/nextflow/transform/scale/nextflow.config new file mode 100644 index 00000000..0cf046d6 --- /dev/null +++ b/target/nextflow/transform/scale/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/scale' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Scale data to unit variance and zero mean.\n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/scale/nextflow_labels.config b/target/nextflow/transform/scale/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/scale/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/scale/nextflow_schema.json b/target/nextflow/transform/scale/nextflow_schema.json new file mode 100644 index 00000000..9d20750c --- /dev/null +++ b/target/nextflow/transform/scale/nextflow_schema.json @@ -0,0 +1,85 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scale", + "description": "Scale data to unit variance and zero mean.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "List of modalities to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "Input layer with data to scale", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output_layer": { + "type": "string", + "description": "Output layer where scaled data will be stored", + "help_text": "Type: `string`, multiple: `False`. " + }, + "max_value": { + "type": "number", + "description": "Clip (truncate) to this value after scaling", + "help_text": "Type: `double`, multiple: `False`. " + }, + "zero_center": { + "type": "boolean", + "description": "If set, omit zero-centering variables, which allows to handle sparse input efficiently.", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"output.h5mu\"`, direction: `output`. ", + "default": "output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/scale/setup_logger.py b/target/nextflow/transform/scale/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/scale/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/transform/tfidf/.config.vsh.yaml b/target/nextflow/transform/tfidf/.config.vsh.yaml new file mode 100644 index 00000000..11c304f4 --- /dev/null +++ b/target/nextflow/transform/tfidf/.config.vsh.yaml @@ -0,0 +1,324 @@ +name: "tfidf" +namespace: "transform" +version: "main" +authors: +- name: "Vladimir Shitov" + roles: + - "maintainer" + info: + role: "Contributor" + links: + email: "vladimir.shitov@helmholtz-muenchen.de" + github: "vladimirshitov" + orcid: "0000-0002-1960-8812" + linkedin: "vladimir-shitov-9a659513b" + organizations: + - name: "Helmholtz Munich" + href: "https://www.helmholtz-munich.de" + role: "PhD Candidate" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input h5mu file" + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "atac" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "Input layer to use. By default, X is normalized" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Output h5mu file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_layer" + description: "Output layer to use." + info: null + default: + - "tfidf" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scale_factor" + description: "Scale factor to multiply the TF-IDF matrix by." + info: null + default: + - 10000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_idf" + description: "Whether to log-transform IDF term." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_tf" + description: "Whether to log-transform TF term." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_tfidf" + description: "Whether to log-transform TF*IDF term (False by default). Can only\ + \ be used when log_tf and log_idf are False." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Perform TF-IDF normalization of the data (typically, ATAC).\n\nTF-IDF\ + \ stands for \"term frequency - inverse document frequency\". It is a technique\ + \ from natural language processing analysis.\nIn the context of ATAC data, \"terms\"\ + \ are the features (genes) and \"documents\" are the observations (cells). \nTF-IDF\ + \ normalization is applied to single-cell ATAC-seq data to highlight the importance\ + \ of specific genomic regions (typically peaks)\nacross different cells while down-weighting\ + \ regions that are commonly accessible across many cells. \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "counts" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "midmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim-bullseye" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "libhdf5-dev" + - "procps" + - "pkg-config" + - "gcc" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "muon~=0.1.5" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/transform/tfidf/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/transform/tfidf" + executable: "target/nextflow/transform/tfidf/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/transform/tfidf/compress_h5mu.py b/target/nextflow/transform/tfidf/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/transform/tfidf/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/transform/tfidf/main.nf b/target/nextflow/transform/tfidf/main.nf new file mode 100644 index 00000000..47cc2719 --- /dev/null +++ b/target/nextflow/transform/tfidf/main.nf @@ -0,0 +1,4018 @@ +// tfidf main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Vladimir Shitov (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "tfidf", + "namespace" : "transform", + "version" : "main", + "authors" : [ + { + "name" : "Vladimir Shitov", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "vladimir.shitov@helmholtz-muenchen.de", + "github" : "vladimirshitov", + "orcid" : "0000-0002-1960-8812", + "linkedin" : "vladimir-shitov-9a659513b" + }, + "organizations" : [ + { + "name" : "Helmholtz Munich", + "href" : "https://www.helmholtz-munich.de", + "role" : "PhD Candidate" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input h5mu file", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "atac" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "Input layer to use. By default, X is normalized", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Output h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_layer", + "description" : "Output layer to use.", + "default" : [ + "tfidf" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scale_factor", + "description" : "Scale factor to multiply the TF-IDF matrix by.", + "default" : [ + 10000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--log_idf", + "description" : "Whether to log-transform IDF term.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--log_tf", + "description" : "Whether to log-transform TF term.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--log_tfidf", + "description" : "Whether to log-transform TF*IDF term (False by default). Can only be used when log_tf and log_idf are False.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Perform TF-IDF normalization of the data (typically, ATAC).\n\nTF-IDF stands for \\"term frequency - inverse document frequency\\". It is a technique from natural language processing analysis.\nIn the context of ATAC data, \\"terms\\" are the features (genes) and \\"documents\\" are the observations (cells). \nTF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)\nacross different cells while down-weighting regions that are commonly accessible across many cells. \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_atac_tiny_bcl/counts/" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim-bullseye", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "libhdf5-dev", + "procps", + "pkg-config", + "gcc" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "muon~=0.1.5" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/transform/tfidf/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/transform/tfidf", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata +import muon + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_layer': $( if [ ! -z ${VIASH_PAR_OUTPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_OUTPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'scale_factor': $( if [ ! -z ${VIASH_PAR_SCALE_FACTOR+x} ]; then echo "int(r'${VIASH_PAR_SCALE_FACTOR//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'log_idf': $( if [ ! -z ${VIASH_PAR_LOG_IDF+x} ]; then echo "r'${VIASH_PAR_LOG_IDF//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'log_tf': $( if [ ! -z ${VIASH_PAR_LOG_TF+x} ]; then echo "r'${VIASH_PAR_LOG_TF//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'log_tfidf': $( if [ ! -z ${VIASH_PAR_LOG_TFIDF+x} ]; then echo "r'${VIASH_PAR_LOG_TFIDF//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input modality %s from %s", par["modality"], par["input"]) +adata = mudata.read_h5ad(par["input"], mod=par["modality"]) + +logger.info(par) + +logger.info("Performing TF-IDF normalization") +input_data = adata.copy() + +muon.atac.pp.tfidf( + input_data, + log_tf=par["log_tf"], + log_idf=par["log_idf"], + log_tfidf=par["log_tfidf"], + scale_factor=par["scale_factor"], + inplace=True, + copy=False, + from_layer=par["input_layer"], + to_layer=par["output_layer"], +) + +adata.layers[par["output_layer"]] = input_data.layers[par["output_layer"]] + +logger.info("Writing to file") +write_h5ad_to_h5mu_with_compression( + par["output"], par["input"], par["modality"], adata, par["output_compression"] +) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/transform/tfidf", + "tag" : "main" + }, + "label" : [ + "midmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/transform/tfidf/nextflow.config b/target/nextflow/transform/tfidf/nextflow.config new file mode 100644 index 00000000..a4d44c67 --- /dev/null +++ b/target/nextflow/transform/tfidf/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'transform/tfidf' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Perform TF-IDF normalization of the data (typically, ATAC).\n\nTF-IDF stands for "term frequency - inverse document frequency". It is a technique from natural language processing analysis.\nIn the context of ATAC data, "terms" are the features (genes) and "documents" are the observations (cells). \nTF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)\nacross different cells while down-weighting regions that are commonly accessible across many cells. \n' + author = 'Vladimir Shitov' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/transform/tfidf/nextflow_labels.config b/target/nextflow/transform/tfidf/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/transform/tfidf/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/transform/tfidf/nextflow_schema.json b/target/nextflow/transform/tfidf/nextflow_schema.json new file mode 100644 index 00000000..5d77986d --- /dev/null +++ b/target/nextflow/transform/tfidf/nextflow_schema.json @@ -0,0 +1,99 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "tfidf", + "description": "Perform TF-IDF normalization of the data (typically, ATAC).\n\nTF-IDF stands for \"term frequency - inverse document frequency\". It is a technique from natural language processing analysis.\nIn the context of ATAC data, \"terms\" are the features (genes) and \"documents\" are the observations (cells). \nTF-IDF normalization is applied to single-cell ATAC-seq data to highlight the importance of specific genomic regions (typically peaks)\nacross different cells while down-weighting regions that are commonly accessible across many cells. \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"atac\"`. ", + "default": "atac" + }, + "input_layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Output h5mu file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_layer": { + "type": "string", + "description": "Output layer to use.", + "help_text": "Type: `string`, multiple: `False`, default: `\"tfidf\"`. ", + "default": "tfidf" + }, + "scale_factor": { + "type": "integer", + "description": "Scale factor to multiply the TF-IDF matrix by.", + "help_text": "Type: `integer`, multiple: `False`, default: `10000`. ", + "default": 10000 + }, + "log_idf": { + "type": "boolean", + "description": "Whether to log-transform IDF term.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "log_tf": { + "type": "boolean", + "description": "Whether to log-transform TF term.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "log_tfidf": { + "type": "boolean", + "description": "Whether to log-transform TF*IDF term (False by default)", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/transform/tfidf/setup_logger.py b/target/nextflow/transform/tfidf/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/transform/tfidf/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/velocity/scvelo/.config.vsh.yaml b/target/nextflow/velocity/scvelo/.config.vsh.yaml new file mode 100644 index 00000000..5fc0dd1d --- /dev/null +++ b/target/nextflow/velocity/scvelo/.config.vsh.yaml @@ -0,0 +1,403 @@ +name: "scvelo" +namespace: "velocity" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Input MuData file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--counts_layer" + description: "Name of the counts layer, if not specified, X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Input modality" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_spliced" + description: "Name of the layer to store the spliced abundances in. Will be used\ + \ as key in the .layer attribute of the\noutput MuData object.\n" + info: null + default: + - "spliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_unspliced" + description: "Name of the layer to store the unspliced abundances in.\nWill be\ + \ used as key in the .layer attribute of the output MuData object.\n" + info: null + default: + - "unspliced" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer_ambiguous" + description: "Name of the layer to store the abundances in for which no fate was\ + \ determined.\nWill be used as key in the .layer attribute of the output MuData\ + \ object.\n" + info: null + default: + - "ambiguous" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output directory. If it does not exist, will be created." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_h5mu" + description: "Output mudata file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "Compression format to use for the output AnnData and/or Mudata objects.\n\ + By default no compression is applied.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Filtering and normalization" + description: "Arguments for filtering, normalization an log transform (see scvelo.pp.filter_and_normalize\ + \ function)" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts required for a gene to pass filtering (spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts_u" + description: "Minimum number of counts required for a gene to pass filtering (unspliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells" + description: "Minimum number of cells expressed required to pass filtering (spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_u" + description: "Minimum number of cells expressed required to pass filtering (unspliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_shared_counts" + description: "Minimum number of counts (both unspliced and spliced) required for\ + \ a gene." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_shared_cells" + description: "Minimum number of cells required to be expressed (both unspliced\ + \ and spliced)." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_top_genes" + description: "Number of genes to keep." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--log_transform" + description: "Do not log transform counts." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Fitting parameters" + description: "Arguments for fitting the data" + arguments: + - type: "integer" + name: "--n_principal_components" + description: "Number of principal components to use for calculating moments." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors" + description: "Number of neighbors to use. First/second-order moments are computed\ + \ for each\ncell across its nearest neighbors, where the neighbor graph is obtained\ + \ from\neuclidean distances in PCA space.\n" + info: null + default: + - 30 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "setup_logger.py" +- type: "file" + path: "compress_h5mu.py" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "velocyto.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "highcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "git" + interactive: false + - type: "python" + user: false + packages: + - "anndata~=0.11.1" + - "mudata~=0.3.1" + - "scanpy~=1.10.4" + - "scvelo~=0.3.3" + - "scipy~=1.14.1" + script: + - "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\ + nelse: exit(1)\")" + upgrade: true + test_setup: + - type: "apt" + packages: + - "git" + interactive: false + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + github: + - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/scvelo/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/velocity/scvelo" + executable: "target/nextflow/velocity/scvelo/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/velocity/scvelo/compress_h5mu.py b/target/nextflow/velocity/scvelo/compress_h5mu.py new file mode 100644 index 00000000..4b363ee6 --- /dev/null +++ b/target/nextflow/velocity/scvelo/compress_h5mu.py @@ -0,0 +1,87 @@ +import shutil +from anndata import AnnData +from mudata import write_h5ad +from h5py import File as H5File +from h5py import Group, Dataset +from pathlib import Path +from typing import Union, Literal +from functools import partial + + +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): + input_path, output_path = str(input_path), str(output_path) + + def copy_attributes(in_object, out_object): + for key, value in in_object.attrs.items(): + out_object.attrs[key] = value + + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) + else: + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) + + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): + copy_attributes(input_h5, output_h5) + input_h5.visititems(partial(visit_path, output_h5, compression)) + + with open(input_path, "rb") as input_bytes: + # Mudata puts metadata like this in the first 512 bytes: + # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0) + # See mudata/_core/io.py, read_h5mu() function + starting_metadata = input_bytes.read(100) + # The metadata is padded with extra null bytes up until 512 bytes + truncate_location = starting_metadata.find(b"\x00") + starting_metadata = starting_metadata[:truncate_location] + with open(output_path, "br+") as f: + nbytes = f.write(starting_metadata) + f.write(b"\0" * (512 - nbytes)) + + +def write_h5ad_to_h5mu_with_compression( + output_file: Union[str, Path], + h5mu: Union[str, Path], + modality_name: str, + modality_data: AnnData, + output_compression=None, +): + output_file = Path(output_file) + h5mu = Path(h5mu) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if output_compression + else output_file + ) + shutil.copyfile(h5mu, output_file_uncompressed) + write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data) + if output_compression: + compress_h5mu( + output_file_uncompressed, output_file, compression=output_compression + ) + output_file_uncompressed.unlink() diff --git a/target/nextflow/velocity/scvelo/main.nf b/target/nextflow/velocity/scvelo/main.nf new file mode 100644 index 00000000..740ce88a --- /dev/null +++ b/target/nextflow/velocity/scvelo/main.nf @@ -0,0 +1,4237 @@ +// scvelo main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scvelo", + "namespace" : "velocity", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input MuData file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--counts_layer", + "description" : "Name of the counts layer, if not specified, X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Input modality", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_spliced", + "description" : "Name of the layer to store the spliced abundances in. Will be used as key in the .layer attribute of the\noutput MuData object.\n", + "default" : [ + "spliced" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_unspliced", + "description" : "Name of the layer to store the unspliced abundances in.\nWill be used as key in the .layer attribute of the output MuData object.\n", + "default" : [ + "unspliced" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer_ambiguous", + "description" : "Name of the layer to store the abundances in for which no fate was determined.\nWill be used as key in the .layer attribute of the output MuData object.\n", + "default" : [ + "ambiguous" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output directory. If it does not exist, will be created.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_h5mu", + "description" : "Output mudata file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering and normalization", + "description" : "Arguments for filtering, normalization an log transform (see scvelo.pp.filter_and_normalize function)", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of counts required for a gene to pass filtering (spliced).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_counts_u", + "description" : "Minimum number of counts required for a gene to pass filtering (unspliced).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells", + "description" : "Minimum number of cells expressed required to pass filtering (spliced).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_u", + "description" : "Minimum number of cells expressed required to pass filtering (unspliced).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_shared_counts", + "description" : "Minimum number of counts (both unspliced and spliced) required for a gene.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_shared_cells", + "description" : "Minimum number of cells required to be expressed (both unspliced and spliced).", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_top_genes", + "description" : "Number of genes to keep.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--log_transform", + "description" : "Do not log transform counts.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Fitting parameters", + "description" : "Arguments for fitting the data", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_principal_components", + "description" : "Number of principal components to use for calculating moments.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_neighbors", + "description" : "Number of neighbors to use. First/second-order moments are computed for each\ncell across its nearest neighbors, where the neighbor graph is obtained from\neuclidean distances in PCA space.\n", + "default" : [ + 30 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/utils/setup_logger.py" + }, + { + "type" : "file", + "path" : "/src/utils/compress_h5mu.py" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/rna_velocity/velocyto_processed/velocyto.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "anndata~=0.11.1", + "mudata~=0.3.1", + "scanpy~=1.10.4", + "scvelo~=0.3.3", + "scipy~=1.14.1" + ], + "script" : [ + "exec(\\"try:\\\\n import awkward\\\\nexcept ModuleNotFoundError:\\\\n exit(0)\\\\nelse: exit(1)\\")" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "apt", + "packages" : [ + "git" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "github" : [ + "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/velocity/scvelo/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/velocity/scvelo", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import sys +import mudata +import anndata +import tempfile +import shutil +from contextlib import redirect_stdout +from pathlib import Path +import matplotlib as mpl +import scvelo + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'counts_layer': $( if [ ! -z ${VIASH_PAR_COUNTS_LAYER+x} ]; then echo "r'${VIASH_PAR_COUNTS_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_spliced': $( if [ ! -z ${VIASH_PAR_LAYER_SPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_SPLICED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_unspliced': $( if [ ! -z ${VIASH_PAR_LAYER_UNSPLICED+x} ]; then echo "r'${VIASH_PAR_LAYER_UNSPLICED//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'layer_ambiguous': $( if [ ! -z ${VIASH_PAR_LAYER_AMBIGUOUS+x} ]; then echo "r'${VIASH_PAR_LAYER_AMBIGUOUS//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_h5mu': $( if [ ! -z ${VIASH_PAR_OUTPUT_H5MU+x} ]; then echo "r'${VIASH_PAR_OUTPUT_H5MU//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'min_counts': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_counts_u': $( if [ ! -z ${VIASH_PAR_MIN_COUNTS_U+x} ]; then echo "int(r'${VIASH_PAR_MIN_COUNTS_U//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_cells': $( if [ ! -z ${VIASH_PAR_MIN_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_cells_u': $( if [ ! -z ${VIASH_PAR_MIN_CELLS_U+x} ]; then echo "int(r'${VIASH_PAR_MIN_CELLS_U//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_shared_counts': $( if [ ! -z ${VIASH_PAR_MIN_SHARED_COUNTS+x} ]; then echo "int(r'${VIASH_PAR_MIN_SHARED_COUNTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'min_shared_cells': $( if [ ! -z ${VIASH_PAR_MIN_SHARED_CELLS+x} ]; then echo "int(r'${VIASH_PAR_MIN_SHARED_CELLS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_top_genes': $( if [ ! -z ${VIASH_PAR_N_TOP_GENES+x} ]; then echo "int(r'${VIASH_PAR_N_TOP_GENES//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'log_transform': $( if [ ! -z ${VIASH_PAR_LOG_TRANSFORM+x} ]; then echo "r'${VIASH_PAR_LOG_TRANSFORM//\\'/\\'\\"\\'\\"r\\'}'.lower() == 'true'"; else echo None; fi ), + 'n_principal_components': $( if [ ! -z ${VIASH_PAR_N_PRINCIPAL_COMPONENTS+x} ]; then echo "int(r'${VIASH_PAR_N_PRINCIPAL_COMPONENTS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'n_neighbors': $( if [ ! -z ${VIASH_PAR_N_NEIGHBORS+x} ]; then echo "int(r'${VIASH_PAR_N_NEIGHBORS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import compress_h5mu + +logger = setup_logger() + +mpl.rcParams["savefig.dpi"] = 150 + + +# Script must be wrapped into a main function because scvelo spawn subprocesses +# and this fails when the functions are not wrapped. +def main(): + # Create output directory + output_dir = Path(par["output"]) + output_dir.mkdir(parents=True, exist_ok=True) + scvelo.settings.figdir = str(output_dir) + + # Load the input data + adata_in = mudata.read_h5ad(par["input"], mod=par["modality"]) + + # Create a copy of the data as input + # as many scvelo functions do not take input layer arguments + layers_mapping = { + "spliced": par["layer_spliced"], + "unspliced": par["layer_unspliced"], + "ambiguous": par["layer_ambiguous"], + } + layer_data = { + default: (adata_in.layers.get(arg_val) if arg_val else adata_in.layers[default]) + for default, arg_val in layers_mapping.items() + } + adata = anndata.AnnData( + X=adata_in.X + if not par["counts_layer"] + else adata_in.layers[par["counts_layer"]], + layers=layer_data, + ) + + # Calculate the sample name + sample_name = par["output"].removesuffix(".h5mu") + sample_name = Path(sample_name).name + + # Read the input data + + # Save spliced vs unspliced proportions to file + with (output_dir / "proportions.txt").open("w") as target: + with redirect_stdout(target): + scvelo.utils.show_proportions(adata) + + # Plot piecharts of spliced vs unspliced proportions + scvelo.pl.proportions(adata, save=True, show=False) + + # Perform preprocessing + scvelo.pp.filter_and_normalize( + adata, + min_counts=par["min_counts"], + min_counts_u=par["min_counts_u"], + min_cells=par["min_cells"], + min_cells_u=par["min_cells_u"], + min_shared_counts=par["min_shared_counts"], + min_shared_cells=par["min_shared_cells"], + n_top_genes=par["n_top_genes"], + log=par["log_transform"], + ) + + # Fitting + scvelo.pp.moments( + adata, n_pcs=par["n_principal_components"], n_neighbors=par["n_neighbors"] + ) + + # Second step in velocyto calculations + # Velocity calculation and visualization + # From the scvelo manual: + # The solution to the full dynamical model is obtained by setting mode='dynamical', + # which requires to run scv.tl.recover_dynamics(adata) beforehand + scvelo.tl.recover_dynamics(adata) + scvelo.tl.velocity(adata, mode="dynamical") + scvelo.tl.velocity_graph(adata) + scvelo.pl.velocity_graph( + adata, save=str(output_dir / "scvelo_graph.pdf"), show=False + ) + + # Plotting + # TODO: add more here. + scvelo.pl.velocity_embedding_stream( + adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False + ) + + # Copy over slots to output + for slot in ("obs", "var"): + setattr( + adata_in, + slot, + getattr(adata_in, slot) + .assign(**getattr(adata, slot).to_dict()) + .convert_dtypes(), + ) + items_per_slot = { + "uns": ( + "recover_dynamics", + "velocity_params", + "velocity_graph", + "velocity_graph_neg", + ), + "varm": ("loss",), + "obsm": ("velocity_pca",), + "layers": ( + "Ms", + "Mu", + "fit_t", + "fit_tau", + "fit_tau_", + "velocity", + "velocity_u", + ), + } + for dict_slot, dict_items in items_per_slot.items(): + setattr( + adata_in, + dict_slot, + dict( + getattr(adata_in, dict_slot), + **{key_: getattr(adata, dict_slot)[key_] for key_ in dict_items}, + ), + ) + with tempfile.NamedTemporaryFile( + suffix=".h5mu", delete_on_close=False + ) as temp_h5mu: + shutil.copyfile(par["input"], temp_h5mu.name) + # Create output + mudata.write_h5ad(temp_h5mu.name, mod=par["modality"], data=adata_in) + compression = par["output_compression"] + + if compression: + compress_h5mu(temp_h5mu.name, par["output_h5mu"], compression=compression) + else: + shutil.move(temp_h5mu.name, par["output_h5mu"]) + + +if __name__ == "__main__": + main() +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/velocity/scvelo", + "tag" : "main" + }, + "label" : [ + "highmem", + "highcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/velocity/scvelo/nextflow.config b/target/nextflow/velocity/scvelo/nextflow.config new file mode 100644 index 00000000..9d8db8eb --- /dev/null +++ b/target/nextflow/velocity/scvelo/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'velocity/scvelo' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/velocity/scvelo/nextflow_labels.config b/target/nextflow/velocity/scvelo/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/velocity/scvelo/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/velocity/scvelo/nextflow_schema.json b/target/nextflow/velocity/scvelo/nextflow_schema.json new file mode 100644 index 00000000..d38f6d6c --- /dev/null +++ b/target/nextflow/velocity/scvelo/nextflow_schema.json @@ -0,0 +1,175 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scvelo", + "description": "No description", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input MuData file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "counts_layer": { + "type": "string", + "description": "Name of the counts layer, if not specified, X is used.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "modality": { + "type": "string", + "description": "Input modality", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "layer_spliced": { + "type": "string", + "description": "Name of the layer to store the spliced abundances in", + "help_text": "Type: `string`, multiple: `False`, default: `\"spliced\"`. ", + "default": "spliced" + }, + "layer_unspliced": { + "type": "string", + "description": "Name of the layer to store the unspliced abundances in.\nWill be used as key in the .layer attribute of the output MuData object.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"unspliced\"`. ", + "default": "unspliced" + }, + "layer_ambiguous": { + "type": "string", + "description": "Name of the layer to store the abundances in for which no fate was determined.\nWill be used as key in the .layer attribute of the output MuData object.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"ambiguous\"`. ", + "default": "ambiguous" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "output_h5mu": { + "type": "string", + "format": "path", + "description": "Output mudata file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_h5mu\"`, direction: `output`. ", + "default": "$id.$key.output_h5mu" + }, + "output_compression": { + "type": "string", + "description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "filtering and normalization": { + "title": "Filtering and normalization", + "type": "object", + "description": "Arguments for filtering, normalization an log transform (see scvelo.pp.filter_and_normalize function)", + "properties": { + "min_counts": { + "type": "integer", + "description": "Minimum number of counts required for a gene to pass filtering (spliced).", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_counts_u": { + "type": "integer", + "description": "Minimum number of counts required for a gene to pass filtering (unspliced).", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_cells": { + "type": "integer", + "description": "Minimum number of cells expressed required to pass filtering (spliced).", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_cells_u": { + "type": "integer", + "description": "Minimum number of cells expressed required to pass filtering (unspliced).", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_shared_counts": { + "type": "integer", + "description": "Minimum number of counts (both unspliced and spliced) required for a gene.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_shared_cells": { + "type": "integer", + "description": "Minimum number of cells required to be expressed (both unspliced and spliced).", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "n_top_genes": { + "type": "integer", + "description": "Number of genes to keep.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "log_transform": { + "type": "boolean", + "description": "Do not log transform counts.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "fitting parameters": { + "title": "Fitting parameters", + "type": "object", + "description": "Arguments for fitting the data", + "properties": { + "n_principal_components": { + "type": "integer", + "description": "Number of principal components to use for calculating moments.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "n_neighbors": { + "type": "integer", + "description": "Number of neighbors to use", + "help_text": "Type: `integer`, multiple: `False`, default: `30`. ", + "default": 30 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/filtering and normalization" + }, + { + "$ref": "#/$defs/fitting parameters" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/velocity/scvelo/setup_logger.py b/target/nextflow/velocity/scvelo/setup_logger.py new file mode 100644 index 00000000..3ca1cdb6 --- /dev/null +++ b/target/nextflow/velocity/scvelo/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger diff --git a/target/nextflow/velocity/velocyto/.config.vsh.yaml b/target/nextflow/velocity/velocyto/.config.vsh.yaml new file mode 100644 index 00000000..37284ad2 --- /dev/null +++ b/target/nextflow/velocity/velocyto/.config.vsh.yaml @@ -0,0 +1,290 @@ +name: "velocyto" +namespace: "velocity" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to BAM file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome" + alternatives: + - "-t" + description: "Path to GTF file" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode" + alternatives: + - "-b" + description: "Valid barcodes file, to filter the bam. If --bcfile is not specified\ + \ all the cell barcodes will be included.\nCell barcodes should be specified\ + \ in the bcfile as the 'CB' tag for each read\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--without_umi" + description: "foo" + info: null + direction: "input" + - type: "file" + name: "--output" + alternatives: + - "-o" + description: "Velocyto loom file" + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--logic" + alternatives: + - "-l" + description: "The logic to use for the filtering." + info: null + default: + - "Default" + required: false + choices: + - "Default" + - "Permissive10X" + - "Intermediate10X" + - "ValidatedIntrons10X" + - "Stricter10X" + - "ObservedSpanning10X" + - "Discordant10X" + - "SmartSeq2" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Runs the velocity analysis on a BAM file, outputting a loom file." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "cellranger_tiny_fastq" +- type: "file" + path: "rna_velocity" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + label: + - "highmem" + - "lowcpu" + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.10-slim-bookworm" + target_registry: "images.viash-hub.com" + target_tag: "main" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "build-essential" + - "file" + interactive: false + - type: "python" + user: false + pip: + - "numpy<2" + - "Cython" + upgrade: true + - type: "python" + user: false + pip: + - "velocyto" + upgrade: true + - type: "apt" + packages: + - "samtools" + interactive: false + test_setup: + - type: "python" + user: false + packages: + - "viashpy==0.8.0" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/velocity/velocyto/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/velocity/velocyto" + executable: "target/nextflow/velocity/velocyto/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/velocity/velocyto/main.nf b/target/nextflow/velocity/velocyto/main.nf new file mode 100644 index 00000000..d82a4f61 --- /dev/null +++ b/target/nextflow/velocity/velocyto/main.nf @@ -0,0 +1,3983 @@ +// velocyto main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "velocyto", + "namespace" : "velocity", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to BAM file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--transcriptome", + "alternatives" : [ + "-t" + ], + "description" : "Path to GTF file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcode", + "alternatives" : [ + "-b" + ], + "description" : "Valid barcodes file, to filter the bam. If --bcfile is not specified all the cell barcodes will be included.\nCell barcodes should be specified in the bcfile as the 'CB' tag for each read\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--without_umi", + "description" : "foo", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--output", + "alternatives" : [ + "-o" + ], + "description" : "Velocyto loom file", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--logic", + "alternatives" : [ + "-l" + ], + "description" : "The logic to use for the filtering.", + "default" : [ + "Default" + ], + "required" : false, + "choices" : [ + "Default", + "Permissive10X", + "Intermediate10X", + "ValidatedIntrons10X", + "Stricter10X", + "ObservedSpanning10X", + "Discordant10X", + "SmartSeq2" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Runs the velocity analysis on a BAM file, outputting a loom file.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + }, + { + "type" : "file", + "path" : "/resources_test/rna_velocity" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "label" : [ + "highmem", + "lowcpu" + ], + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.10-slim-bookworm", + "target_registry" : "images.viash-hub.com", + "target_tag" : "main", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "build-essential", + "file" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "pip" : [ + "numpy<2", + "Cython" + ], + "upgrade" : true + }, + { + "type" : "python", + "user" : false, + "pip" : [ + "velocyto" + ], + "upgrade" : true + }, + { + "type" : "apt", + "packages" : [ + "samtools" + ], + "interactive" : false + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy==0.8.0" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/velocity/velocyto/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "/workdir/root/repo/target/nextflow/velocity/velocyto", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_TRANSCRIPTOME+x} ]; then echo "${VIASH_PAR_TRANSCRIPTOME}" | sed "s#'#'\\"'\\"'#g;s#.*#par_transcriptome='&'#" ; else echo "# par_transcriptome="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODE+x} ]; then echo "${VIASH_PAR_BARCODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_barcode='&'#" ; else echo "# par_barcode="; fi ) +$( if [ ! -z ${VIASH_PAR_WITHOUT_UMI+x} ]; then echo "${VIASH_PAR_WITHOUT_UMI}" | sed "s#'#'\\"'\\"'#g;s#.*#par_without_umi='&'#" ; else echo "# par_without_umi="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_LOGIC+x} ]; then echo "${VIASH_PAR_LOGIC}" | sed "s#'#'\\"'\\"'#g;s#.*#par_logic='&'#" ; else echo "# par_logic="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +extra_params=( ) + +if [ ! -z "\\$par_barcode" ]; then + extra_params+=( "--bcfile=\\$par_barcode" ) +fi +if [ "\\$par_without_umi" == "true" ]; then + extra_params+=( "--without-umi" ) +fi +if [ ! -z "\\$meta_cpus" ]; then + extra_params+=( "--samtools-threads" "\\$meta_cpus" ) +fi +if [ ! -z "\\$meta_memory_mb" ]; then + extra_params+=( "--samtools-memory" "\\$meta_memory_mb" ) +fi + +output_dir=\\`dirname "\\$par_output"\\` +sample_id=\\`basename "\\$par_output" .loom\\` + +if (file \\`readlink -f "\\$par_transcriptome"\\` | grep -q compressed ) ; then + # create temporary directory + tmpdir=\\$(mktemp -d "\\$meta_temp_dir/\\$meta_name-XXXXXXXX") + function clean_up { + rm -rf "\\$tmpdir" + } + trap clean_up EXIT + + zcat "\\$par_transcriptome" > "\\$tmpdir/genes.gtf" + par_transcriptome="\\$tmpdir/genes.gtf" +fi + +velocyto run \\\\ + "\\$par_input" \\\\ + "\\$par_transcriptome" \\\\ + "\\${extra_params[@]}" \\\\ + --outputfolder "\\$output_dir" \\\\ + --sampleid "\\$sample_id" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/openpipeline/velocity/velocyto", + "tag" : "main" + }, + "label" : [ + "highmem", + "lowcpu" + ], + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/velocity/velocyto/nextflow.config b/target/nextflow/velocity/velocyto/nextflow.config new file mode 100644 index 00000000..bfa81446 --- /dev/null +++ b/target/nextflow/velocity/velocyto/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'velocity/velocyto' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Runs the velocity analysis on a BAM file, outputting a loom file.' + author = 'Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/velocity/velocyto/nextflow_labels.config b/target/nextflow/velocity/velocyto/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/velocity/velocyto/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/velocity/velocyto/nextflow_schema.json b/target/nextflow/velocity/velocyto/nextflow_schema.json new file mode 100644 index 00000000..3ad8f61b --- /dev/null +++ b/target/nextflow/velocity/velocyto/nextflow_schema.json @@ -0,0 +1,84 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "velocyto", + "description": "Runs the velocity analysis on a BAM file, outputting a loom file.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to BAM file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "transcriptome": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to GTF file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "barcode": { + "type": "string", + "format": "path", + "description": "Valid barcodes file, to filter the bam", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "without_umi": { + "type": "boolean", + "description": "foo", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output": { + "type": "string", + "format": "path", + "description": "Velocyto loom file", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "logic": { + "type": "string", + "description": "The logic to use for the filtering.", + "help_text": "Type: `string`, multiple: `False`, default: `\"Default\"`, choices: ``Default`, `Permissive10X`, `Intermediate10X`, `ValidatedIntrons10X`, `Stricter10X`, `ObservedSpanning10X`, `Discordant10X`, `SmartSeq2``. ", + "enum": [ + "Default", + "Permissive10X", + "Intermediate10X", + "ValidatedIntrons10X", + "Stricter10X", + "ObservedSpanning10X", + "Discordant10X", + "SmartSeq2" + ], + "default": "Default" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/annotation/harmony_knn/.config.vsh.yaml b/target/nextflow/workflows/annotation/harmony_knn/.config.vsh.yaml new file mode 100644 index 00000000..4c79c8de --- /dev/null +++ b/target/nextflow/workflows/annotation/harmony_knn/.config.vsh.yaml @@ -0,0 +1,504 @@ +name: "harmony_knn" +namespace: "workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Query Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Input dataset consisting of the (unlabeled) query observations.\ + \ The dataset is expected to be pre-processed in the same way as --reference." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process. Should match the modality of the --reference\ + \ dataset." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the input dataset to process if .X is not to be used.\ + \ Should contain log normalized counts." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch_label" + description: "The .obs field in the input (query) dataset containing the batch\ + \ labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The .var field in the input (query) dataset containing gene names;\ + \ if not provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite_existing_key" + description: "If provided, will overwrite existing fields in the input dataset\ + \ when data are copied during the reference alignment process." + info: null + direction: "input" +- name: "Reference input" + arguments: + - type: "file" + name: "--reference" + description: "Reference dataset consisting of the labeled observations to train\ + \ the KNN classifier on. The dataset is expected to be pre-processed in the\ + \ same way as the --input query dataset." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer of the reference dataset to process if .X is not to be\ + \ used. Should contain log normalized counts." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The `.obs` key of the target cell type labels to transfer." + info: null + example: + - "cell_type" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The .var field in the reference dataset containing gene names; if\ + \ not provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch_label" + description: "The .obs field in the reference dataset containing the batch labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HVG subset arguments" + arguments: + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes to subset for.\n" + info: null + default: + - 2000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "PCA options" + arguments: + - type: "integer" + name: "--pca_num_components" + description: "Number of principal components to compute. Defaults to 50, or 1\ + \ - minimum dimension size of selected representation." + info: null + example: + - 25 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Harmony integration options" + arguments: + - type: "double" + name: "--harmony_theta" + description: "Diversity clustering penalty parameter. Can be set as a single value\ + \ for all batch observations or as multiple values, one for each observation\ + \ in the batches defined by --input_obs_batch_label. theta=0 does not encourage\ + \ any diversity. Larger values of theta result in more diverse clusters.\"\n" + info: null + default: + - 2.0 + required: false + min: 0.0 + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Leiden clustering options" + arguments: + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + min: 0.0 + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Neighbor classifier arguments" + arguments: + - type: "string" + name: "--knn_weights" + description: "Weight function used in prediction. Possible values are:\n`uniform`\ + \ (all points in each neighborhood are weighted equally) or \n`distance` (weight\ + \ points by the inverse of their distance)\n" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "distance" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn_n_neighbors" + description: "The number of neighbors to use in k-neighbor graph structure used\ + \ for fast approximate nearest neighbor search with PyNNDescent. \nLarger values\ + \ will result in more accurate search results at the cost of computation time.\n" + info: null + default: + - 15 + required: false + min: 5 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels predicted from\ + \ the classifier trained on the reference." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted cell labels.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obsm_integrated" + description: "In which .obsm slot to store the integrated embedding." + info: null + default: + - "X_integrated_harmony" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cell type annotation workflow by performing harmony integration of reference\ + \ and query dataset followed by KNN label transfer." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: + name: "Harmony integration followed by KNN label transfer" + test_dependencies: + - name: "harmony_knn_test" + namespace: "test_workflows/annotation" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "workflows/integration/harmony_leiden" + alias: "harmony_leiden_workflow" + repository: + type: "local" +- name: "labels_transfer/knn" + repository: + type: "local" +- name: "workflows/multiomics/split_h5mu" + repository: + type: "local" +- name: "dataflow/concatenate_h5mu" + repository: + type: "local" +- name: "dimred/pca" + repository: + type: "local" +- name: "feature_annotation/align_query_reference" + repository: + type: "local" +- name: "feature_annotation/highly_variable_features_scanpy" + repository: + type: "local" +- name: "transform/delete_layer" + repository: + type: "local" +- name: "workflows/multiomics/split_modalities" + repository: + type: "local" +- name: "dataflow/merge" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/annotation/harmony_knn/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/annotation/harmony_knn" + executable: "target/nextflow/workflows/annotation/harmony_knn/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/workflows/integration/harmony_leiden" + - "target/nextflow/labels_transfer/knn" + - "target/_private/nextflow/workflows/multiomics/split_h5mu" + - "target/nextflow/dataflow/concatenate_h5mu" + - "target/nextflow/dimred/pca" + - "target/nextflow/feature_annotation/align_query_reference" + - "target/nextflow/feature_annotation/highly_variable_features_scanpy" + - "target/nextflow/transform/delete_layer" + - "target/_private/nextflow/workflows/multiomics/split_modalities" + - "target/nextflow/dataflow/merge" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/annotation/harmony_knn/main.nf b/target/nextflow/workflows/annotation/harmony_knn/main.nf new file mode 100644 index 00000000..a037afb0 --- /dev/null +++ b/target/nextflow/workflows/annotation/harmony_knn/main.nf @@ -0,0 +1,4057 @@ +// harmony_knn main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "harmony_knn", + "namespace" : "workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Query Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process. Should match the modality of the --reference dataset.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch_label", + "description" : "The .obs field in the input (query) dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--overwrite_existing_key", + "description" : "If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.", + "direction" : "input" + } + ] + }, + { + "name" : "Reference input", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer of the reference dataset to process if .X is not to be used. Should contain log normalized counts.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The `.obs` key of the target cell type labels to transfer.", + "example" : [ + "cell_type" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The .var field in the reference dataset containing gene names; if not provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_batch_label", + "description" : "The .obs field in the reference dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "HVG subset arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_hvg", + "description" : "Number of highly variable genes to subset for.\n", + "default" : [ + 2000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "PCA options", + "arguments" : [ + { + "type" : "integer", + "name" : "--pca_num_components", + "description" : "Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation.", + "example" : [ + 25 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Harmony integration options", + "arguments" : [ + { + "type" : "double", + "name" : "--harmony_theta", + "description" : "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --input_obs_batch_label. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters.\\"\n", + "default" : [ + 2.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Leiden clustering options", + "arguments" : [ + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbor classifier arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--knn_weights", + "description" : "Weight function used in prediction. Possible values are:\n`uniform` (all points in each neighborhood are weighted equally) or \n`distance` (weight points by the inverse of their distance)\n", + "default" : [ + "uniform" + ], + "required" : false, + "choices" : [ + "uniform", + "distance" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--knn_n_neighbors", + "description" : "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. \nLarger values will result in more accurate search results at the cost of computation time.\n", + "default" : [ + 15 + ], + "required" : false, + "min" : 5, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted cell labels.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_pred\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_probability\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_integrated", + "description" : "In which .obsm slot to store the integrated embedding.", + "default" : [ + "X_integrated_harmony" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + } + ], + "info" : { + "name" : "Harmony integration followed by KNN label transfer", + "test_dependencies" : [ + { + "name" : "harmony_knn_test", + "namespace" : "test_workflows/annotation" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "workflows/integration/harmony_leiden", + "alias" : "harmony_leiden_workflow", + "repository" : { + "type" : "local" + } + }, + { + "name" : "labels_transfer/knn", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/concatenate_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dimred/pca", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/align_query_reference", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/highly_variable_features_scanpy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "transform/delete_layer", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_modalities", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/merge", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/annotation/harmony_knn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/annotation/harmony_knn", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { harmony_leiden as harmony_leiden_workflow_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/integration/harmony_leiden/main.nf" +harmony_leiden_workflow = harmony_leiden_workflow_viashalias.run(key: "harmony_leiden_workflow") +include { knn } from "${meta.resources_dir}/../../../../nextflow/labels_transfer/knn/main.nf" +include { split_h5mu } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_h5mu/main.nf" +include { concatenate_h5mu } from "${meta.resources_dir}/../../../../nextflow/dataflow/concatenate_h5mu/main.nf" +include { pca } from "${meta.resources_dir}/../../../../nextflow/dimred/pca/main.nf" +include { align_query_reference } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/align_query_reference/main.nf" +include { highly_variable_features_scanpy } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" +include { delete_layer } from "${meta.resources_dir}/../../../../nextflow/transform/delete_layer/main.nf" +include { split_modalities } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_modalities/main.nf" +include { merge } from "${meta.resources_dir}/../../../../nextflow/dataflow/merge/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + + modalities_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Align query and reference datasets + | align_query_reference.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "reference": "reference", + "reference_layer": "reference_layer", + "reference_obs_batch": "reference_obs_batch_label", + "reference_obs_label": "reference_obs_target", + "reference_var_gene_names": "reference_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "overwrite_existing_key": "overwrite_existing_key" + ], + args: [ + "input_id": "query", + "reference_id": "reference", + "output_layer": "_counts", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "output_var_common_genes": "_common_vars" + ], + toState: [ + "input": "output_query", + "reference": "output_reference" + ] + ) + + | split_modalities.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + + rna_ch = modalities_ch + + | filter{id, state -> state.modality == "rna"} + + // Concatenate query and reference datasets prior to integration + // Only concatenate rna modality in this channel + | concatenate_h5mu.run( + fromState: { id, state -> [ + "input": [state.input, state.reference] + ] + }, + args: [ + "input_id": ["query", "reference"], + "modality": "rna", + "other_axis_mode": "move" + ], + toState: ["input": "output"] + ) + | view {"After concatenation: $it"} + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "modality": "modality", + "n_top_features": "n_hvg" + ], + args: [ + "layer": "_counts", + "var_input": "_common_vars", + "var_name_filter": "_common_hvg", + "obs_batch_key": "_sample_id" + ], + toState: [ + "input": "output" + ] + ) + | pca.run( + fromState: [ + "input": "input", + "modality": "modality", + "overwrite": "overwrite_existing_key", + "num_compontents": "pca_num_components" + ], + args: [ + "layer": "_counts", + "var_input": "_common_hvg", + "obsm_output": "X_pca_query_reference", + "varm_output": "pca_loadings_query_reference", + "uns_output": "pca_variance_query_reference", + ], + toState: [ + "input": "output" + ] + ) + | delete_layer.run( + key: "delete_aligned_lognormalized_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_counts", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Run harmony integration with leiden clustering + | harmony_leiden_workflow.run( + fromState: { id, state -> [ + "id": id, + "input": state.input, + "modality": state.modality, + "obsm_integrated": state.output_obsm_integrated, + "theta": state.harmony_theta, + "leiden_resolution": state.leiden_resolution, + ]}, + args: [ + "embedding": "X_pca_query_reference", + "uns_neighbors": "harmonypy_integration_neighbors", + "obsp_neighbor_distances": "harmonypy_integration_distances", + "obsp_neighbor_connectivities": "harmonypy_integration_connectivities", + "obs_cluster": "harmony_integration_leiden", + "obsm_umap": "X_leiden_harmony_umap", + "obs_covariates": "_sample_id" + ], + toState: ["input": "output"] + ) + | view {"After integration: $it"} + // Split integrated dataset back into a separate reference and query dataset + | split_h5mu.run( + fromState: [ + "input": "input", + "modality": "modality" + ], + args: [ + "obs_feature": "_dataset", + "output_files": "sample_files.csv", + "drop_obs_nan": "true", + "output": "ref_query" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ], + auto: [ publish: true ] + ) + | view {"After sample splitting: $it"} + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + if (workflow.stubRun) { + def output_files = outputDir.listFiles() + def new_state = state + [ + "input": output_files[0], + "reference": output_files[1], + ] + return [id, new_state] + } + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["input": integrated_query, "reference": integrated_reference] + [id, state + newKeys] + } + // remove keys from split files + | map {id, state -> + def keysToRemove = ["output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + // Perform KNN label transfer from integrated reference to integrated query + | knn.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_target", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output_compression": "output_compression", + "weights": "knn_weights", + "n_neighbors": "knn_n_neighbors", + "output": "workflow_output" + ], + toState: ["input": "output"] + // toState: {id, output, state -> ["output": output.output]} + ) + | view {"After processing RNA modality: $it"} + + other_mod_ch = modalities_ch + | filter{id, state -> state.modality != "rna"} + + output_ch = rna_ch.mix(other_mod_ch) + | groupTuple(by: 0, sort: "hash") + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "reference"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | merge.run( + fromState: ["input": "input"], + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/annotation/harmony_knn/nextflow.config b/target/nextflow/workflows/annotation/harmony_knn/nextflow.config new file mode 100644 index 00000000..36486a5a --- /dev/null +++ b/target/nextflow/workflows/annotation/harmony_knn/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/annotation/harmony_knn' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer.' + author = 'Dorien Roosen, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/annotation/harmony_knn/nextflow_labels.config b/target/nextflow/workflows/annotation/harmony_knn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/annotation/harmony_knn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/annotation/harmony_knn/nextflow_schema.json b/target/nextflow/workflows/annotation/harmony_knn/nextflow_schema.json new file mode 100644 index 00000000..ff6e07b1 --- /dev/null +++ b/target/nextflow/workflows/annotation/harmony_knn/nextflow_schema.json @@ -0,0 +1,264 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "harmony_knn", + "description": "Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer.", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "In which `.obs` slots to store the predicted cell labels.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n", + "help_text": "Type: `string`, multiple: `True`. " + }, + "output_obs_probability": { + "type": "array", + "items": { + "type": "string" + }, + "description": "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\"_probability\"` suffix.\n", + "help_text": "Type: `string`, multiple: `True`. " + }, + "output_obsm_integrated": { + "type": "string", + "description": "In which .obsm slot to store the integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_integrated_harmony\"`. ", + "default": "X_integrated_harmony" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + } + } + }, + "query input": { + "title": "Query Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input dataset consisting of the (unlabeled) query observations", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer of the input dataset to process if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_batch_label": { + "type": "string", + "description": "The .obs field in the input (query) dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "overwrite_existing_key": { + "type": "boolean", + "description": "If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "reference input": { + "title": "Reference input", + "type": "object", + "description": "No description", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference dataset consisting of the labeled observations to train the KNN classifier on", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer of the reference dataset to process if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The `.obs` key of the target cell type labels to transfer.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"cell_type\"`. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The .var field in the reference dataset containing gene names; if not provided, the .var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_batch_label": { + "type": "string", + "description": "The .obs field in the reference dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + } + } + }, + "hvg subset arguments": { + "title": "HVG subset arguments", + "type": "object", + "description": "No description", + "properties": { + "n_hvg": { + "type": "integer", + "description": "Number of highly variable genes to subset for.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `2000`. ", + "default": 2000 + } + } + }, + "pca options": { + "title": "PCA options", + "type": "object", + "description": "No description", + "properties": { + "pca_num_components": { + "type": "integer", + "description": "Number of principal components to compute", + "help_text": "Type: `integer`, multiple: `False`, example: `25`. " + } + } + }, + "harmony integration options": { + "title": "Harmony integration options", + "type": "object", + "description": "No description", + "properties": { + "harmony_theta": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Diversity clustering penalty parameter", + "help_text": "Type: `double`, multiple: `True`, default: `[2.0]`. ", + "default": [ + 2.0 + ] + } + } + }, + "leiden clustering options": { + "title": "Leiden clustering options", + "type": "object", + "description": "No description", + "properties": { + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "neighbor classifier arguments": { + "title": "Neighbor classifier arguments", + "type": "object", + "description": "No description", + "properties": { + "knn_weights": { + "type": "string", + "description": "Weight function used in prediction", + "help_text": "Type: `string`, multiple: `False`, default: `\"uniform\"`, choices: ``uniform`, `distance``. ", + "enum": [ + "uniform", + "distance" + ], + "default": "uniform" + }, + "knn_n_neighbors": { + "type": "integer", + "description": "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent", + "help_text": "Type: `integer`, multiple: `False`, default: `15`. ", + "default": 15 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/query input" + }, + { + "$ref": "#/$defs/reference input" + }, + { + "$ref": "#/$defs/hvg subset arguments" + }, + { + "$ref": "#/$defs/pca options" + }, + { + "$ref": "#/$defs/harmony integration options" + }, + { + "$ref": "#/$defs/leiden clustering options" + }, + { + "$ref": "#/$defs/neighbor classifier arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/annotation/scanvi_scarches/.config.vsh.yaml b/target/nextflow/workflows/annotation/scanvi_scarches/.config.vsh.yaml new file mode 100644 index 00000000..0dd9809e --- /dev/null +++ b/target/nextflow/workflows/annotation/scanvi_scarches/.config.vsh.yaml @@ -0,0 +1,550 @@ +name: "scanvi_scarches" +namespace: "workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Query Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Input dataset consisting of the (unlabeled) query observations.\ + \ The dataset is expected to be pre-processed in the same way as --reference." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process. Should match the modality of the --reference\ + \ dataset." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Which layer to use for integration if .X is not to be used. Should\ + \ match the layer of the --reference dataset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch_label" + description: "The .obs field in the input (query) dataset containing the batch\ + \ labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Reference input" + arguments: + - type: "file" + name: "--reference" + description: "Reference dataset consisting of the labeled observations to train\ + \ the KNN classifier on. The dataset is expected to be pre-processed in the\ + \ same way as the --input query dataset." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The `.obs` key containing the target labels." + info: null + example: + - "cell_type" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch_label" + description: "The .obs field in the reference dataset containing the batch labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_size_factor" + description: "Key in adata.obs for size factor information. Instead of using library\ + \ size as a size factor,\nthe provided size factor column will be used as offset\ + \ in the mean of the likelihood.\nAssumed to be on linear scale.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--unlabeled_category" + description: "Value in the --reference_obs_batch_label field that indicates unlabeled\ + \ observations\n" + info: null + default: + - "Unknown" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_hvg" + description: ".var column containing highly variable genes. If not provided, genes\ + \ will not be subset." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: ".var column containing gene names. By default, use the index." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scVI, scANVI and scArches training options" + arguments: + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Leiden clustering options" + arguments: + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + min: 0.0 + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Neighbor classifier arguments" + arguments: + - type: "string" + name: "--knn_weights" + description: "Weight function used in prediction. Possible values are:\n`uniform`\ + \ (all points in each neighborhood are weighted equally) or \n`distance` (weight\ + \ points by the inverse of their distance)\n" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "distance" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn_n_neighbors" + description: "The number of neighbors to use in k-neighbor graph structure used\ + \ for fast approximate nearest neighbor search with PyNNDescent. \nLarger values\ + \ will result in more accurate search results at the cost of computation time.\n" + info: null + default: + - 15 + required: false + min: 5 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels predicted from\ + \ the classifier trained on the reference." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which .obs slot to store the predicted labels." + info: null + default: + - "scanvi_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which. obs slot to store the probabilities of the predicted labels." + info: null + default: + - "scanvi_probabilities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obsm_integrated" + description: "In which .obsm slot to store the integrated embedding." + info: null + default: + - "X_integrated_scanvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression format to be used on the output h5mu object.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Path to the resulting scANVI model that was updated with query data." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cell type annotation workflow using ScanVI with scArches for reference\ + \ mapping." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "TS_Blood_filtered.h5mu" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: + name: "scANVI - scArches workflow" + test_dependencies: + - name: "scanvi_scarches_test" + namespace: "test_workflows/annotation" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "integrate/scvi" + repository: + type: "local" +- name: "annotate/scanvi" + repository: + type: "local" +- name: "integrate/scarches" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/annotation/scanvi_scarches/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/annotation/scanvi_scarches" + executable: "target/nextflow/workflows/annotation/scanvi_scarches/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/integrate/scvi" + - "target/nextflow/annotate/scanvi" + - "target/nextflow/integrate/scarches" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/annotation/scanvi_scarches/main.nf b/target/nextflow/workflows/annotation/scanvi_scarches/main.nf new file mode 100644 index 00000000..5844391d --- /dev/null +++ b/target/nextflow/workflows/annotation/scanvi_scarches/main.nf @@ -0,0 +1,3957 @@ +// scanvi_scarches main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scanvi_scarches", + "namespace" : "workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Query Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process. Should match the modality of the --reference dataset.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Which layer to use for integration if .X is not to be used. Should match the layer of the --reference dataset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch_label", + "description" : "The .obs field in the input (query) dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_size_factor", + "description" : "Key in adata.obs for size factor information. Instead of using library size as a size factor,\nthe provided size factor column will be used as offset in the mean of the likelihood.\nAssumed to be on linear scale.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : ".var column containing gene names. By default, use the index.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Reference input", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The `.obs` key containing the target labels.", + "example" : [ + "cell_type" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_batch_label", + "description" : "The .obs field in the reference dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_size_factor", + "description" : "Key in adata.obs for size factor information. Instead of using library size as a size factor,\nthe provided size factor column will be used as offset in the mean of the likelihood.\nAssumed to be on linear scale.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--unlabeled_category", + "description" : "Value in the --reference_obs_batch_label field that indicates unlabeled observations\n", + "default" : [ + "Unknown" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_hvg", + "description" : ".var column containing highly variable genes. If not provided, genes will not be subset.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : ".var column containing gene names. By default, use the index.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "scVI, scANVI and scArches training options", + "arguments" : [ + { + "type" : "boolean", + "name" : "--early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Leiden clustering options", + "arguments" : [ + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbor classifier arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--knn_weights", + "description" : "Weight function used in prediction. Possible values are:\n`uniform` (all points in each neighborhood are weighted equally) or \n`distance` (weight points by the inverse of their distance)\n", + "default" : [ + "uniform" + ], + "required" : false, + "choices" : [ + "uniform", + "distance" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--knn_n_neighbors", + "description" : "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. \nLarger values will result in more accurate search results at the cost of computation time.\n", + "default" : [ + 15 + ], + "required" : false, + "min" : 5, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which .obs slot to store the predicted labels.", + "default" : [ + "scanvi_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which. obs slot to store the probabilities of the predicted labels.", + "default" : [ + "scanvi_probabilities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_integrated", + "description" : "In which .obsm slot to store the integrated embedding.", + "default" : [ + "X_integrated_scanvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression format to be used on the output h5mu object.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_model", + "description" : "Path to the resulting scANVI model that was updated with query data.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Cell type annotation workflow using ScanVI with scArches for reference mapping.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "info" : { + "name" : "scANVI - scArches workflow", + "test_dependencies" : [ + { + "name" : "scanvi_scarches_test", + "namespace" : "test_workflows/annotation" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "integrate/scvi", + "repository" : { + "type" : "local" + } + }, + { + "name" : "annotate/scanvi", + "repository" : { + "type" : "local" + } + }, + { + "name" : "integrate/scarches", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/annotation/scanvi_scarches/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/annotation/scanvi_scarches", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { scvi } from "${meta.resources_dir}/../../../../nextflow/integrate/scvi/main.nf" +include { scanvi } from "${meta.resources_dir}/../../../../nextflow/annotate/scanvi/main.nf" +include { scarches } from "${meta.resources_dir}/../../../../nextflow/integrate/scarches/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output, "workflow_output_model": state.output_model] + [id, new_state] + } + + // Integrate & generate scvi model from the reference data + | scvi.run( + fromState: [ + "input": "reference", + "modality": "modality", + "input_layer": "layer", + "obs_batch": "reference_obs_batch_label", + "var_input": "reference_var_hvg", + "var_gene_names": "reference_var_gene_names", + "obs_size_factor": "reference_obs_size_factor", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + ], + args: [ + "obsm_output": "X_integrated_scvi" + ], + toState: [ + "reference": "output", + "output_scvi_model": "output_model" + ] + ) + + // Create scanvi model from the scvi reference model and integrate reference data + | scanvi.run( + fromState: [ + "input": "reference", + "modality": "modality", + "input_layer": "layer", + "var_input": "reference_var_hvg", + "var_gene_names": "reference_var_gene_names", + "obs_labels": "reference_obs_target", + "unlabeled_category": "unlabeled_category", + "scvi_model": "output_scvi_model", + "obsm_output": "output_obsm_integrated", + "obs_output_predictions": "output_obs_predictions", + "obs_output_probabilities": "output_obs_probability", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + ], + toState: [ + "reference": "output", + "scanvi_model": "output_model" + ] + ) + + // Update scANVI reference model with query data and integrate+annotate query data + | scarches.run( + fromState: [ + "input": "input", + "layer": "layer", + "modality": "modality", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "input_obs_size_factor": "input_obs_size_factor", + "reference": "scanvi_model", + "obsm_output": "output_obsm_integrated", + "obs_output_predictions": "output_obs_predictions", + "obs_output_probabilities": "output_obs_probability", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + "output": "workflow_output", + "model_output": "workflow_output_model" + ], + toState: [ + "input": "output", + "output_model": "model_output" + ] + ) + | view {"After scarches: $it"} + // Perform neighbors calculation, leiden clustering and umap dimred on the integrated query data + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "output_obsm_integrated", + "output": "workflow_output", + "leiden_resolution": "leiden_resolution" + ], + args: [ + "uns_neighbors": "scanvi_integration_neighbors", + "obsp_neighbor_distances": "scanvi_integration_distances", + "obsp_neighbor_connectivities": "scanvi_integration_connectivities", + "obs_cluster": "scanvi_integration_leiden", + "obsm_umap": "X_scanvi_umap" + ], + toState: [ "output": "output" ] + ) + | setState(["output", "output_model"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/annotation/scanvi_scarches/nextflow.config b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow.config new file mode 100644 index 00000000..072d2c34 --- /dev/null +++ b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/annotation/scanvi_scarches' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Cell type annotation workflow using ScanVI with scArches for reference mapping.' + author = 'Dorien Roosen, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_labels.config b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_schema.json b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_schema.json new file mode 100644 index 00000000..448eeaad --- /dev/null +++ b/target/nextflow/workflows/annotation/scanvi_scarches/nextflow_schema.json @@ -0,0 +1,280 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scanvi_scarches", + "description": "Cell type annotation workflow using ScanVI with scArches for reference mapping.", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "string", + "description": "In which .obs slot to store the predicted labels.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_pred\"`. ", + "default": "scanvi_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "In which", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_probabilities\"`. ", + "default": "scanvi_probabilities" + }, + "output_obsm_integrated": { + "type": "string", + "description": "In which .obsm slot to store the integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_integrated_scanvi\"`. ", + "default": "X_integrated_scanvi" + }, + "output_compression": { + "type": "string", + "description": "The compression format to be used on the output h5mu object.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + }, + "output_model": { + "type": "string", + "format": "path", + "description": "Path to the resulting scANVI model that was updated with query data.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_model\"`, direction: `output`. ", + "default": "$id.$key.output_model" + } + } + }, + "query input": { + "title": "Query Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input dataset consisting of the (unlabeled) query observations", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Which layer to use for integration if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_batch_label": { + "type": "string", + "description": "The .obs field in the input (query) dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + }, + "input_obs_size_factor": { + "type": "string", + "description": "Key in adata.obs for size factor information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": ".var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "reference input": { + "title": "Reference input", + "type": "object", + "description": "No description", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference dataset consisting of the labeled observations to train the KNN classifier on", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The `.obs` key containing the target labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"cell_type\"`. " + }, + "reference_obs_batch_label": { + "type": "string", + "description": "The .obs field in the reference dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + }, + "reference_obs_size_factor": { + "type": "string", + "description": "Key in adata.obs for size factor information", + "help_text": "Type: `string`, multiple: `False`. " + }, + "unlabeled_category": { + "type": "string", + "description": "Value in the --reference_obs_batch_label field that indicates unlabeled observations\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"Unknown\"`. ", + "default": "Unknown" + }, + "reference_var_hvg": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_var_gene_names": { + "type": "string", + "description": ".var column containing gene names", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "scvi, scanvi and scarches training options": { + "title": "scVI, scANVI and scArches training options", + "type": "object", + "description": "No description", + "properties": { + "early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "leiden clustering options": { + "title": "Leiden clustering options", + "type": "object", + "description": "No description", + "properties": { + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "neighbor classifier arguments": { + "title": "Neighbor classifier arguments", + "type": "object", + "description": "No description", + "properties": { + "knn_weights": { + "type": "string", + "description": "Weight function used in prediction", + "help_text": "Type: `string`, multiple: `False`, default: `\"uniform\"`, choices: ``uniform`, `distance``. ", + "enum": [ + "uniform", + "distance" + ], + "default": "uniform" + }, + "knn_n_neighbors": { + "type": "integer", + "description": "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent", + "help_text": "Type: `integer`, multiple: `False`, default: `15`. ", + "default": 15 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/query input" + }, + { + "$ref": "#/$defs/reference input" + }, + { + "$ref": "#/$defs/scvi, scanvi and scarches training options" + }, + { + "$ref": "#/$defs/leiden clustering options" + }, + { + "$ref": "#/$defs/neighbor classifier arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/annotation/scgpt_annotation/.config.vsh.yaml b/target/nextflow/workflows/annotation/scgpt_annotation/.config.vsh.yaml new file mode 100644 index 00000000..b431ef5c --- /dev/null +++ b/target/nextflow/workflows/annotation/scgpt_annotation/.config.vsh.yaml @@ -0,0 +1,492 @@ +name: "scgpt_annotation" +namespace: "workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Query input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the input dataset to process if .X is not to be used.\ + \ Should contain log normalized counts.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The .var field in the input (query) containing gene names; if not\ + \ provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch_label" + description: "The .obs field in the input (query) dataset containing the batch\ + \ labels.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model input" + arguments: + - type: "file" + name: "--model" + description: "The scGPT model file. \nMust be a fine-tuned model that contains\ + \ keys for checkpoints (--finetuned_checkpoints_key) and cell type label mapper(--label_mapper_key).\n" + info: null + example: + - "best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "The scGPT model configuration file. \n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "The scGPT model vocabulary file.\n" + info: null + example: + - "vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pre-trained checkpoints.\n" + info: null + default: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--label_mapper_key" + description: "Key in the model file containing the cell type class to label mapper\ + \ dictionary.\n" + info: null + default: + - "id_to_class" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output file path" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_compression" + description: "The compression algorithm to use for the output h5mu file.\n" + info: null + example: + - "gzip" + required: false + choices: + - "gzip" + - "lzf" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "The name of the adata.obs column to write predicted cell type labels\ + \ to.\n" + info: null + default: + - "scgpt_pred" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "The name of the adata.obs column to write predicted cell type labels\ + \ to.\n" + info: null + default: + - "scgpt_probability" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Padding arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "Token used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HVG subset arguments" + arguments: + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes to subset for.\n" + info: null + default: + - 1200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--hvg_flavor" + description: "Method to be used for identifying highly variable genes. \nNote\ + \ that the default for this workflow (`cell_ranger`) is not the default method\ + \ for scanpy hvg detection (`seurat`).\n" + info: null + default: + - "cell_ranger" + required: false + choices: + - "cell_ranger" + - "seurat" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Tokenization arguments" + arguments: + - type: "integer" + name: "--max_seq_len" + description: "The maximum sequence length of the tokenized data.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Embedding arguments" + arguments: + - type: "boolean" + name: "--dsbn" + description: "Apply domain-specific batch normalization\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size to be used for embedding inference.\n" + info: null + default: + - 64 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Binning arguments" + arguments: + - type: "integer" + name: "--n_input_bins" + description: "The number of bins to discretize the data into; When no value is\ + \ provided, data won't be binned.\n" + info: null + default: + - 51 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation used for binning. If not set,\ + \ no seed is used.\n" + info: null + required: false + min: 0 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Cell type annotation workflow using scGPT. \nThe workflow takes a pre-processed\ + \ h5mu file as query input, and performs\n - subsetting for HVG\n - cross-checking\ + \ of genes with the model vocabulary\n - binning of gene counts\n - padding and\ + \ tokenizing of genes\n - transformer-based cell type prediction\nNote that cell-type\ + \ prediction using scGPT is only possible using a fine-tuned scGPT model.\n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "scgpt" +info: + name: "scGPT Annotation" + test_dependencies: + - name: "scgpt_annotation_test" + namespace: "test_workflows/annotation" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "scgpt/cross_check_genes" + repository: + type: "local" +- name: "scgpt/binning" + repository: + type: "local" +- name: "feature_annotation/highly_variable_features_scanpy" + repository: + type: "local" +- name: "filter/do_filter" + repository: + type: "local" +- name: "scgpt/pad_tokenize" + repository: + type: "local" +- name: "scgpt/cell_type_annotation" + alias: "scgpt_celltype_annotation" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/annotation/scgpt_annotation/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/annotation/scgpt_annotation" + executable: "target/nextflow/workflows/annotation/scgpt_annotation/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/scgpt/cross_check_genes" + - "target/nextflow/scgpt/binning" + - "target/nextflow/feature_annotation/highly_variable_features_scanpy" + - "target/nextflow/filter/do_filter" + - "target/nextflow/scgpt/pad_tokenize" + - "target/nextflow/scgpt/cell_type_annotation" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/annotation/scgpt_annotation/main.nf b/target/nextflow/workflows/annotation/scgpt_annotation/main.nf new file mode 100644 index 00000000..b234521f --- /dev/null +++ b/target/nextflow/workflows/annotation/scgpt_annotation/main.nf @@ -0,0 +1,3906 @@ +// scgpt_annotation main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) +// * Elizabeth Mlynarski (contributor) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scgpt_annotation", + "namespace" : "workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Query input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the input file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The .var field in the input (query) containing gene names; if not provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch_label", + "description" : "The .obs field in the input (query) dataset containing the batch labels.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model input", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "The scGPT model file. \nMust be a fine-tuned model that contains keys for checkpoints (--finetuned_checkpoints_key) and cell type label mapper(--label_mapper_key).\n", + "example" : [ + "best_model.pt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_config", + "description" : "The scGPT model configuration file. \n", + "example" : [ + "args.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_vocab", + "description" : "The scGPT model vocabulary file.\n", + "example" : [ + "vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--finetuned_checkpoints_key", + "description" : "Key in the model file containing the pre-trained checkpoints.\n", + "default" : [ + "model_state_dict" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--label_mapper_key", + "description" : "Key in the model file containing the cell type class to label mapper dictionary.\n", + "default" : [ + "id_to_class" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output file path", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_compression", + "description" : "The compression algorithm to use for the output h5mu file.\n", + "example" : [ + "gzip" + ], + "required" : false, + "choices" : [ + "gzip", + "lzf" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "The name of the adata.obs column to write predicted cell type labels to.\n", + "default" : [ + "scgpt_pred" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "The name of the adata.obs column to write predicted cell type labels to.\n", + "default" : [ + "scgpt_probability" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Padding arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "Token used for padding.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--pad_value", + "description" : "The value of the padding token.\n", + "default" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "HVG subset arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_hvg", + "description" : "Number of highly variable genes to subset for.\n", + "default" : [ + 1200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--hvg_flavor", + "description" : "Method to be used for identifying highly variable genes. \nNote that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`).\n", + "default" : [ + "cell_ranger" + ], + "required" : false, + "choices" : [ + "cell_ranger", + "seurat" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Tokenization arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--max_seq_len", + "description" : "The maximum sequence length of the tokenized data.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Embedding arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--dsbn", + "description" : "Apply domain-specific batch normalization\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size to be used for embedding inference.\n", + "default" : [ + 64 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Binning arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_input_bins", + "description" : "The number of bins to discretize the data into; When no value is provided, data won't be binned.\n", + "default" : [ + 51 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "Seed for random number generation used for binning. If not set, no seed is used.\n", + "required" : false, + "min" : 0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Cell type annotation workflow using scGPT. \nThe workflow takes a pre-processed h5mu file as query input, and performs\n - subsetting for HVG\n - cross-checking of genes with the model vocabulary\n - binning of gene counts\n - padding and tokenizing of genes\n - transformer-based cell type prediction\nNote that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model.\n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt" + } + ], + "info" : { + "name" : "scGPT Annotation", + "test_dependencies" : [ + { + "name" : "scgpt_annotation_test", + "namespace" : "test_workflows/annotation" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "scgpt/cross_check_genes", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/binning", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/highly_variable_features_scanpy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/do_filter", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/pad_tokenize", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/cell_type_annotation", + "alias" : "scgpt_celltype_annotation", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/annotation/scgpt_annotation/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/annotation/scgpt_annotation", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cross_check_genes } from "${meta.resources_dir}/../../../../nextflow/scgpt/cross_check_genes/main.nf" +include { binning } from "${meta.resources_dir}/../../../../nextflow/scgpt/binning/main.nf" +include { highly_variable_features_scanpy } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" +include { do_filter } from "${meta.resources_dir}/../../../../nextflow/filter/do_filter/main.nf" +include { pad_tokenize } from "${meta.resources_dir}/../../../../nextflow/scgpt/pad_tokenize/main.nf" +include { cell_type_annotation as scgpt_celltype_annotation_viashalias } from "${meta.resources_dir}/../../../../nextflow/scgpt/cell_type_annotation/main.nf" +scgpt_celltype_annotation = scgpt_celltype_annotation_viashalias.run(key: "scgpt_celltype_annotation") + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Annotate the mudata object with highly variable genes. + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "layer": "input_layer", + "modality": "modality", + "n_top_features": "n_hvg", + "flavor": "hvg_flavor" + ], + args: [ + "var_name_filter": "scgpt_filter_with_hvg" + ], + toState: ["input": "output"] + ) + // Check whether the genes are part of the provided vocabulary. + // Subsets for genes present in vocab only. + | cross_check_genes.run( + fromState: [ + "input": "input", + "modality": "modality", + "vocab_file": "model_vocab", + "input_var_gene_names": "input_var_gene_names", + "output": "output", + "pad_token": "pad_token" + ], + args: [ + "var_input": "scgpt_filter_with_hvg", + "output_var_filter": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // Bins the data into a fixed number of bins. + | binning.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "n_input_bins": "n_input_bins", + "output": "output", + "seed": "seed" + ], + args: [ + "output_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // Padding and tokenization of gene count values. + | pad_tokenize.run( + fromState: [ + "input": "input", + "modality": "modality", + "model_vocab": "model_vocab", + "var_gene_names": "input_var_gene_names", + "pad_token": "pad_token", + "pad_value": "pad_value", + "max_seq_len": "max_seq_len", + "output": "output" + ], + args: [ + "input_obsm_binned_counts": "binned_counts", + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask", + "var_input": "scgpt_cross_checked_genes" + ], + toState: ["input": "output"] + ) + // scGPT decoder-based cell type annotation. + | scgpt_celltype_annotation.run( + fromState: [ + "model": "model", + "model_vocab": "model_vocab", + "model_config": "model_config", + "label_mapper_key": "label_mapper_key", + "finetuned_checkpoints_key": "finetuned_checkpoints_key", + "input": "input", + "modality": "modality", + "obs_batch_label": "input_obs_batch_label", + "pad_token": "pad_token", + "pad_value": "pad_value", + "n_input_bins": "n_input_bins", + "dsbn": "dsbn", + "batch_size": "batch_size", + "seed": "seed", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output": "workflow_output", + "output_compression": "output_compression" + ], + args: [ + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized" + ], + toState: {id, output, state -> ["output": output.output]} + ) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/annotation/scgpt_annotation/nextflow.config b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow.config new file mode 100644 index 00000000..e1647222 --- /dev/null +++ b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/annotation/scgpt_annotation' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Cell type annotation workflow using scGPT. \nThe workflow takes a pre-processed h5mu file as query input, and performs\n - subsetting for HVG\n - cross-checking of genes with the model vocabulary\n - binning of gene counts\n - padding and tokenizing of genes\n - transformer-based cell type prediction\nNote that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model.\n' + author = 'Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_labels.config b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_schema.json b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_schema.json new file mode 100644 index 00000000..057e9a84 --- /dev/null +++ b/target/nextflow/workflows/annotation/scgpt_annotation/nextflow_schema.json @@ -0,0 +1,255 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scgpt_annotation", + "description": "Cell type annotation workflow using scGPT. \nThe workflow takes a pre-processed h5mu file as query input, and performs\n - subsetting for HVG\n - cross-checking of genes with the model vocabulary\n - binning of gene counts\n - padding and tokenizing of genes\n - transformer-based cell type prediction\nNote that cell-type prediction using scGPT is only possible using a fine-tuned scGPT model.\n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output file path", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_compression": { + "type": "string", + "description": "The compression algorithm to use for the output h5mu file.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ", + "enum": [ + "gzip", + "lzf" + ] + }, + "output_obs_predictions": { + "type": "string", + "description": "The name of the adata.obs column to write predicted cell type labels to.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scgpt_pred\"`. ", + "default": "scgpt_pred" + }, + "output_obs_probability": { + "type": "string", + "description": "The name of the adata.obs column to write predicted cell type labels to.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scgpt_probability\"`. ", + "default": "scgpt_probability" + } + } + }, + "query input": { + "title": "Query input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer of the input dataset to process if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The .var field in the input (query) containing gene names; if not provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_obs_batch_label": { + "type": "string", + "description": "The .obs field in the input (query) dataset containing the batch labels.\n", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "model input": { + "title": "Model input", + "type": "object", + "description": "No description", + "properties": { + "model": { + "type": "string", + "format": "path", + "exists": true, + "description": "The scGPT model file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"best_model.pt\"`. " + }, + "model_config": { + "type": "string", + "format": "path", + "exists": true, + "description": "The scGPT model configuration file", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"args.json\"`. " + }, + "model_vocab": { + "type": "string", + "format": "path", + "exists": true, + "description": "The scGPT model vocabulary file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"vocab.json\"`. " + }, + "finetuned_checkpoints_key": { + "type": "string", + "description": "Key in the model file containing the pre-trained checkpoints.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"model_state_dict\"`. ", + "default": "model_state_dict" + }, + "label_mapper_key": { + "type": "string", + "description": "Key in the model file containing the cell type class to label mapper dictionary.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"id_to_class\"`. ", + "default": "id_to_class" + } + } + }, + "padding arguments": { + "title": "Padding arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "Token used for padding.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "pad_value": { + "type": "integer", + "description": "The value of the padding token.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `-2`. ", + "default": -2 + } + } + }, + "hvg subset arguments": { + "title": "HVG subset arguments", + "type": "object", + "description": "No description", + "properties": { + "n_hvg": { + "type": "integer", + "description": "Number of highly variable genes to subset for.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `1200`. ", + "default": 1200 + }, + "hvg_flavor": { + "type": "string", + "description": "Method to be used for identifying highly variable genes", + "help_text": "Type: `string`, multiple: `False`, default: `\"cell_ranger\"`, choices: ``cell_ranger`, `seurat``. ", + "enum": [ + "cell_ranger", + "seurat" + ], + "default": "cell_ranger" + } + } + }, + "tokenization arguments": { + "title": "Tokenization arguments", + "type": "object", + "description": "No description", + "properties": { + "max_seq_len": { + "type": "integer", + "description": "The maximum sequence length of the tokenized data.\n", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "embedding arguments": { + "title": "Embedding arguments", + "type": "object", + "description": "No description", + "properties": { + "dsbn": { + "type": "boolean", + "description": "Apply domain-specific batch normalization\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "batch_size": { + "type": "integer", + "description": "The batch size to be used for embedding inference.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `64`. ", + "default": 64 + } + } + }, + "binning arguments": { + "title": "Binning arguments", + "type": "object", + "description": "No description", + "properties": { + "n_input_bins": { + "type": "integer", + "description": "The number of bins to discretize the data into; When no value is provided, data won't be binned.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `51`. ", + "default": 51 + }, + "seed": { + "type": "integer", + "description": "Seed for random number generation used for binning", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/query input" + }, + { + "$ref": "#/$defs/model input" + }, + { + "$ref": "#/$defs/padding arguments" + }, + { + "$ref": "#/$defs/hvg subset arguments" + }, + { + "$ref": "#/$defs/tokenization arguments" + }, + { + "$ref": "#/$defs/embedding arguments" + }, + { + "$ref": "#/$defs/binning arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/annotation/scvi_knn/.config.vsh.yaml b/target/nextflow/workflows/annotation/scvi_knn/.config.vsh.yaml new file mode 100644 index 00000000..b69f9eba --- /dev/null +++ b/target/nextflow/workflows/annotation/scvi_knn/.config.vsh.yaml @@ -0,0 +1,579 @@ +name: "scvi_knn" +namespace: "workflows/annotation" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Query Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Input dataset consisting of the (unlabeled) query observations." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process. Should match the modality of the --reference\ + \ dataset." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the input dataset containing the raw counts if .X is\ + \ not to be used." + info: null + example: + - "counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer_lognormalized" + description: "The layer of the input dataset containing the lognormalized counts\ + \ if .X is not to be used." + info: null + example: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_obs_batch_label" + description: "The .obs field in the input (query) dataset containing the batch\ + \ labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_var_gene_names" + description: "The .var field in the input (query) dataset containing gene names;\ + \ if not provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--input_reference_gene_overlap" + description: "The minimum number of genes present in both the reference and query\ + \ datasets.\n" + info: null + default: + - 100 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--overwrite_existing_key" + description: "If provided, will overwrite existing fields in the input dataset\ + \ when data are copied during the reference alignment process." + info: null + direction: "input" +- name: "Reference input" + arguments: + - type: "file" + name: "--reference" + description: "Reference dataset consisting of the labeled observations." + info: null + example: + - "reference.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer" + description: "The layer of the reference dataset containing the raw counts if\ + \ .X is not to be used." + info: null + example: + - "counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_layer_lognormalized" + description: "The layer of the reference dataset containing the lognormalized\ + \ counts if .X is not to be used." + info: null + example: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_target" + description: "The `.obs` key(s) of the target labels to transfer." + info: null + example: + - "cell_type" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_var_gene_names" + description: "The .var field in the reference dataset containing gene names; if\ + \ not provided, the .var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--reference_obs_batch_label" + description: "The .obs field in the reference dataset containing the batch labels." + info: null + example: + - "sample" + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HVG subset arguments" + arguments: + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes to subset for.\n" + info: null + default: + - 2000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "scVI integration options" + arguments: + - type: "boolean" + name: "--scvi_early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--scvi_early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scvi_early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scvi_early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--scvi_max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--scvi_reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scvi_lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scvi_lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Leiden clustering options" + arguments: + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + min: 0.0 + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Neighbor classifier arguments" + arguments: + - type: "string" + name: "--knn_weights" + description: "Weight function used in prediction. Possible values are:\n`uniform`\ + \ (all points in each neighborhood are weighted equally) or \n`distance` (weight\ + \ points by the inverse of their distance)\n" + info: null + default: + - "uniform" + required: false + choices: + - "uniform" + - "distance" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn_n_neighbors" + description: "The number of neighbors to use in k-neighbor graph structure used\ + \ for fast approximate nearest neighbor search with PyNNDescent. \nLarger values\ + \ will result in more accurate search results at the cost of computation time.\n" + info: null + default: + - 15 + required: false + min: 5 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The query data in .h5mu format with predicted labels predicted from\ + \ the classifier trained on the reference." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_predictions" + description: "In which `.obs` slots to store the predicted cell labels.\nIf provided,\ + \ must have the same length as `--reference_obs_targets`.\nIf empty, will default\ + \ to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obs_probability" + description: "In which `.obs` slots to store the probability of the predictions.\n\ + If provided, must have the same length as `--reference_obs_targets`.\nIf empty,\ + \ will default to the `reference_obs_targets` combined with the `\"_probability\"\ + ` suffix.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--output_obsm_integrated" + description: "In which .obsm slot to store the integrated embedding." + info: null + default: + - "X_integrated_scvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "\"Cell type annotation workflow that performs scVI integration of reference\ + \ and query dataset followed by KNN label transfer. \nThe query and reference datasets\ + \ are expected to be pre-processed in the same way, for example with the `process_samples`\ + \ workflow of OpenPipeline.\nNote that this workflow will integrate the reference\ + \ dataset from scratch and integrate the query dataset in the same embedding space.\ + \ \nThe workflow does not currently output the trained SCVI reference model.\"\n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "scgpt" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +- type: "file" + path: "TS_Blood_filtered.h5mu" +info: + name: "scVI Annotation" + test_dependencies: + - name: "scvi_knn_test" + namespace: "test_workflows/annotation" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "workflows/integration/scvi_leiden" + alias: "scvi_leiden_workflow" + repository: + type: "local" +- name: "labels_transfer/knn" + repository: + type: "local" +- name: "workflows/multiomics/split_h5mu" + repository: + type: "local" +- name: "dataflow/concatenate_h5mu" + repository: + type: "local" +- name: "feature_annotation/align_query_reference" + repository: + type: "local" +- name: "feature_annotation/highly_variable_features_scanpy" + repository: + type: "local" +- name: "transform/delete_layer" + repository: + type: "local" +- name: "workflows/multiomics/split_modalities" + repository: + type: "local" +- name: "dataflow/merge" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/annotation/scvi_knn/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/annotation/scvi_knn" + executable: "target/nextflow/workflows/annotation/scvi_knn/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/workflows/integration/scvi_leiden" + - "target/nextflow/labels_transfer/knn" + - "target/_private/nextflow/workflows/multiomics/split_h5mu" + - "target/nextflow/dataflow/concatenate_h5mu" + - "target/nextflow/feature_annotation/align_query_reference" + - "target/nextflow/feature_annotation/highly_variable_features_scanpy" + - "target/nextflow/transform/delete_layer" + - "target/_private/nextflow/workflows/multiomics/split_modalities" + - "target/nextflow/dataflow/merge" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/annotation/scvi_knn/main.nf b/target/nextflow/workflows/annotation/scvi_knn/main.nf new file mode 100644 index 00000000..3e1a4c80 --- /dev/null +++ b/target/nextflow/workflows/annotation/scvi_knn/main.nf @@ -0,0 +1,4144 @@ +// scvi_knn main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scvi_knn", + "namespace" : "workflows/annotation", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Query Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Input dataset consisting of the (unlabeled) query observations.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process. Should match the modality of the --reference dataset.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the input dataset containing the raw counts if .X is not to be used.", + "example" : [ + "counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer_lognormalized", + "description" : "The layer of the input dataset containing the lognormalized counts if .X is not to be used.", + "example" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_obs_batch_label", + "description" : "The .obs field in the input (query) dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_var_gene_names", + "description" : "The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--input_reference_gene_overlap", + "description" : "The minimum number of genes present in both the reference and query datasets.\n", + "default" : [ + 100 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--overwrite_existing_key", + "description" : "If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.", + "direction" : "input" + } + ] + }, + { + "name" : "Reference input", + "arguments" : [ + { + "type" : "file", + "name" : "--reference", + "description" : "Reference dataset consisting of the labeled observations.", + "example" : [ + "reference.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer", + "description" : "The layer of the reference dataset containing the raw counts if .X is not to be used.", + "example" : [ + "counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_layer_lognormalized", + "description" : "The layer of the reference dataset containing the lognormalized counts if .X is not to be used.", + "example" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_target", + "description" : "The `.obs` key(s) of the target labels to transfer.", + "example" : [ + "cell_type" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_var_gene_names", + "description" : "The .var field in the reference dataset containing gene names; if not provided, the .var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--reference_obs_batch_label", + "description" : "The .obs field in the reference dataset containing the batch labels.", + "example" : [ + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "HVG subset arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_hvg", + "description" : "Number of highly variable genes to subset for.\n", + "default" : [ + 2000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "scVI integration options", + "arguments" : [ + { + "type" : "boolean", + "name" : "--scvi_early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--scvi_early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scvi_early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--scvi_early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--scvi_max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--scvi_reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--scvi_lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--scvi_lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Leiden clustering options", + "arguments" : [ + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbor classifier arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--knn_weights", + "description" : "Weight function used in prediction. Possible values are:\n`uniform` (all points in each neighborhood are weighted equally) or \n`distance` (weight points by the inverse of their distance)\n", + "default" : [ + "uniform" + ], + "required" : false, + "choices" : [ + "uniform", + "distance" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--knn_n_neighbors", + "description" : "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. \nLarger values will result in more accurate search results at the cost of computation time.\n", + "default" : [ + 15 + ], + "required" : false, + "min" : 5, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_predictions", + "description" : "In which `.obs` slots to store the predicted cell labels.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_pred\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_probability", + "description" : "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\\"_probability\\"` suffix.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obsm_integrated", + "description" : "In which .obsm slot to store the integrated embedding.", + "default" : [ + "X_integrated_scvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "\\"Cell type annotation workflow that performs scVI integration of reference and query dataset followed by KNN label transfer. \nThe query and reference datasets are expected to be pre-processed in the same way, for example with the `process_samples` workflow of OpenPipeline.\nNote that this workflow will integrate the reference dataset from scratch and integrate the query dataset in the same embedding space. \nThe workflow does not currently output the trained SCVI reference model.\\"\n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + }, + { + "type" : "file", + "path" : "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu" + } + ], + "info" : { + "name" : "scVI Annotation", + "test_dependencies" : [ + { + "name" : "scvi_knn_test", + "namespace" : "test_workflows/annotation" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "workflows/integration/scvi_leiden", + "alias" : "scvi_leiden_workflow", + "repository" : { + "type" : "local" + } + }, + { + "name" : "labels_transfer/knn", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/concatenate_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/align_query_reference", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/highly_variable_features_scanpy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "transform/delete_layer", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_modalities", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/merge", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/annotation/scvi_knn/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/annotation/scvi_knn", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { scvi_leiden as scvi_leiden_workflow_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/integration/scvi_leiden/main.nf" +scvi_leiden_workflow = scvi_leiden_workflow_viashalias.run(key: "scvi_leiden_workflow") +include { knn } from "${meta.resources_dir}/../../../../nextflow/labels_transfer/knn/main.nf" +include { split_h5mu } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_h5mu/main.nf" +include { concatenate_h5mu } from "${meta.resources_dir}/../../../../nextflow/dataflow/concatenate_h5mu/main.nf" +include { align_query_reference } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/align_query_reference/main.nf" +include { highly_variable_features_scanpy } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" +include { delete_layer } from "${meta.resources_dir}/../../../../nextflow/transform/delete_layer/main.nf" +include { split_modalities } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_modalities/main.nf" +include { merge } from "${meta.resources_dir}/../../../../nextflow/dataflow/merge/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + modalities_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Allign query and reference datasets + | align_query_reference.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "input_layer_lognormalized": "input_layer_lognormalized", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "reference": "reference", + "reference_layer": "reference_layer", + "reference_layer_lognormalized": "reference_layer_lognormalized", + "reference_obs_batch": "reference_obs_batch_label", + "reference_obs_label": "reference_obs_target", + "reference_var_gene_names": "reference_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "overwrite_existing_key": "overwrite_existing_key" + ], + args: [ + "input_id": "query", + "reference_id": "reference", + "output_layer": "_counts", + "output_layer_lognormalized": "_log_normalized", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "output_var_common_genes": "_common_vars", + "align_layers_lognormalized_counts": "true" + ], + toState: [ + "input": "output_query", + "reference": "output_reference" + ] + ) + + | split_modalities.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + rna_ch = modalities_ch + + | filter{id, state -> state.modality == "rna"} + + // Concatenate query and reference datasets prior to integration + // Only concatenate rna modality in this channel + | concatenate_h5mu.run( + fromState: { id, state -> + ["input": [state.input, state.reference]] + }, + args: [ + "input_id": ["query", "reference"], + "modality": "rna", + "other_axis_mode": "move" + ], + toState: ["input": "output"] + ) + + // Calculate HVG across query and reference + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "modality": "modality", + "n_top_features": "n_hvg" + ], + args: [ + "layer": "_log_normalized", + "var_input": "_common_vars", + "var_name_filter": "_common_hvg", + "obs_batch_key": "_sample_id" + ], + toState: [ + "input": "output" + ] + ) + | delete_layer.run( + key: "delete_aligned_lognormalized_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_log_normalized", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Run scvi integration with leiden clustering + | scvi_leiden_workflow.run( + fromState: { id, state -> [ + "id": id, + "input": state.input, + "modality": state.modality, + "obsm_output": state.output_obsm_integrated, + "leiden_resolution": state.leiden_resolution, + "early_stopping": state.scvi_early_stopping, + "early_stopping_monitor": state.scvi_early_stopping_monitor, + "early_stoping_patience": state.scvi_early_stoping_patience, + "early_stopping_min_delta": state.scvi_early_stopping_min_delta, + "max_epochs": state.scvi_max_epochs, + "reduce_lr_on_plateau": state.scvi_reduce_lr_on_plateau, + "lr_factor": state.scvi_lr_factor, + "lr_patience": state.scvi_lr_patience + ]}, + args: [ + "var_input": "_common_hvg", + "uns_neighbors": "scvi_integration_neighbors", + "obsp_neighbor_distances": "scvi_integration_distances", + "obsp_neighbor_connectivities": "scvi_integration_connectivities", + "obs_cluster": "scvi_integration_leiden", + "obsm_umap": "X_leiden_scvi_umap", + "obs_batch": "_sample_id", + "layer": "_counts" + ], + toState: ["input": "output"] + ) + | delete_layer.run( + key: "delete_aligned_counts_layer", + fromState: [ + "input": "input", + "modality": "modality", + ], + args: [ + "layer": "_counts", + "missing_ok": "true" + ], + toState: [ + "input": "output" + ] + ) + // Split integrated dataset back into a separate reference and query dataset + | split_h5mu.run( + fromState: [ + "input": "input", + "modality": "modality" + ], + args: [ + "obs_feature": "_dataset", + "output_files": "sample_files.csv", + "drop_obs_nan": "true", + "output": "ref_query" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ] + ) + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + if (workflow.stubRun) { + def output_files = outputDir.listFiles() + def new_state = state + [ + "input": output_files[0], + "reference": output_files[1], + ] + return [id, new_state] + } + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["input": integrated_query, "reference": integrated_reference] + [id, state + newKeys] + } + // remove keys from split files + | map {id, state -> + def keysToRemove = ["output_files"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + + // Perform KNN label transfer from reference to query + | knn.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_target", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "weights": "knn_weights", + "n_neighbors": "knn_n_neighbors", + "output": "workflow_output" + ], + args:[ + "output_compression": "gzip" + ], + toState: ["input": "output"] + // toState: {id, output, state -> ["output": output.output]} + ) + | view {"After processing RNA modality: $it"} + + + other_mod_ch = modalities_ch + | filter{id, state -> state.modality != "rna"} + + output_ch = rna_ch.mix(other_mod_ch) + | groupTuple(by: 0, sort: "hash") + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "reference"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | merge.run( + fromState: ["input": "input"], + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/annotation/scvi_knn/nextflow.config b/target/nextflow/workflows/annotation/scvi_knn/nextflow.config new file mode 100644 index 00000000..5817ebb3 --- /dev/null +++ b/target/nextflow/workflows/annotation/scvi_knn/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/annotation/scvi_knn' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = '"Cell type annotation workflow that performs scVI integration of reference and query dataset followed by KNN label transfer. \nThe query and reference datasets are expected to be pre-processed in the same way, for example with the `process_samples` workflow of OpenPipeline.\nNote that this workflow will integrate the reference dataset from scratch and integrate the query dataset in the same embedding space. \nThe workflow does not currently output the trained SCVI reference model."\n' + author = 'Dorien Roosen, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/annotation/scvi_knn/nextflow_labels.config b/target/nextflow/workflows/annotation/scvi_knn/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/annotation/scvi_knn/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/annotation/scvi_knn/nextflow_schema.json b/target/nextflow/workflows/annotation/scvi_knn/nextflow_schema.json new file mode 100644 index 00000000..0e213e4f --- /dev/null +++ b/target/nextflow/workflows/annotation/scvi_knn/nextflow_schema.json @@ -0,0 +1,290 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scvi_knn", + "description": "\"Cell type annotation workflow that performs scVI integration of reference and query dataset followed by KNN label transfer. \nThe query and reference datasets are expected to be pre-processed in the same way, for example with the `process_samples` workflow of OpenPipeline.\nNote that this workflow will integrate the reference dataset from scratch and integrate the query dataset in the same embedding space. \nThe workflow does not currently output the trained SCVI reference model.\"\n", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_obs_predictions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "In which `.obs` slots to store the predicted cell labels.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\"_pred\"` suffix.\n", + "help_text": "Type: `string`, multiple: `True`. " + }, + "output_obs_probability": { + "type": "array", + "items": { + "type": "string" + }, + "description": "In which `.obs` slots to store the probability of the predictions.\nIf provided, must have the same length as `--reference_obs_targets`.\nIf empty, will default to the `reference_obs_targets` combined with the `\"_probability\"` suffix.\n", + "help_text": "Type: `string`, multiple: `True`. " + }, + "output_obsm_integrated": { + "type": "string", + "description": "In which .obsm slot to store the integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_integrated_scvi\"`. ", + "default": "X_integrated_scvi" + } + } + }, + "query input": { + "title": "Query Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input dataset consisting of the (unlabeled) query observations.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer of the input dataset containing the raw counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`, example: `\"counts\"`. " + }, + "input_layer_lognormalized": { + "type": "string", + "description": "The layer of the input dataset containing the lognormalized counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`, example: `\"log_normalized\"`. " + }, + "input_obs_batch_label": { + "type": "string", + "description": "The .obs field in the input (query) dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + }, + "input_var_gene_names": { + "type": "string", + "description": "The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "input_reference_gene_overlap": { + "type": "integer", + "description": "The minimum number of genes present in both the reference and query datasets.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `100`. ", + "default": 100 + }, + "overwrite_existing_key": { + "type": "boolean", + "description": "If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "reference input": { + "title": "Reference input", + "type": "object", + "description": "No description", + "properties": { + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference dataset consisting of the labeled observations.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.h5mu\"`. " + }, + "reference_layer": { + "type": "string", + "description": "The layer of the reference dataset containing the raw counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`, example: `\"counts\"`. " + }, + "reference_layer_lognormalized": { + "type": "string", + "description": "The layer of the reference dataset containing the lognormalized counts if .X is not to be used.", + "help_text": "Type: `string`, multiple: `False`, example: `\"log_normalized\"`. " + }, + "reference_obs_target": { + "type": "string", + "description": "The `.obs` key(s) of the target labels to transfer.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"cell_type\"`. " + }, + "reference_var_gene_names": { + "type": "string", + "description": "The .var field in the reference dataset containing gene names; if not provided, the .var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "reference_obs_batch_label": { + "type": "string", + "description": "The .obs field in the reference dataset containing the batch labels.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"sample\"`. " + } + } + }, + "hvg subset arguments": { + "title": "HVG subset arguments", + "type": "object", + "description": "No description", + "properties": { + "n_hvg": { + "type": "integer", + "description": "Number of highly variable genes to subset for.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `2000`. ", + "default": 2000 + } + } + }, + "scvi integration options": { + "title": "scVI integration options", + "type": "object", + "description": "No description", + "properties": { + "scvi_early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "scvi_early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "scvi_early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "scvi_early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "scvi_max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "scvi_reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "scvi_lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "scvi_lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "leiden clustering options": { + "title": "Leiden clustering options", + "type": "object", + "description": "No description", + "properties": { + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "neighbor classifier arguments": { + "title": "Neighbor classifier arguments", + "type": "object", + "description": "No description", + "properties": { + "knn_weights": { + "type": "string", + "description": "Weight function used in prediction", + "help_text": "Type: `string`, multiple: `False`, default: `\"uniform\"`, choices: ``uniform`, `distance``. ", + "enum": [ + "uniform", + "distance" + ], + "default": "uniform" + }, + "knn_n_neighbors": { + "type": "integer", + "description": "The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent", + "help_text": "Type: `integer`, multiple: `False`, default: `15`. ", + "default": 15 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/query input" + }, + { + "$ref": "#/$defs/reference input" + }, + { + "$ref": "#/$defs/hvg subset arguments" + }, + { + "$ref": "#/$defs/scvi integration options" + }, + { + "$ref": "#/$defs/leiden clustering options" + }, + { + "$ref": "#/$defs/neighbor classifier arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/.config.vsh.yaml b/target/nextflow/workflows/gdo/gdo_singlesample/.config.vsh.yaml new file mode 100644 index 00000000..f2c4869c --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/.config.vsh.yaml @@ -0,0 +1,277 @@ +name: "gdo_singlesample" +namespace: "workflows/gdo" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to start from. By default, .X will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Filtering options" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_guides_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_guides_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_guide" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Processing unimodal single-sample guide-derived oligonucleotide (GDO)\ + \ data." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "10x_5k_lung_crispr" +info: + name: "GDO Singlesample" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "filter/filter_with_counts" + repository: + type: "local" +- name: "filter/do_filter" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/gdo/gdo_singlesample/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/gdo/gdo_singlesample" + executable: "target/nextflow/workflows/gdo/gdo_singlesample/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/filter/filter_with_counts" + - "target/nextflow/filter/do_filter" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/main.nf b/target/nextflow/workflows/gdo/gdo_singlesample/main.nf new file mode 100644 index 00000000..206fea33 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/main.nf @@ -0,0 +1,3571 @@ +// gdo_singlesample main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "gdo_singlesample", + "namespace" : "workflows/gdo", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to start from. By default, .X will be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of counts captured per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_counts", + "description" : "Maximum number of counts captured per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_guides_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_guides_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 1500000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_per_guide", + "description" : "Minimum of non-zero values per gene.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Processing unimodal single-sample guide-derived oligonucleotide (GDO) data.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_lung_crispr" + } + ], + "info" : { + "name" : "GDO Singlesample" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "filter/filter_with_counts", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/do_filter", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/gdo/gdo_singlesample/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/gdo/gdo_singlesample", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { filter_with_counts } from "${meta.resources_dir}/../../../../nextflow/filter/filter_with_counts/main.nf" +include { do_filter } from "${meta.resources_dir}/../../../../nextflow/filter/do_filter/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // cell filtering + | filter_with_counts.run( + key: "gdo_filter_with_counts", + fromState: { id, state -> + [ + "input": state.input, + "modality": "gdo", + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_guides_per_cell, + "max_genes_per_cell": state.max_guides_per_cell, + "min_cells_per_gene": state.min_cells_per_guide, + "layer": state.layer, + ] + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "gdo_do_filter", + fromState : { id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def newState = [ + "input": state.input, + "obs_filter": "filter_with_counts", + "modality": "gdo", + "var_filter": "filter_with_counts", + "output_compression": "gzip", + "output": state.workflow_output + ] + return newState + }, + toState: ["output": "output"] + ) + | setState(["output"]) + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/nextflow.config b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow.config new file mode 100644 index 00000000..ed39a76c --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/gdo/gdo_singlesample' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Processing unimodal single-sample guide-derived oligonucleotide (GDO) data.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_labels.config b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_schema.json b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_schema.json new file mode 100644 index 00000000..1a0ffda2 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/nextflow_schema.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "gdo_singlesample", + "description": "Processing unimodal single-sample guide-derived oligonucleotide (GDO) data.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "Input layer to start from", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "filtering options": { + "title": "Filtering options", + "type": "object", + "description": "No description", + "properties": { + "min_counts": { + "type": "integer", + "description": "Minimum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_counts": { + "type": "integer", + "description": "Maximum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "min_guides_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_guides_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `1500000`. " + }, + "min_cells_per_guide": { + "type": "integer", + "description": "Minimum of non-zero values per gene.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/filtering options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/utils/errorstrat_ignore.config b/target/nextflow/workflows/gdo/gdo_singlesample/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/utils/integration_tests.config b/target/nextflow/workflows/gdo/gdo_singlesample/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels.config b/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels_ci.config b/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/gdo/gdo_singlesample/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/.config.vsh.yaml b/target/nextflow/workflows/ingestion/bd_rhapsody/.config.vsh.yaml new file mode 100644 index 00000000..e76f4aa1 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/.config.vsh.yaml @@ -0,0 +1,661 @@ +name: "bd_rhapsody" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Robrecht Cannoodt" + roles: + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dorien Roosen" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--reads" + description: "Reads (optional) - Path to your FASTQ.GZ formatted read files from\ + \ libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample\ + \ Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n" + info: + config_key: "Reads" + example: + - "WTALibrary_S1_L001_R1_001.fastq.gz" + - "WTALibrary_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reads_atac" + description: "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\n\ + You may specify as many R1/R2/I2 files as you want.\n" + info: + config_key: "Reads_ATAC" + example: + - "ATACLibrary_S2_L001_R1_001.fastq.gz" + - "ATACLibrary_S2_L001_R2_001.fastq.gz" + - "ATACLibrary_S2_L001_I2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "References" + description: "Assay type will be inferred from the provided reference(s).\nDo not\ + \ provide both reference_archive and targeted_reference at the same time.\n \n\ + Valid reference input combinations:\n - reference_archive: WTA only\n - reference_archive\ + \ & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference:\ + \ WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference:\ + \ WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n\ + \ - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n\ + \ - targeted_reference: Targeted only\n - targeted_reference & abseq_reference:\ + \ Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can\ + \ be generated with the `reference/build_bdrhap_reference` component.\nAlternatively,\ + \ BD also provides standard references which can be downloaded from these locations:\n\ + \n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n\ + \ - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n" + arguments: + - type: "file" + name: "--reference_archive" + description: "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure\ + \ of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level\ + \ folder\n - `star_index/`: sub-folder containing STAR index, that is files\ + \ created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation\ + \ e.g. \"gencode.v43.primary_assembly.annotation.gtf\"\n" + info: + config_key: "Reference_Archive" + example: + - "RhapRef_Human_WTA_2023-02.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--targeted_reference" + description: "Path to the targeted reference file in FASTA format.\n" + info: + config_key: "Targeted_Reference" + example: + - "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abseq_reference" + description: "Path to the AbSeq reference file in FASTA format. Only needed if\ + \ BD AbSeq Ab-Oligos are used." + info: + config_key: "AbSeq_Reference" + example: + - "AbSeq_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--supplemental_reference" + alternatives: + - "-s" + description: "Path to the supplemental reference file in FASTA format. Only needed\ + \ if there are additional transgene sequences to be aligned against in a WTA\ + \ assay experiment." + info: + config_key: "Supplemental_Reference" + example: + - "supplemental_reference.fasta" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + description: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The processed output file in h5mu format." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_raw" + alternatives: + - "-o" + description: "The unprocessed output directory containing all the outputs from\ + \ the pipeline." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Putative Cell Calling Settings" + arguments: + - type: "string" + name: "--cell_calling_data" + description: "Specify the dataset to be used for putative cell calling: mRNA,\ + \ AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset,\ + \ please provide an AbSeq_Reference fasta file above.\n \nFor putative cell\ + \ calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive\ + \ file above.\n \nThe default data for putative cell calling, will be determined\ + \ the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n\ + - If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n" + info: + config_key: "Cell_Calling_Data" + example: + - "mRNA" + required: false + choices: + - "mRNA" + - "AbSeq" + - "ATAC" + - "mRNA_and_ATAC" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_bioproduct_algorithm" + description: "Specify the bioproduct algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_Bioproduct_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_calling_atac_algorithm" + description: "Specify the ATAC-seq algorithm to be used for putative cell calling:\ + \ Basic or Refined\n \nBy default, the Basic algorithm will be used for putative\ + \ cell calling.\n" + info: + config_key: "Cell_Calling_ATAC_Algorithm" + example: + - "Basic" + required: false + choices: + - "Basic" + - "Refined" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--exact_cell_count" + description: "Set a specific number of cells as putative, based on those with\ + \ the highest error-corrected read count\n" + info: + config_key: "Exact_Cell_Count" + example: + - 10000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--expected_cell_count" + description: "Guide the basic putative cell calling algorithm by providing an\ + \ estimate of the number of cells expected. Usually this can be the number\ + \ of cells loaded into the Rhapsody cartridge. If there are multiple inflection\ + \ points on the second derivative cumulative curve, this will ensure the one\ + \ selected is near the expected. \n" + info: + config_key: "Expected_Cell_Count" + example: + - 20000 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Intronic Reads Settings" + arguments: + - type: "boolean" + name: "--exclude_intronic_reads" + description: "By default, the flag is false, and reads aligned to exons and introns\ + \ are considered and represented in molecule counts. When the flag is set to\ + \ true, intronic reads will be excluded.\nThe value can be true or false.\n" + info: + config_key: "Exclude_Intronic_Reads" + example: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Multiplex Settings" + arguments: + - type: "string" + name: "--sample_tags_version" + description: "Specify the version of the Sample Tags used in the run:\n\n* If\ + \ Sample Tag Multiplexing was done, specify the appropriate version: human,\ + \ mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK +\ + \ Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not\ + \ an SMK + ATAC-Seq only run), choose the \"nuclei_includes_mrna\" option.\n\ + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)),\ + \ choose the \"nuclei_atac_only\" option.\n" + info: + config_key: "Sample_Tags_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "flex" + - "nuclei_includes_mrna" + - "nuclei_atac_only" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--tag_names" + description: "Specify the tag number followed by '-' and the desired sample name\ + \ to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n" + info: + config_key: "Tag_Names" + example: + - "4-mySample" + - "9-myOtherSample" + - "6-alsoThisSample" + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "VDJ arguments" + arguments: + - type: "string" + name: "--vdj_version" + description: "If VDJ was done, specify the appropriate option: human, mouse, humanBCR,\ + \ humanTCR, mouseBCR, mouseTCR\n" + info: + config_key: "VDJ_Version" + example: + - "human" + required: false + choices: + - "human" + - "mouse" + - "humanBCR" + - "humanTCR" + - "mouseBCR" + - "mouseTCR" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "ATAC options" + arguments: + - type: "file" + name: "--predefined_atac_peaks" + description: "An optional BED file containing pre-established chromatin accessibility\ + \ peak regions for generating the ATAC cell-by-peak matrix." + info: + config_key: "Predefined_ATAC_Peaks" + example: + - "predefined_peaks.bed" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Additional options" + arguments: + - type: "string" + name: "--run_name" + description: "Specify a run name to use as the output file base name. Use only\ + \ letters, numbers, or hyphens. Do not use special characters or spaces.\n" + info: + config_key: "Run_Name" + default: + - "sample" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Specify whether to create the BAM file output\n" + info: + config_key: "Generate_Bam" + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--long_reads" + description: "Use STARlong (default: undefined - i.e. autodetects based on read\ + \ lengths) - Specify if the STARlong aligner should be used instead of STAR.\ + \ Set to true if the reads are longer than 650bp.\n" + info: + config_key: "Long_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Advanced options" + description: "NOTE: Only change these if you are really sure about what you are\ + \ doing\n" + arguments: + - type: "string" + name: "--custom_star_params" + description: "Modify STAR alignment parameters - Set this parameter to fully override\ + \ default STAR mapping parameters used in the pipeline.\nFor reference this\ + \ is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread\ + \ 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq\ + \ AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin\ + \ 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax\ + \ 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT\ + \ set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`,\ + \ `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n" + info: + config_key: "Custom_STAR_Params" + example: + - "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed\ + \ 2000000" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--custom_bwa_mem2_params" + description: "Modify bwa-mem2 alignment parameters - Set this parameter to fully\ + \ override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does\ + \ not specify any custom mapping params to bwa-mem2 so program default values\ + \ are used\nThis applies to fastqs provided in the Reads_ATAC user input \n\ + Do NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2\ + \ version 2.2.1\n" + info: + config_key: "Custom_bwa_mem2_Params" + example: + - "-k 16 -w 200 -r" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "CWL-runner arguments" + arguments: + - type: "boolean" + name: "--parallel" + description: "Run jobs in parallel." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--timestamps" + description: "Add timestamps to the errors, warnings, and notifications." + info: null + direction: "input" +- name: "Undocumented arguments" + arguments: + - type: "integer" + name: "--abseq_umi" + info: + config_key: "AbSeq_UMI" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--target_analysis" + info: + config_key: "Target_analysis" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_jgene_evalue" + description: "e-value threshold for J gene. The e-value threshold for J gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_JGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--vdj_vgene_evalue" + description: "e-value threshold for V gene. The e-value threshold for V gene call\ + \ by IgBlast/PyIR, default is set as 0.001\n" + info: + config_key: "VDJ_VGene_Evalue" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--write_filtered_reads" + info: + config_key: "Write_Filtered_Reads" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n\nThis pipeline performs\ + \ analysis of single-cell multiomic sequence read (FASTQ) data. The supported\n\ + sequencing libraries are those generated by the BD Rhapsody assay kits, including:\ + \ Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell\ + \ Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning\ + \ 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement'\ + \ from the YAML.\n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "bdrhap_5kjrt" +- type: "file" + path: "reference_gencodev41_chr1" +info: + name: "BD Rhapsody" + test_dependencies: + - name: "bd_rhapsody_test" + namespace: "test_workflows/ingestion" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "mapping/bd_rhapsody" + alias: "bd_rhapsody_component" + repository: + type: "local" +- name: "convert/from_bdrhap_to_h5mu" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/bd_rhapsody/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/bd_rhapsody" + executable: "target/nextflow/workflows/ingestion/bd_rhapsody/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/mapping/bd_rhapsody" + - "target/nextflow/convert/from_bdrhap_to_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/main.nf b/target/nextflow/workflows/ingestion/bd_rhapsody/main.nf new file mode 100644 index 00000000..b176db3b --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/main.nf @@ -0,0 +1,4010 @@ +// bd_rhapsody main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Robrecht Cannoodt (maintainer) +// * Dorien Roosen (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bd_rhapsody", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dorien Roosen", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--reads", + "description" : "Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n", + "info" : { + "config_key" : "Reads" + }, + "example" : [ + "WTALibrary_S1_L001_R1_001.fastq.gz", + "WTALibrary_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reads_atac", + "description" : "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\nYou may specify as many R1/R2/I2 files as you want.\n", + "info" : { + "config_key" : "Reads_ATAC" + }, + "example" : [ + "ATACLibrary_S2_L001_R1_001.fastq.gz", + "ATACLibrary_S2_L001_R2_001.fastq.gz", + "ATACLibrary_S2_L001_I2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "References", + "description" : "Assay type will be inferred from the provided reference(s).\nDo not provide both reference_archive and targeted_reference at the same time.\n \nValid reference input combinations:\n - reference_archive: WTA only\n - reference_archive & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference: WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n - targeted_reference: Targeted only\n - targeted_reference & abseq_reference: Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can be generated with the `reference/build_bdrhap_reference` component.\nAlternatively, BD also provides standard references which can be downloaded from these locations:\n\n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n", + "arguments" : [ + { + "type" : "file", + "name" : "--reference_archive", + "description" : "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level folder\n - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation e.g. \\"gencode.v43.primary_assembly.annotation.gtf\\"\n", + "info" : { + "config_key" : "Reference_Archive" + }, + "example" : [ + "RhapRef_Human_WTA_2023-02.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--targeted_reference", + "description" : "Path to the targeted reference file in FASTA format.\n", + "info" : { + "config_key" : "Targeted_Reference" + }, + "example" : [ + "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--abseq_reference", + "description" : "Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used.", + "info" : { + "config_key" : "AbSeq_Reference" + }, + "example" : [ + "AbSeq_reference.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--supplemental_reference", + "alternatives" : [ + "-s" + ], + "description" : "Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment.", + "info" : { + "config_key" : "Supplemental_Reference" + }, + "example" : [ + "supplemental_reference.fasta" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "description" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The processed output file in h5mu format.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_raw", + "alternatives" : [ + "-o" + ], + "description" : "The unprocessed output directory containing all the outputs from the pipeline.", + "example" : [ + "output_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Putative Cell Calling Settings", + "arguments" : [ + { + "type" : "string", + "name" : "--cell_calling_data", + "description" : "Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above.\n \nFor putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above.\n \nThe default data for putative cell calling, will be determined the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n- If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n", + "info" : { + "config_key" : "Cell_Calling_Data" + }, + "example" : [ + "mRNA" + ], + "required" : false, + "choices" : [ + "mRNA", + "AbSeq", + "ATAC", + "mRNA_and_ATAC" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_calling_bioproduct_algorithm", + "description" : "Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "info" : { + "config_key" : "Cell_Calling_Bioproduct_Algorithm" + }, + "example" : [ + "Basic" + ], + "required" : false, + "choices" : [ + "Basic", + "Refined" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_calling_atac_algorithm", + "description" : "Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "info" : { + "config_key" : "Cell_Calling_ATAC_Algorithm" + }, + "example" : [ + "Basic" + ], + "required" : false, + "choices" : [ + "Basic", + "Refined" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--exact_cell_count", + "description" : "Set a specific number of cells as putative, based on those with the highest error-corrected read count\n", + "info" : { + "config_key" : "Exact_Cell_Count" + }, + "example" : [ + 10000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--expected_cell_count", + "description" : "Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. \n", + "info" : { + "config_key" : "Expected_Cell_Count" + }, + "example" : [ + 20000 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Intronic Reads Settings", + "arguments" : [ + { + "type" : "boolean", + "name" : "--exclude_intronic_reads", + "description" : "By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded.\nThe value can be true or false.\n", + "info" : { + "config_key" : "Exclude_Intronic_Reads" + }, + "example" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Multiplex Settings", + "arguments" : [ + { + "type" : "string", + "name" : "--sample_tags_version", + "description" : "Specify the version of the Sample Tags used in the run:\n\n* If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the \\"nuclei_includes_mrna\\" option.\n* If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the \\"nuclei_atac_only\\" option.\n", + "info" : { + "config_key" : "Sample_Tags_Version" + }, + "example" : [ + "human" + ], + "required" : false, + "choices" : [ + "human", + "mouse", + "flex", + "nuclei_includes_mrna", + "nuclei_atac_only" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--tag_names", + "description" : "Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n", + "info" : { + "config_key" : "Tag_Names" + }, + "example" : [ + "4-mySample", + "9-myOtherSample", + "6-alsoThisSample" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "VDJ arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--vdj_version", + "description" : "If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR\n", + "info" : { + "config_key" : "VDJ_Version" + }, + "example" : [ + "human" + ], + "required" : false, + "choices" : [ + "human", + "mouse", + "humanBCR", + "humanTCR", + "mouseBCR", + "mouseTCR" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "ATAC options", + "arguments" : [ + { + "type" : "file", + "name" : "--predefined_atac_peaks", + "description" : "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix.", + "info" : { + "config_key" : "Predefined_ATAC_Peaks" + }, + "example" : [ + "predefined_peaks.bed" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Additional options", + "arguments" : [ + { + "type" : "string", + "name" : "--run_name", + "description" : "Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces.\n", + "info" : { + "config_key" : "Run_Name" + }, + "default" : [ + "sample" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--generate_bam", + "description" : "Specify whether to create the BAM file output\n", + "info" : { + "config_key" : "Generate_Bam" + }, + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--long_reads", + "description" : "Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp.\n", + "info" : { + "config_key" : "Long_Reads" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Advanced options", + "description" : "NOTE: Only change these if you are really sure about what you are doing\n", + "arguments" : [ + { + "type" : "string", + "name" : "--custom_star_params", + "description" : "Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline.\nFor reference this is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n", + "info" : { + "config_key" : "Custom_STAR_Params" + }, + "example" : [ + "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--custom_bwa_mem2_params", + "description" : "Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used\nThis applies to fastqs provided in the Reads_ATAC user input \nDo NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2 version 2.2.1\n", + "info" : { + "config_key" : "Custom_bwa_mem2_Params" + }, + "example" : [ + "-k 16 -w 200 -r" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "CWL-runner arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--parallel", + "description" : "Run jobs in parallel.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--timestamps", + "description" : "Add timestamps to the errors, warnings, and notifications.", + "direction" : "input" + } + ] + }, + { + "name" : "Undocumented arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--abseq_umi", + "info" : { + "config_key" : "AbSeq_UMI" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--target_analysis", + "info" : { + "config_key" : "Target_analysis" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--vdj_jgene_evalue", + "description" : "e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001\n", + "info" : { + "config_key" : "VDJ_JGene_Evalue" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--vdj_vgene_evalue", + "description" : "e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001\n", + "info" : { + "config_key" : "VDJ_VGene_Evalue" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--write_filtered_reads", + "info" : { + "config_key" : "Write_Filtered_Reads" + }, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n\nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML.\n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/bdrhap_5kjrt" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1/" + } + ], + "info" : { + "name" : "BD Rhapsody", + "test_dependencies" : [ + { + "name" : "bd_rhapsody_test", + "namespace" : "test_workflows/ingestion" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "mapping/bd_rhapsody", + "alias" : "bd_rhapsody_component", + "repository" : { + "type" : "local" + } + }, + { + "name" : "convert/from_bdrhap_to_h5mu", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/bd_rhapsody/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/bd_rhapsody", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { bd_rhapsody as bd_rhapsody_component_viashalias } from "${meta.resources_dir}/../../../../nextflow/mapping/bd_rhapsody/main.nf" +bd_rhapsody_component = bd_rhapsody_component_viashalias.run(key: "bd_rhapsody_component") +include { from_bdrhap_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_bdrhap_to_h5mu/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // run bd rhapsody + | bd_rhapsody_component.run( + fromState: { id, state -> + // pass all arguments except: + // - remove output_h5mu + // - rename output_raw to output_dir + def data_ = state.clone() + data_.remove("output") + data_.remove("output_raw") + data_ + }, + toState: [ + "input": "output_mudata", + "output_raw": "output_dir" + ] + ) + + // convert to h5mu + | from_bdrhap_to_h5mu.run( + fromState: { id, state -> + [ + id: id, + input: state.input, + output: state.output, + output_compression: "gzip" + ] + }, + toState: ["output": "output"] + ) + + | setState(["output_raw", "output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow.config b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow.config new file mode 100644 index 00000000..a6cdf570 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/bd_rhapsody' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n\nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning \'https://bitbucket.org/CRSwDev/cwl\' and removing all objects with class \'DockerRequirement\' from the YAML.\n' + author = 'Robrecht Cannoodt, Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_labels.config b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_schema.json b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_schema.json new file mode 100644 index 00000000..7d8a64ef --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/nextflow_schema.json @@ -0,0 +1,357 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bd_rhapsody", + "description": "BD Rhapsody Sequence Analysis CWL pipeline v2.2.1\n\nThis pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported\nsequencing libraries are those generated by the BD Rhapsody assay kits, including: Whole Transcriptome\nmRNA, Targeted mRNA, AbSeq Antibody-Oligonucleotides, Single-Cell Multiplexing, TCR/BCR, and\nATAC-Seq\n\nThe CWL pipeline file is obtained by cloning 'https://bitbucket.org/CRSwDev/cwl' and removing all objects with class 'DockerRequirement' from the YAML.\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "reads": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include:\n \n- WTA mRNA\n- Targeted mRNA\n- AbSeq\n- Sample Multiplexing\n- VDJ\n \nYou may specify as many R1/R2 read pairs as you want.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"WTALibrary_S1_L001_R1_001.fastq.gz\";\"WTALibrary_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reads_atac": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries.\nYou may specify as many R1/R2/I2 files as you want.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"ATACLibrary_S2_L001_R1_001.fastq.gz\";\"ATACLibrary_S2_L001_R2_001.fastq.gz\";\"ATACLibrary_S2_L001_I2_001.fastq.gz\"]`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "Outputs", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The processed output file in h5mu format.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_raw": { + "type": "string", + "format": "path", + "description": "The unprocessed output directory containing all the outputs from the pipeline.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_raw\"`, direction: `output`, example: `\"output_dir\"`. ", + "default": "$id.$key.output_raw" + } + } + }, + "references": { + "title": "References", + "type": "object", + "description": "Assay type will be inferred from the provided reference(s).\nDo not provide both reference_archive and targeted_reference at the same time.\n \nValid reference input combinations:\n - reference_archive: WTA only\n - reference_archive & abseq_reference: WTA + AbSeq\n - reference_archive & supplemental_reference: WTA + extra transgenes\n - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes\n - reference_archive: WTA + ATAC or ATAC only\n - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes\n - targeted_reference: Targeted only\n - targeted_reference & abseq_reference: Targeted + AbSeq\n - abseq_reference: AbSeq only\n\nThe reference_archive can be generated with the `reference/build_bdrhap_reference` component.\nAlternatively, BD also provides standard references which can be downloaded from these locations:\n\n - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz\n - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz\n", + "properties": { + "reference_archive": { + "type": "string", + "format": "path", + "description": "Path to Rhapsody WTA Reference in the tar.gz format.\n\nStructure of the reference archive:\n \n- `BD_Rhapsody_Reference_Files/`: top level folder\n - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate`\n - GTF for gene-transcript-annotation e.g", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"RhapRef_Human_WTA_2023-02.tar.gz\"`. " + }, + "targeted_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the targeted reference file in FASTA format.\n", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"BD_Rhapsody_Immune_Response_Panel_Hs.fasta\"]`. " + }, + "abseq_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the AbSeq reference file in FASTA format", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"AbSeq_reference.fasta\"]`. " + }, + "supplemental_reference": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to the supplemental reference file in FASTA format", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"supplemental_reference.fasta\"]`. " + } + } + }, + "putative cell calling settings": { + "title": "Putative Cell Calling Settings", + "type": "object", + "description": "No description", + "properties": { + "cell_calling_data": { + "type": "string", + "description": "Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC\n \nFor putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above.\n \nFor putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above.\n \nThe default data for putative cell calling, will be determined the following way:\n \n- If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC\n- If only ATAC Reads exist: ATAC\n- Otherwise: mRNA\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"mRNA\"`, choices: ``mRNA`, `AbSeq`, `ATAC`, `mRNA_and_ATAC``. ", + "enum": [ + "mRNA", + "AbSeq", + "ATAC", + "mRNA_and_ATAC" + ] + }, + "cell_calling_bioproduct_algorithm": { + "type": "string", + "description": "Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`, choices: ``Basic`, `Refined``. ", + "enum": [ + "Basic", + "Refined" + ] + }, + "cell_calling_atac_algorithm": { + "type": "string", + "description": "Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined\n \nBy default, the Basic algorithm will be used for putative cell calling.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"Basic\"`, choices: ``Basic`, `Refined``. ", + "enum": [ + "Basic", + "Refined" + ] + }, + "exact_cell_count": { + "type": "integer", + "description": "Set a specific number of cells as putative, based on those with the highest error-corrected read count\n", + "help_text": "Type: `integer`, multiple: `False`, example: `10000`. " + }, + "expected_cell_count": { + "type": "integer", + "description": "Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected", + "help_text": "Type: `integer`, multiple: `False`, example: `20000`. " + } + } + }, + "intronic reads settings": { + "title": "Intronic Reads Settings", + "type": "object", + "description": "No description", + "properties": { + "exclude_intronic_reads": { + "type": "boolean", + "description": "By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts", + "help_text": "Type: `boolean`, multiple: `False`, example: `false`. " + } + } + }, + "multiplex settings": { + "title": "Multiplex Settings", + "type": "object", + "description": "No description", + "properties": { + "sample_tags_version": { + "type": "string", + "description": "Specify the version of the Sample Tags used in the run:\n\n* If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only\n* If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the \"nuclei_includes_mrna\" option.\n* If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the \"nuclei_atac_only\" option.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"human\"`, choices: ``human`, `mouse`, `flex`, `nuclei_includes_mrna`, `nuclei_atac_only``. ", + "enum": [ + "human", + "mouse", + "flex", + "nuclei_includes_mrna", + "nuclei_atac_only" + ] + }, + "tag_names": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv\nDo not use the special characters.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"4-mySample\";\"9-myOtherSample\";\"6-alsoThisSample\"]`. " + } + } + }, + "vdj arguments": { + "title": "VDJ arguments", + "type": "object", + "description": "No description", + "properties": { + "vdj_version": { + "type": "string", + "description": "If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"human\"`, choices: ``human`, `mouse`, `humanBCR`, `humanTCR`, `mouseBCR`, `mouseTCR``. ", + "enum": [ + "human", + "mouse", + "humanBCR", + "humanTCR", + "mouseBCR", + "mouseTCR" + ] + } + } + }, + "atac options": { + "title": "ATAC options", + "type": "object", + "description": "No description", + "properties": { + "predefined_atac_peaks": { + "type": "string", + "format": "path", + "description": "An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"predefined_peaks.bed\"`. " + } + } + }, + "additional options": { + "title": "Additional options", + "type": "object", + "description": "No description", + "properties": { + "run_name": { + "type": "string", + "description": "Specify a run name to use as the output file base name", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample\"`. ", + "default": "sample" + }, + "generate_bam": { + "type": "boolean", + "description": "Specify whether to create the BAM file output\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "long_reads": { + "type": "boolean", + "description": "Use STARlong (default: undefined - i.e", + "help_text": "Type: `boolean`, multiple: `False`. " + } + } + }, + "advanced options": { + "title": "Advanced options", + "type": "object", + "description": "NOTE: Only change these if you are really sure about what you are doing\n", + "properties": { + "custom_star_params": { + "type": "string", + "description": "Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline.\nFor reference this is the default that is used:\n\n Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000`\n Long Reads: Same as Short Reads + `--seedPerReadNmax 10000`\n\nThis applies to fastqs provided in the Reads user input \nDo NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc.\nWe use STAR version 2.7.10b\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000\"`. " + }, + "custom_bwa_mem2_params": { + "type": "string", + "description": "Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline\nThe pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used\nThis applies to fastqs provided in the Reads_ATAC user input \nDo NOT set any non-mapping related params like `-C`, `-t`, etc.\nWe use bwa-mem2 version 2.2.1\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"-k 16 -w 200 -r\"`. " + } + } + }, + "cwl-runner arguments": { + "title": "CWL-runner arguments", + "type": "object", + "description": "No description", + "properties": { + "parallel": { + "type": "boolean", + "description": "Run jobs in parallel.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "timestamps": { + "type": "boolean", + "description": "Add timestamps to the errors, warnings, and notifications.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "undocumented arguments": { + "title": "Undocumented arguments", + "type": "object", + "description": "No description", + "properties": { + "abseq_umi": { + "type": "integer", + "description": "", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "target_analysis": { + "type": "boolean", + "description": "", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "vdj_jgene_evalue": { + "type": "number", + "description": "e-value threshold for J gene", + "help_text": "Type: `double`, multiple: `False`. " + }, + "vdj_vgene_evalue": { + "type": "number", + "description": "e-value threshold for V gene", + "help_text": "Type: `double`, multiple: `False`. " + }, + "write_filtered_reads": { + "type": "boolean", + "description": "", + "help_text": "Type: `boolean`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/references" + }, + { + "$ref": "#/$defs/putative cell calling settings" + }, + { + "$ref": "#/$defs/intronic reads settings" + }, + { + "$ref": "#/$defs/multiplex settings" + }, + { + "$ref": "#/$defs/vdj arguments" + }, + { + "$ref": "#/$defs/atac options" + }, + { + "$ref": "#/$defs/additional options" + }, + { + "$ref": "#/$defs/advanced options" + }, + { + "$ref": "#/$defs/cwl-runner arguments" + }, + { + "$ref": "#/$defs/undocumented arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/utils/integration_tests.config b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels.config b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels_ci.config b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/bd_rhapsody/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/.config.vsh.yaml b/target/nextflow/workflows/ingestion/cellranger_mapping/.config.vsh.yaml new file mode 100644 index 00000000..0497d4ff --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/.config.vsh.yaml @@ -0,0 +1,377 @@ +name: "cellranger_mapping" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "The fastq.gz files to align. Can also be a single directory containing\ + \ fastq.gz files." + info: null + example: + - "sample_S1_L001_R1_001.fastq.gz" + - "sample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reference" + description: "The path to Cell Ranger reference tar.gz file." + info: null + example: + - "reference.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output_raw" + description: "Location where the output folder from Cell Ranger will be stored." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_h5mu" + description: "The output from Cell Ranger, converted to h5mu." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_summary" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_type" + description: "Which Cell Ranger output to use for converting to h5mu." + info: null + default: + - "raw" + required: false + choices: + - "raw" + - "filtered" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Cell Ranger arguments" + arguments: + - type: "integer" + name: "--expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm." + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--chemistry" + description: "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single\ + \ Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1\n- SC3Pv2:\ + \ Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv3LT: Single Cell 3'\ + \ v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n- SC5P-PE: Single Cell 5' paired-end\n\ + - SC5P-R2: Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2\ + \ or 5'\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC5P-PE" + - "SC5P-R2" + - "SC-FB" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--include_introns" + description: "Include intronic reads in count (default=true unless --target-panel\ + \ is specified in which case default=false)" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline for running Cell Ranger mapping." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "cellranger_tiny_fastq" +info: + name: "Cell Ranger mapping" + test_dependencies: + - name: "cellranger_mapping_test" + namespace: "test_workflows/ingestion" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "mapping/cellranger_count" + repository: + type: "local" +- name: "mapping/cellranger_count_split" + repository: + type: "local" +- name: "convert/from_10xh5_to_h5mu" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/cellranger_mapping/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/cellranger_mapping" + executable: "target/nextflow/workflows/ingestion/cellranger_mapping/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/mapping/cellranger_count" + - "target/nextflow/mapping/cellranger_count_split" + - "target/nextflow/convert/from_10xh5_to_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/main.nf b/target/nextflow/workflows/ingestion/cellranger_mapping/main.nf new file mode 100644 index 00000000..3910343c --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/main.nf @@ -0,0 +1,3706 @@ +// cellranger_mapping main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) +// * Dries De Maeyer (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_mapping", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "The fastq.gz files to align. Can also be a single directory containing fastq.gz files.", + "example" : [ + "sample_S1_L001_R1_001.fastq.gz", + "sample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "description" : "The path to Cell Ranger reference tar.gz file.", + "example" : [ + "reference.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output_raw", + "description" : "Location where the output folder from Cell Ranger will be stored.", + "example" : [ + "output_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_h5mu", + "description" : "The output from Cell Ranger, converted to h5mu.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_metrics", + "description" : "Name of the .uns slot under which to QC metrics (if any).", + "default" : [ + "metrics_summary" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_type", + "description" : "Which Cell Ranger output to use for converting to h5mu.", + "default" : [ + "raw" + ], + "required" : false, + "choices" : [ + "raw", + "filtered" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Cell Ranger arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--chemistry", + "description" : "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1\n- SC3Pv2: Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv3LT: Single Cell 3' v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n- SC5P-PE: Single Cell 5' paired-end\n- SC5P-R2: Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2 or 5'\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information.\n", + "default" : [ + "auto" + ], + "required" : false, + "choices" : [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3LT", + "SC3Pv3HT", + "SC5P-PE", + "SC5P-R2", + "SC-FB" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--secondary_analysis", + "description" : "Whether or not to run the secondary analysis e.g. clustering.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--generate_bam", + "description" : "Whether to generate a BAM file.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--include_introns", + "description" : "Include intronic reads in count (default=true unless --target-panel is specified in which case default=false)", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline for running Cell Ranger mapping.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_fastq" + } + ], + "info" : { + "name" : "Cell Ranger mapping", + "test_dependencies" : [ + { + "name" : "cellranger_mapping_test", + "namespace" : "test_workflows/ingestion" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "mapping/cellranger_count", + "repository" : { + "type" : "local" + } + }, + { + "name" : "mapping/cellranger_count_split", + "repository" : { + "type" : "local" + } + }, + { + "name" : "convert/from_10xh5_to_h5mu", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/cellranger_mapping/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/cellranger_mapping", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cellranger_count } from "${meta.resources_dir}/../../../../nextflow/mapping/cellranger_count/main.nf" +include { cellranger_count_split } from "${meta.resources_dir}/../../../../nextflow/mapping/cellranger_count_split/main.nf" +include { from_10xh5_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_10xh5_to_h5mu/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | cellranger_count.run( + fromState: [ + "input": "input", + "output": "output_raw", + "expect_cells": "expect_cells", + "chemistry": "chemistry", + "secondary_analysis": "secondary_analysis", + "generate_bam": "generate_bam", + "include_introns": "include_introns", + "reference": "reference" + ], + toState: [ + "input": "output", + "output_raw": "output" + ] + ) + // split output dir into map + | cellranger_count_split.run( + fromState: {id, state -> + def stateMapping = [ + "input": state.input, + ] + outputType = state.output_type == "raw" ? "raw_h5" : "filtered_h5" + stateMapping += [outputType: "\$id.\$key.${outputType}.h5"] + stateMapping += ["metrics_summary": "\$id.\$key.metrics_summary.csv"] + return stateMapping + }, + toState: {id, output, state -> + def outputFile = state.output_type == "raw" ? output.raw_h5 : output.filtered_h5 + def newState = state + [ "input": outputFile ] + return newState + } + ) + // convert to h5mu + | from_10xh5_to_h5mu.run( + fromState: {id, state -> + [ + "input": state.input, + "output_compression": "gzip", + "output": state.output_h5mu, + "uns_metrics": state.uns_metrics, + "input_metrics_summary": state.metrics_summary + ] + }, + toState: ["output_h5mu": "output"] + ) + | setState(["output_raw", "output_h5mu"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow.config b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow.config new file mode 100644 index 00000000..202c078b --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/cellranger_mapping' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline for running Cell Ranger mapping.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt, Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_labels.config b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_schema.json b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_schema.json new file mode 100644 index 00000000..761a369b --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/nextflow_schema.json @@ -0,0 +1,149 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_mapping", + "description": "A pipeline for running Cell Ranger mapping.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "The fastq.gz files to align", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_S1_L001_R1_001.fastq.gz\";\"sample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "The path to Cell Ranger reference tar.gz file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference.tar.gz\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output_raw": { + "type": "string", + "format": "path", + "description": "Location where the output folder from Cell Ranger will be stored.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_raw\"`, direction: `output`, example: `\"output_dir\"`. ", + "default": "$id.$key.output_raw" + }, + "output_h5mu": { + "type": "string", + "format": "path", + "description": "The output from Cell Ranger, converted to h5mu.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_h5mu.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output_h5mu.h5mu" + }, + "uns_metrics": { + "type": "string", + "description": "Name of the .uns slot under which to QC metrics (if any).", + "help_text": "Type: `string`, multiple: `False`, default: `\"metrics_summary\"`. ", + "default": "metrics_summary" + }, + "output_type": { + "type": "string", + "description": "Which Cell Ranger output to use for converting to h5mu.", + "help_text": "Type: `string`, multiple: `False`, default: `\"raw\"`, choices: ``raw`, `filtered``. ", + "enum": [ + "raw", + "filtered" + ], + "default": "raw" + } + } + }, + "cell ranger arguments": { + "title": "Cell Ranger arguments", + "type": "object", + "description": "No description", + "properties": { + "expect_cells": { + "type": "integer", + "description": "Expected number of recovered cells, used as input to cell calling algorithm.", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "chemistry": { + "type": "string", + "description": "Assay configuration.\n- auto: autodetect mode\n- threeprime: Single Cell 3'\n- fiveprime: Single Cell 5'\n- SC3Pv1: Single Cell 3' v1\n- SC3Pv2: Single Cell 3' v2\n- SC3Pv3: Single Cell 3' v3\n- SC3Pv3LT: Single Cell 3' v3 LT\n- SC3Pv3HT: Single Cell 3' v3 HT\n- SC5P-PE: Single Cell 5' paired-end\n- SC5P-R2: Single Cell 5' R2-only\n- SC-FB: Single Cell Antibody-only 3' v2 or 5'\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"auto\"`, choices: ``auto`, `threeprime`, `fiveprime`, `SC3Pv1`, `SC3Pv2`, `SC3Pv3`, `SC3Pv3LT`, `SC3Pv3HT`, `SC5P-PE`, `SC5P-R2`, `SC-FB``. ", + "enum": [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3LT", + "SC3Pv3HT", + "SC5P-PE", + "SC5P-R2", + "SC-FB" + ], + "default": "auto" + }, + "secondary_analysis": { + "type": "boolean", + "description": "Whether or not to run the secondary analysis e.g", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "generate_bam": { + "type": "boolean", + "description": "Whether to generate a BAM file.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "include_introns": { + "type": "boolean", + "description": "Include intronic reads in count (default=true unless --target-panel is specified in which case default=false)", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/cell ranger arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/utils/integration_tests.config b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels.config b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels_ci.config b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_mapping/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/.config.vsh.yaml b/target/nextflow/workflows/ingestion/cellranger_multi/.config.vsh.yaml new file mode 100644 index 00000000..6a1863aa --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/.config.vsh.yaml @@ -0,0 +1,950 @@ +name: "cellranger_multi" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input files" + arguments: + - type: "file" + name: "--input" + description: "The FASTQ files to be analyzed. FASTQ files should conform to the\ + \ naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane\ + \ Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature type-specific input files" + description: "Helper functionality to allow feature type-specific input files, without\ + \ the need to specify\nlibrary_type or library_id. The library_id will be inferred\ + \ from the input paths.\n" + arguments: + - type: "file" + name: "--gex_input" + description: "The FASTQ files to be analyzed for Gene Expression. FASTQ files\ + \ should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--abc_input" + description: "The FASTQ files to be analyzed for Antibody Capture. FASTQ files\ + \ should conform to \nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--cgc_input" + description: "The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--mux_input" + description: "The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_input" + description: "The FASTQ files to be analyzed for VDJ. FASTQ files should conform\ + \ to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_input" + description: "The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform\ + \ to the naming\nconventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_t_gd_input" + description: "The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should\ + \ conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--vdj_b_input" + description: "The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform\ + \ to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample\ + \ Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--agc_input" + description: "The FASTQ files to be analyzed for Antigen Capture. FASTQ files\ + \ should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample\ + \ Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n" + info: null + example: + - "mysample_S1_L001_R1_001.fastq.gz" + - "mysample_S1_L001_R2_001.fastq.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Library arguments" + arguments: + - type: "string" + name: "--library_id" + description: "The Illumina sample name to analyze. This must exactly match the\ + \ 'Sample Name'part\nof the FASTQ files specified in the `--input` argument.\n" + info: null + example: + - "mysample1" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_type" + description: "The underlying feature type of the library.\n" + info: null + example: + - "Gene Expression" + required: false + choices: + - "Gene Expression" + - "VDJ" + - "VDJ-T" + - "VDJ-B" + - "VDJ-T-GD" + - "Antibody Capture" + - "CRISPR Guide Capture" + - "Multiplexing Capture" + - "Antigen Capture" + - "Custom" + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_subsample" + description: "The rate at which reads from the provided FASTQ files are sampled.\n\ + Must be strictly greater than 0 and less than or equal to 1.\n" + info: null + example: + - "0.5" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_lanes" + description: "Lanes associated with this sample. Defaults to using all lanes." + info: null + example: + - "1-4" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--library_chemistry" + description: "Only applicable to FRP. Library-specific assay configuration. By\ + \ default,\nthe assay configuration is detected automatically. Typically, users\ + \ will\nnot need to specify a chemistry.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Sample parameters" + arguments: + - type: "string" + name: "--sample_ids" + alternatives: + - "--cell_multiplex_sample_id" + description: "A name to identify a multiplexed sample. Must be alphanumeric with\ + \ hyphens and/or underscores,\nand less than 64 characters. Required for Cell\ + \ Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sample_description" + alternatives: + - "--cell_multiplex_description" + description: "A description for the sample." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--sample_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Feature Barcode library specific arguments" + arguments: + - type: "file" + name: "--feature_reference" + description: "Path to the Feature reference CSV file, declaring Feature Barcode\ + \ constructs and associated barcodes.\nRequired only for Antibody Capture or\ + \ CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref\ + \ for more information.\"\n" + info: null + example: + - "feature_reference.csv" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--feature_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_crispr_umi" + description: "Set the minimum number of CRISPR guide RNA UMIs required for protospacer\ + \ detection.\nIf a lower or higher sensitivity is desired for detection, this\ + \ value can be customized\naccording to specific experimental needs. Applicable\ + \ only to datasets that include a\nCRISPR Guide Capture library.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Gene expression arguments" + description: "Arguments relevant to the analysis of gene expression data." + arguments: + - type: "file" + name: "--gex_reference" + description: "Genome refence index built by Cell Ranger mkref." + info: null + example: + - "reference_genome.tar.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_secondary_analysis" + description: "Whether or not to run the secondary analysis e.g. clustering." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_generate_bam" + description: "Whether to generate a BAM file." + info: null + default: + - false + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--tenx_cloud_token_path" + description: "The 10x Cloud Analysis user token used to enable cell annotation." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--cell_annotation_model" + description: "\"Cell annotation model to use. If auto, uses the default model\ + \ for the species.\nIf not given, does not run cell annotation.\"\n" + info: null + required: false + choices: + - "auto" + - "human_pca_v1_beta" + - "mouse_pca_v1_beta" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_expect_cells" + description: "Expected number of recovered cells, used as input to cell calling\ + \ algorithm.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_force_cells" + description: "Force pipeline to use this number of cells, bypassing cell detection.\n" + info: null + example: + - 3000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--gex_include_introns" + description: "Whether or not to include intronic reads in counts.\nThis option\ + \ does not apply to Fixed RNA Profiling analysis.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is the user-supplied value. Note that the length\ + \ includes the Barcode and UMI\nsequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gex_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before\ + \ sequencing metrics are computed\nand therefore, limiting the length of Read\ + \ 2 may affect Q30 scores.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gex_chemistry" + description: "Assay configuration. Either specify a single value which will be\ + \ applied to all libraries,\nor a number of values that is equal to the number\ + \ of libararies. The latter is only applicable\nto only applicable to Fixed\ + \ RNA Profiling.\n - auto: Chemistry autodetection (default)\n - threeprime:\ + \ Single Cell 3'\n - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single\ + \ Cell 3' v1, v2, v3, or v4\n - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT\n\ + \ - SC-FB: Single Cell Antibody-only 3' v2 or 5'\n - fiveprime: Single Cell\ + \ 5'\n - SC5P-PE: Paired-end Single Cell 5'\n - SC5P-PE-v3: Paired-end Single\ + \ Cell 5' v3\n - SC5P-R2: R2-only Single Cell 5'\n - SC5P-R2-v3: R2-only Single\ + \ Cell 5' v3\n - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n - SC5PHT\ + \ : Single Cell 5' v2 HT\n - SFRP: Fixed RNA Profiling (Singleplex)\n - MFRP:\ + \ Fixed RNA Profiling (Multiplex, Probe Barcode on R2)\n - MFRP-R1: Fixed RNA\ + \ Profiling (Multiplex, Probe Barcode on R1)\n - MFRP-RNA: Fixed RNA Profiling\ + \ (Multiplex, RNA, Probe Barcode on R2)\n - MFRP-Ab: Fixed RNA Profiling (Multiplex,\ + \ Antibody, Probe Barcode at R2:69)\n - MFRP-Ab-R2pos50: Fixed RNA Profiling\ + \ (Multiplex, Antibody, Probe Barcode at R2:50)\n - MFRP-RNA-R1: Fixed RNA\ + \ Profiling (Multiplex, RNA, Probe Barcode on R1)\n - MFRP-Ab-R1: Fixed RNA\ + \ Profiling (Multiplex, Antibody, Probe Barcode on R1)\n - ARC-v1 for analyzing\ + \ the Gene Expression portion of Multiome data. If Cell Ranger auto-detects\ + \ ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry-\ + \ for more information.\n" + info: null + default: + - "auto" + required: false + choices: + - "auto" + - "threeprime" + - "fiveprime" + - "SC3Pv1" + - "SC3Pv2" + - "SC3Pv3" + - "SC3Pv3-polyA" + - "SC3Pv4" + - "SC3Pv4-polyA" + - "SC3Pv3LT" + - "SC3Pv3HT" + - "SC3Pv3HT-polyA" + - "SC5P-PE" + - "SC5P-PE-v3" + - "SC5P-R2" + - "SC-FB" + - "SC5P-R2-v3" + - "SCP5-PE-v3" + - "SC5PHT" + - "MFRP" + - "MFRP-R1" + - "MFRP-RNA" + - "MFRP-Ab" + - "SFRP" + - "MFRP-Ab-R2pos50" + - "MFRP-RNA-R1" + - "MFRP-Ab-R1" + - "ARC-v1" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "VDJ related parameters" + arguments: + - type: "file" + name: "--vdj_reference" + description: "VDJ refence index built by Cell Ranger mkref." + info: null + example: + - "reference_vdj.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--vdj_inner_enrichment_primers" + description: "V(D)J Immune Profiling libraries: if inner enrichment primers other\ + \ than those provided \nin the 10x Genomics kits are used, they need to be specified\ + \ here as a\ntext file with one primer per line.\n" + info: null + example: + - "enrichment_primers.txt" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r1_length" + description: "Limit the length of the input Read 1 sequence of V(D)J libraries\ + \ to the first N bases, where N is the user-supplied value.\nNote that the length\ + \ includes the Barcode and UMI sequences so do not set this below 26.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--vdj_r2_length" + description: "Limit the length of the input Read 2 sequence of V(D)J libraries\ + \ to the first N bases, where N is a user-supplied value. \nTrimming occurs\ + \ before sequencing metrics are computed and therefore, limiting the length\ + \ of Read 2 may affect Q30 scores\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "3' Cell multiplexing parameters (CellPlex Multiplexing)" + arguments: + - type: "string" + name: "--cell_multiplex_oligo_ids" + alternatives: + - "--cmo_ids" + description: "The Cell Multiplexing oligo IDs used to multiplex this sample. If\ + \ multiple CMOs were used for a sample,\nseparate IDs with a pipe (e.g., CMO301|CMO302).\ + \ Required for Cell Multiplexing libraries.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--min_assignment_confidence" + description: "The minimum estimated likelihood to call a sample as tagged with\ + \ a Cell Multiplexing Oligo (CMO) instead of \"Unassigned\".\nUsers may wish\ + \ to tolerate a higher rate of mis-assignment in order to obtain more singlets\ + \ to include in their analysis,\nor a lower rate of mis-assignment at the cost\ + \ of obtaining fewer singlets.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--cmo_set" + description: "Path to a custom CMO set CSV file, declaring CMO constructs and\ + \ associated barcodes. If the default CMO reference IDs that are built into\n\ + the Cell Ranger software are required, this option does not need to be used.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcode_sample_assignment" + description: "Path to a barcode-sample assignment CSV file that specifies the\ + \ barcodes that belong to each sample.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Hashtag multiplexing parameters" + arguments: + - type: "string" + name: "--hashtag_ids" + description: "The hashtag IDs used to multiplex this sample. If multiple antibody\ + \ hashtags were used for the same sample,\nyou can separate IDs with a pipe.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "On-chip multiplexing parameters" + arguments: + - type: "string" + name: "--ocm_barcode_ids" + description: "The OCM barcode IDs used to multiplex this sample. Must be one of\ + \ OB1, OB2, OB3, OB4.\nIf multiple OCM Barcodes were used for the same sample,\ + \ you can separate IDs\nwith a pipe (e.g., OB1|OB2).\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Flex multiplexing paramaters" + arguments: + - type: "file" + name: "--probe_set" + description: "A probe set reference CSV file. It specifies the sequences used\ + \ as a reference for probe alignment and the gene ID associated with each probe.\n\ + It must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region\ + \ and an optional 5th column (probe file format 1.0.1).\n- gene_id: The Ensembl\ + \ gene identifier targeted by the probe.\n- probe_seq: The nucleotide sequence\ + \ of the probe, which is complementary to the transcript sequence.\n- probe_id:\ + \ The probe identifier, whose format is described in Probe identifiers.\n- included:\ + \ A TRUE or FALSE flag specifying whether the probe is included in the filtered\ + \ counts matrix output or excluded by the probe filter. \n See filter-probes\ + \ option of cellranger multi. All probes of a gene must be marked TRUE in the\ + \ included column for that gene to be included.\n- region: Present only in v1.0.1\ + \ probe set reference CSV. The gene boundary targeted by the probe. Accepted\ + \ values are spliced or unspliced.\n\nThe file also contains a number of required\ + \ metadata fields in the header in the format #key=value:\n- panel_name: The\ + \ name of the probe set.\n- panel_type: Always predesigned for predesigned probe\ + \ sets.\n- reference_genome: The reference genome build used for probe design.\n\ + - reference_version: The version of the Cell Ranger reference transcriptome\ + \ used for probe design.\n- probe_set_file_format: The version of the probe\ + \ set file format specification that this file conforms to.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--filter_probes" + description: "If 'false', include all non-deprecated probes listed in the probe\ + \ set reference CSV file.\nIf 'true' or not set, probes that are predicted to\ + \ have off-target activity to homologous genes are excluded from analysis.\n\ + Not filtering will result in UMI counts from all non-deprecated probes,\nincluding\ + \ those with predicted off-target activity, to be used in the analysis.\nProbes\ + \ whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--probe_barcode_ids" + description: "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex\ + \ GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing\ + \ Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001)\n\ + when an Antibody Capture library is present. The barcode pair order is BC+AB\ + \ and they\nare separated with a \"+\" (no spaces). Alternatively, you can specify\ + \ the Probe Barcode ID alone and\nCell Ranger's barcode pairing auto-detection\ + \ algorithm will automatically match to the corresponding Antibody\nMultiplexing\ + \ Barcode.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--emptydrops_minimum_umis" + description: "For singleplex Flex experiments, use this option to adjust the UMI\ + \ cutoff during the second step of cell calling.\nCell Ranger will still perform\ + \ the full cell calling process but will only evaluate barcodes with UMIs above\n\ + the threshold you specify.\n" + info: null + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Antigen Capture (BEAM) libary arguments" + description: "These arguments are recommended if an Antigen Capture (BEAM) library\ + \ is present. \nIt is needed to calculate the antigen specificity score.\n" + arguments: + - type: "string" + name: "--control_id" + description: "A user-defined ID for any negative controls used in the T/BCR Antigen\ + \ Capture assay. Must match id specified in the feature reference CSV.\nMay\ + \ only include ASCII characters and must not use whitespace, slash, quote, or\ + \ comma characters. \nEach ID must be unique and must not collide with a gene\ + \ identifier from the transcriptome.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--mhc_allele" + description: "The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele\ + \ name specified in the Feature Reference CSV.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "General arguments" + description: "These arguments are applicable to all library types.\n" + arguments: + - type: "boolean" + name: "--check_library_compatibility" + description: "Optional. This option allows users to disable the check that evaluates\ + \ 10x Barcode overlap between\nibraries when multiple libraries are specified\ + \ (e.g., Gene Expression + Antibody Capture). Setting\nthis option to false\ + \ will disable the check across all library combinations. We recommend running\n\ + this check (default), however if the pipeline errors out, users can bypass the\ + \ check to generate\noutputs for troubleshooting.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output_raw" + description: "The raw output folder." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_h5mu" + description: "Locations for the output files. Must contain a wildcard (*) character,\n\ + which will be replaced with the sample name.\n" + info: null + example: + - "*.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_metrics" + description: "Name of the .uns slot under which to QC metrics (if any)." + info: null + default: + - "metrics_cellranger" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline for running Cell Ranger multi." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "10x_5k_anticmv/raw/" + dest: "10x_5k_anticmv/raw/" +- type: "file" + path: "10x_5k_fixed/raw/" + dest: "10x_5k_fixed/raw/" +- type: "file" + path: "reference_gencodev41_chr1" +info: + name: "Cell Ranger multi" + test_dependencies: + - name: "cellranger_multi_test" + namespace: "test_workflows/ingestion" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "mapping/cellranger_multi" + alias: "cellranger_multi_component" + repository: + type: "local" +- name: "convert/from_cellranger_multi_to_h5mu" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/cellranger_multi/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/cellranger_multi" + executable: "target/nextflow/workflows/ingestion/cellranger_multi/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/mapping/cellranger_multi" + - "target/nextflow/convert/from_cellranger_multi_to_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/main.nf b/target/nextflow/workflows/ingestion/cellranger_multi/main.nf new file mode 100644 index 00000000..fc057b58 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/main.nf @@ -0,0 +1,4311 @@ +// cellranger_multi main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_multi", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input files", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The FASTQ files to be analyzed. FASTQ files should conform to the naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Feature type-specific input files", + "description" : "Helper functionality to allow feature type-specific input files, without the need to specify\nlibrary_type or library_id. The library_id will be inferred from the input paths.\n", + "arguments" : [ + { + "type" : "file", + "name" : "--gex_input", + "description" : "The FASTQ files to be analyzed for Gene Expression. FASTQ files should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--abc_input", + "description" : "The FASTQ files to be analyzed for Antibody Capture. FASTQ files should conform to \nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cgc_input", + "description" : "The FASTQ files to be analyzed for CRISPR Guide Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--mux_input", + "description" : "The FASTQ files to be analyzed for Multiplexing Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_input", + "description" : "The FASTQ files to be analyzed for VDJ. FASTQ files should conform to the\nnaming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_t_input", + "description" : "The FASTQ files to be analyzed for VDJ-T. FASTQ files should conform to the naming\nconventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_t_gd_input", + "description" : "The FASTQ files to be analyzed for VDJ-T-GD. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_b_input", + "description" : "The FASTQ files to be analyzed for VDJ-B. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--agc_input", + "description" : "The FASTQ files to be analyzed for Antigen Capture. FASTQ files should conform to\nthe naming conventions of bcl2fastq and mkfastq:\n`[Sample Name]_S[Sample Index]_L00[Lane Number]_[Read Type]_001.fastq.gz`\n", + "example" : [ + "mysample_S1_L001_R1_001.fastq.gz", + "mysample_S1_L001_R2_001.fastq.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Library arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--library_id", + "description" : "The Illumina sample name to analyze. This must exactly match the 'Sample Name'part\nof the FASTQ files specified in the `--input` argument.\n", + "example" : [ + "mysample1" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_type", + "description" : "The underlying feature type of the library.\n", + "example" : [ + "Gene Expression" + ], + "required" : false, + "choices" : [ + "Gene Expression", + "VDJ", + "VDJ-T", + "VDJ-B", + "VDJ-T-GD", + "Antibody Capture", + "CRISPR Guide Capture", + "Multiplexing Capture", + "Antigen Capture", + "Custom" + ], + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_subsample", + "description" : "The rate at which reads from the provided FASTQ files are sampled.\nMust be strictly greater than 0 and less than or equal to 1.\n", + "example" : [ + "0.5" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_lanes", + "description" : "Lanes associated with this sample. Defaults to using all lanes.", + "example" : [ + "1-4" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--library_chemistry", + "description" : "Only applicable to FRP. Library-specific assay configuration. By default,\nthe assay configuration is detected automatically. Typically, users will\nnot need to specify a chemistry.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Sample parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--sample_ids", + "alternatives" : [ + "--cell_multiplex_sample_id" + ], + "description" : "A name to identify a multiplexed sample. Must be alphanumeric with hyphens and/or underscores,\nand less than 64 characters. Required for Cell Multiplexing libraries.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_description", + "alternatives" : [ + "--cell_multiplex_description" + ], + "description" : "A description for the sample.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sample_expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--sample_force_cells", + "description" : "Force pipeline to use this number of cells, bypassing cell detection.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Feature Barcode library specific arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--feature_reference", + "description" : "Path to the Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes.\nRequired only for Antibody Capture or CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref for more information.\\"\n", + "example" : [ + "feature_reference.csv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--feature_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value. Note that the length includes the Barcode and UMI\nsequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--feature_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before sequencing metrics are computed\nand therefore, limiting the length of Read 2 may affect Q30 scores.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_crispr_umi", + "description" : "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs. Applicable only to datasets that include a\nCRISPR Guide Capture library.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Gene expression arguments", + "description" : "Arguments relevant to the analysis of gene expression data.", + "arguments" : [ + { + "type" : "file", + "name" : "--gex_reference", + "description" : "Genome refence index built by Cell Ranger mkref.", + "example" : [ + "reference_genome.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_secondary_analysis", + "description" : "Whether or not to run the secondary analysis e.g. clustering.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_generate_bam", + "description" : "Whether to generate a BAM file.", + "default" : [ + false + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--tenx_cloud_token_path", + "description" : "The 10x Cloud Analysis user token used to enable cell annotation.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--cell_annotation_model", + "description" : "\\"Cell annotation model to use. If auto, uses the default model for the species.\nIf not given, does not run cell annotation.\\"\n", + "required" : false, + "choices" : [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_expect_cells", + "description" : "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_force_cells", + "description" : "Force pipeline to use this number of cells, bypassing cell detection.\n", + "example" : [ + 3000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--gex_include_introns", + "description" : "Whether or not to include intronic reads in counts.\nThis option does not apply to Fixed RNA Profiling analysis.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value. Note that the length includes the Barcode and UMI\nsequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gex_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value. Trimming occurs before sequencing metrics are computed\nand therefore, limiting the length of Read 2 may affect Q30 scores.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gex_chemistry", + "description" : "Assay configuration. Either specify a single value which will be applied to all libraries,\nor a number of values that is equal to the number of libararies. The latter is only applicable\nto only applicable to Fixed RNA Profiling.\n - auto: Chemistry autodetection (default)\n - threeprime: Single Cell 3'\n - SC3Pv1, SC3Pv2, SC3Pv3(-polyA), SC3Pv4(-polyA): Single Cell 3' v1, v2, v3, or v4\n - SC3Pv3HT(-polyA): Single Cell 3' v3.1 HT\n - SC-FB: Single Cell Antibody-only 3' v2 or 5'\n - fiveprime: Single Cell 5'\n - SC5P-PE: Paired-end Single Cell 5'\n - SC5P-PE-v3: Paired-end Single Cell 5' v3\n - SC5P-R2: R2-only Single Cell 5'\n - SC5P-R2-v3: R2-only Single Cell 5' v3\n - SCP5-PE-v3: Single Cell 5' paired-end v3 (GEM-X)\n - SC5PHT : Single Cell 5' v2 HT\n - SFRP: Fixed RNA Profiling (Singleplex)\n - MFRP: Fixed RNA Profiling (Multiplex, Probe Barcode on R2)\n - MFRP-R1: Fixed RNA Profiling (Multiplex, Probe Barcode on R1)\n - MFRP-RNA: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R2)\n - MFRP-Ab: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:69)\n - MFRP-Ab-R2pos50: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode at R2:50)\n - MFRP-RNA-R1: Fixed RNA Profiling (Multiplex, RNA, Probe Barcode on R1)\n - MFRP-Ab-R1: Fixed RNA Profiling (Multiplex, Antibody, Probe Barcode on R1)\n - ARC-v1 for analyzing the Gene Expression portion of Multiome data. If Cell Ranger auto-detects ARC-v1 chemistry, an error is triggered.\nSee https://kb.10xgenomics.com/hc/en-us/articles/115003764132-How-does-Cell-Ranger-auto-detect-chemistry- for more information.\n", + "default" : [ + "auto" + ], + "required" : false, + "choices" : [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3-polyA", + "SC3Pv4", + "SC3Pv4-polyA", + "SC3Pv3LT", + "SC3Pv3HT", + "SC3Pv3HT-polyA", + "SC5P-PE", + "SC5P-PE-v3", + "SC5P-R2", + "SC-FB", + "SC5P-R2-v3", + "SCP5-PE-v3", + "SC5PHT", + "MFRP", + "MFRP-R1", + "MFRP-RNA", + "MFRP-Ab", + "SFRP", + "MFRP-Ab-R2pos50", + "MFRP-RNA-R1", + "MFRP-Ab-R1", + "ARC-v1" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "VDJ related parameters", + "arguments" : [ + { + "type" : "file", + "name" : "--vdj_reference", + "description" : "VDJ refence index built by Cell Ranger mkref.", + "example" : [ + "reference_vdj.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--vdj_inner_enrichment_primers", + "description" : "V(D)J Immune Profiling libraries: if inner enrichment primers other than those provided \nin the 10x Genomics kits are used, they need to be specified here as a\ntext file with one primer per line.\n", + "example" : [ + "enrichment_primers.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vdj_r1_length", + "description" : "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, where N is the user-supplied value.\nNote that the length includes the Barcode and UMI sequences so do not set this below 26.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--vdj_r2_length", + "description" : "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, where N is a user-supplied value. \nTrimming occurs before sequencing metrics are computed and therefore, limiting the length of Read 2 may affect Q30 scores\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "3' Cell multiplexing parameters (CellPlex Multiplexing)", + "arguments" : [ + { + "type" : "string", + "name" : "--cell_multiplex_oligo_ids", + "alternatives" : [ + "--cmo_ids" + ], + "description" : "The Cell Multiplexing oligo IDs used to multiplex this sample. If multiple CMOs were used for a sample,\nseparate IDs with a pipe (e.g., CMO301|CMO302). Required for Cell Multiplexing libraries.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_assignment_confidence", + "description" : "The minimum estimated likelihood to call a sample as tagged with a Cell Multiplexing Oligo (CMO) instead of \\"Unassigned\\".\nUsers may wish to tolerate a higher rate of mis-assignment in order to obtain more singlets to include in their analysis,\nor a lower rate of mis-assignment at the cost of obtaining fewer singlets.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--cmo_set", + "description" : "Path to a custom CMO set CSV file, declaring CMO constructs and associated barcodes. If the default CMO reference IDs that are built into\nthe Cell Ranger software are required, this option does not need to be used.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcode_sample_assignment", + "description" : "Path to a barcode-sample assignment CSV file that specifies the barcodes that belong to each sample.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Hashtag multiplexing parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--hashtag_ids", + "description" : "The hashtag IDs used to multiplex this sample. If multiple antibody hashtags were used for the same sample,\nyou can separate IDs with a pipe.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "On-chip multiplexing parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--ocm_barcode_ids", + "description" : "The OCM barcode IDs used to multiplex this sample. Must be one of OB1, OB2, OB3, OB4.\nIf multiple OCM Barcodes were used for the same sample, you can separate IDs\nwith a pipe (e.g., OB1|OB2).\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Flex multiplexing paramaters", + "arguments" : [ + { + "type" : "file", + "name" : "--probe_set", + "description" : "A probe set reference CSV file. It specifies the sequences used as a reference for probe alignment and the gene ID associated with each probe.\nIt must include 4 columns (probe file format 1.0.0): gene_id,probe_seq,probe_id,included,region and an optional 5th column (probe file format 1.0.1).\n- gene_id: The Ensembl gene identifier targeted by the probe.\n- probe_seq: The nucleotide sequence of the probe, which is complementary to the transcript sequence.\n- probe_id: The probe identifier, whose format is described in Probe identifiers.\n- included: A TRUE or FALSE flag specifying whether the probe is included in the filtered counts matrix output or excluded by the probe filter. \n See filter-probes option of cellranger multi. All probes of a gene must be marked TRUE in the included column for that gene to be included.\n- region: Present only in v1.0.1 probe set reference CSV. The gene boundary targeted by the probe. Accepted values are spliced or unspliced.\n\nThe file also contains a number of required metadata fields in the header in the format #key=value:\n- panel_name: The name of the probe set.\n- panel_type: Always predesigned for predesigned probe sets.\n- reference_genome: The reference genome build used for probe design.\n- reference_version: The version of the Cell Ranger reference transcriptome used for probe design.\n- probe_set_file_format: The version of the probe set file format specification that this file conforms to.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--filter_probes", + "description" : "If 'false', include all non-deprecated probes listed in the probe set reference CSV file.\nIf 'true' or not set, probes that are predicted to have off-target activity to homologous genes are excluded from analysis.\nNot filtering will result in UMI counts from all non-deprecated probes,\nincluding those with predicted off-target activity, to be used in the analysis.\nProbes whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--probe_barcode_ids", + "description" : "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing Barcode IDs. 10x recommends specifying both barcodes (e.g., BC001+AB001)\nwhen an Antibody Capture library is present. The barcode pair order is BC+AB and they\nare separated with a \\"+\\" (no spaces). Alternatively, you can specify the Probe Barcode ID alone and\nCell Ranger's barcode pairing auto-detection algorithm will automatically match to the corresponding Antibody\nMultiplexing Barcode.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--emptydrops_minimum_umis", + "description" : "For singleplex Flex experiments, use this option to adjust the UMI cutoff during the second step of cell calling.\nCell Ranger will still perform the full cell calling process but will only evaluate barcodes with UMIs above\nthe threshold you specify.\n", + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Antigen Capture (BEAM) libary arguments", + "description" : "These arguments are recommended if an Antigen Capture (BEAM) library is present. \nIt is needed to calculate the antigen specificity score.\n", + "arguments" : [ + { + "type" : "string", + "name" : "--control_id", + "description" : "A user-defined ID for any negative controls used in the T/BCR Antigen Capture assay. Must match id specified in the feature reference CSV.\nMay only include ASCII characters and must not use whitespace, slash, quote, or comma characters. \nEach ID must be unique and must not collide with a gene identifier from the transcriptome.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--mhc_allele", + "description" : "The MHC allele for TCR Antigen Capture libraries. Must match mhc_allele name specified in the Feature Reference CSV.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "General arguments", + "description" : "These arguments are applicable to all library types.\n", + "arguments" : [ + { + "type" : "boolean", + "name" : "--check_library_compatibility", + "description" : "Optional. This option allows users to disable the check that evaluates 10x Barcode overlap between\nibraries when multiple libraries are specified (e.g., Gene Expression + Antibody Capture). Setting\nthis option to false will disable the check across all library combinations. We recommend running\nthis check (default), however if the pipeline errors out, users can bypass the check to generate\noutputs for troubleshooting.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output_raw", + "description" : "The raw output folder.", + "example" : [ + "output_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_h5mu", + "description" : "Locations for the output files. Must contain a wildcard (*) character,\nwhich will be replaced with the sample name.\n", + "example" : [ + "*.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_metrics", + "description" : "Name of the .uns slot under which to QC metrics (if any).", + "default" : [ + "metrics_cellranger" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline for running Cell Ranger multi.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv/raw/", + "dest" : "10x_5k_anticmv/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_fixed/raw/", + "dest" : "10x_5k_fixed/raw/" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "info" : { + "name" : "Cell Ranger multi", + "test_dependencies" : [ + { + "name" : "cellranger_multi_test", + "namespace" : "test_workflows/ingestion" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "mapping/cellranger_multi", + "alias" : "cellranger_multi_component", + "repository" : { + "type" : "local" + } + }, + { + "name" : "convert/from_cellranger_multi_to_h5mu", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/cellranger_multi/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/cellranger_multi", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cellranger_multi as cellranger_multi_component_viashalias } from "${meta.resources_dir}/../../../../nextflow/mapping/cellranger_multi/main.nf" +cellranger_multi_component = cellranger_multi_component_viashalias.run(key: "cellranger_multi_component") +include { from_cellranger_multi_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_cellranger_multi_to_h5mu/main.nf" + +// inner workflow +// user-provided Nextflow code +process cellrangerMultiStub { + input: + tuple val(id), val(unused) + + output: + tuple val(id), path("raw_dir"), path("*.h5mu"), path("samples.csv") + + script: + """ + echo "This process is not meant to be run without -stub being defined." + exit 1 + """ + + stub: + """ + mkdir raw_dir + touch ${id}_sample_1.h5mu + touch ${id}_sample_2.h5mu + touch ${id}_sample_3.h5mu + echo -e "sample_name,file\n${id}_sample_1,${id}_sample_1.h5mu\n${id}_sample_2,${id}_sample_2.h5mu\n${id}_sample_3,${id}_sample_3.h5mu" > samples.csv + """ +} + +workflow run_wf { + take: + input_ch + + main: + cellranger_ch = input_ch + | cellranger_multi_component.run( + fromState: [ + "input": "input", + "gex_input": "gex_input", + "abc_input": "abc_input", + "cgc_input": "cgc_input", + "mux_input": "mux_input", + "vdj_input": "vdj_input", + "vdj_t_input": "vdj_t_input", + "vdj_t_gd_input": "vdj_t_gd_input", + "vdj_b_input": "vdj_b_input", + "agc_input": "agc_input", + "library_id": "library_id", + "library_type": "library_type", + "library_subsample": "library_subsample", + "library_lanes": "library_lanes", + "library_chemistry": "library_chemistry", + "sample_ids": "sample_ids", + "sample_description": "sample_description", + "sample_expect_cells": "sample_expect_cells", + "sample_force_cells": "sample_force_cells", + "feature_reference": "feature_reference", + "feature_r1_length": "feature_r1_length", + "feature_r2_length": "feature_r2_length", + "gex_reference": "gex_reference", + "gex_secondary_analysis": "gex_secondary_analysis", + "gex_generate_bam": "gex_generate_bam", + "gex_expect_cells": "gex_expect_cells", + "gex_force_cells": "gex_force_cells", + "gex_include_introns": "gex_include_introns", + "gex_r1_length": "gex_r1_length", + "gex_r2_length": "gex_r2_length", + "gex_chemistry": "gex_chemistry", + "vdj_reference": "vdj_reference", + "vdj_inner_enrichment_primers": "vdj_inner_enrichment_primers", + "vdj_r1_length": "vdj_r1_length", + "vdj_r2_length": "vdj_r2_length", + "cell_multiplex_oligo_ids": "cell_multiplex_oligo_ids", + "min_assignment_confidence": "min_assignment_confidence", + "cmo_set": "cmo_set", + "barcode_sample_assignment": "barcode_sample_assignment", + "probe_set": "probe_set", + "filter_probes": "filter_probes", + "probe_barcode_ids": "probe_barcode_ids", + "control_id": "control_id", + "mhc_allele": "mhc_allele", + "check_library_compatibility": "check_library_compatibility", + "output": "output_raw", + ], + toState: [ + "output_raw": "output", + "input": "output" + ] + ) + + h5mu_ch = cellranger_ch + | from_cellranger_multi_to_h5mu.run( + fromState: {id, state -> + [ + "input": state.input, + "uns_metrics": state.uns_metrics, + "output_compression": "gzip" + ] + }, + toState: {id, output, state -> + [ + "sample_csv": output.sample_csv, + "output_h5mu": output.output, + "output_raw": state.output_raw + ] + }, + filter: {!workflow.stubRun}, + ) + + h5mu_stub_ch = cellranger_ch + | filter{workflow.stubRun} + | map {id, state -> [id, state.input]} + | cellrangerMultiStub + | map {id, raw_dir, output, output_files -> + [id, ["output_raw": raw_dir, "output_h5mu": output, "sample_csv": output_files]] + } + + output_ch = h5mu_stub_ch.concat(h5mu_ch) + | flatMap {id, state -> + def h5mu_list = state.output_h5mu + def samples = readCsv(state.sample_csv.toUriString()) + println "Samples: $samples" + def result = h5mu_list.collect{ h5mu_file -> + println "H5mu: ${h5mu_file}, getName: ${h5mu_file.getName()}" + def corresponding_csv_entry = samples.find{h5mu_file.getName() == it.file} + print "CSV entry: $corresponding_csv_entry" + // The cellranger component used to only be able to output a single h5mu file + // In cases where cell multiplexing is not used (1 output sample), it uses 'run' for the sample ID as a dummy. + // This sample ID 'run' was never used for the ID of the channel events. + // So here we overwrite this 'run' id with the name of the input event. + def new_id = h5mu_list.size() == 1 ? id : corresponding_csv_entry.sample_name + return [ new_id, ["output_h5mu": h5mu_file, "output_raw": state.output_raw, "_meta": ["join_id": id]]] + } + return result + } + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/nextflow.config b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow.config new file mode 100644 index 00000000..4fc5d72d --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/cellranger_multi' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline for running Cell Ranger multi.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_labels.config b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_schema.json b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_schema.json new file mode 100644 index 00000000..6c19b0fc --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/nextflow_schema.json @@ -0,0 +1,571 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_multi", + "description": "A pipeline for running Cell Ranger multi.", + "type": "object", + "$defs": { + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output_raw": { + "type": "string", + "format": "path", + "description": "The raw output folder.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_raw\"`, direction: `output`, example: `\"output_dir\"`. ", + "default": "$id.$key.output_raw" + }, + "output_h5mu": { + "type": "string", + "format": "path", + "description": "Locations for the output files", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_h5mu.h5mu\"`, direction: `output`, example: `\"*.h5mu\"`. ", + "default": "$id.$key.output_h5mu.h5mu" + }, + "uns_metrics": { + "type": "string", + "description": "Name of the .uns slot under which to QC metrics (if any).", + "help_text": "Type: `string`, multiple: `False`, default: `\"metrics_cellranger\"`. ", + "default": "metrics_cellranger" + } + } + }, + "input files": { + "title": "Input files", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + } + } + }, + "feature type-specific input files": { + "title": "Feature type-specific input files", + "type": "object", + "description": "Helper functionality to allow feature type-specific input files, without the need to specify\nlibrary_type or library_id. The library_id will be inferred from the input paths.\n", + "properties": { + "gex_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Gene Expression", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "abc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Antibody Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "cgc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for CRISPR Guide Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "mux_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Multiplexing Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_t_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-T", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_t_gd_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-T-GD", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "vdj_b_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for VDJ-B", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + }, + "agc_input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "The FASTQ files to be analyzed for Antigen Capture", + "help_text": "Type: `file`, multiple: `True`, direction: `input`, example: `[\"mysample_S1_L001_R1_001.fastq.gz\";\"mysample_S1_L001_R2_001.fastq.gz\"]`. " + } + } + }, + "library arguments": { + "title": "Library arguments", + "type": "object", + "description": "No description", + "properties": { + "library_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Illumina sample name to analyze", + "help_text": "Type: `string`, multiple: `True`, example: `[\"mysample1\"]`. " + }, + "library_type": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The underlying feature type of the library.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"Gene Expression\"]`, choices: ``Gene Expression`, `VDJ`, `VDJ-T`, `VDJ-B`, `VDJ-T-GD`, `Antibody Capture`, `CRISPR Guide Capture`, `Multiplexing Capture`, `Antigen Capture`, `Custom``. " + }, + "library_subsample": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The rate at which reads from the provided FASTQ files are sampled.\nMust be strictly greater than 0 and less than or equal to 1.\n", + "help_text": "Type: `string`, multiple: `True`, example: `[\"0.5\"]`. " + }, + "library_lanes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Lanes associated with this sample", + "help_text": "Type: `string`, multiple: `True`, example: `[\"1-4\"]`. " + }, + "library_chemistry": { + "type": "string", + "description": "Only applicable to FRP", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "sample parameters": { + "title": "Sample parameters", + "type": "object", + "description": "No description", + "properties": { + "sample_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A name to identify a multiplexed sample", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sample_description": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A description for the sample.", + "help_text": "Type: `string`, multiple: `True`. " + }, + "sample_expect_cells": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "help_text": "Type: `integer`, multiple: `True`, example: `[3000]`. " + }, + "sample_force_cells": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Force pipeline to use this number of cells, bypassing cell detection.\n", + "help_text": "Type: `integer`, multiple: `True`, example: `[3000]`. " + } + } + }, + "feature barcode library specific arguments": { + "title": "Feature Barcode library specific arguments", + "type": "object", + "description": "No description", + "properties": { + "feature_reference": { + "type": "string", + "format": "path", + "description": "Path to the Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes.\nRequired only for Antibody Capture or CRISPR Guide Capture libraries.\nSee https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/feature-bc-analysis#feature-ref for more information.\"\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"feature_reference.csv\"`. " + }, + "feature_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "feature_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "min_crispr_umi": { + "type": "integer", + "description": "Set the minimum number of CRISPR guide RNA UMIs required for protospacer detection.\nIf a lower or higher sensitivity is desired for detection, this value can be customized\naccording to specific experimental needs", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "gene expression arguments": { + "title": "Gene expression arguments", + "type": "object", + "description": "Arguments relevant to the analysis of gene expression data.", + "properties": { + "gex_reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Genome refence index built by Cell Ranger mkref.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"reference_genome.tar.gz\"`. " + }, + "gex_secondary_analysis": { + "type": "boolean", + "description": "Whether or not to run the secondary analysis e.g", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "gex_generate_bam": { + "type": "boolean", + "description": "Whether to generate a BAM file.", + "help_text": "Type: `boolean`, multiple: `False`, default: `false`. ", + "default": false + }, + "tenx_cloud_token_path": { + "type": "string", + "format": "path", + "description": "The 10x Cloud Analysis user token used to enable cell annotation.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "cell_annotation_model": { + "type": "string", + "description": "\"Cell annotation model to use", + "help_text": "Type: `string`, multiple: `False`, choices: ``auto`, `human_pca_v1_beta`, `mouse_pca_v1_beta``. ", + "enum": [ + "auto", + "human_pca_v1_beta", + "mouse_pca_v1_beta" + ] + }, + "gex_expect_cells": { + "type": "integer", + "description": "Expected number of recovered cells, used as input to cell calling algorithm.\n", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "gex_force_cells": { + "type": "integer", + "description": "Force pipeline to use this number of cells, bypassing cell detection.\n", + "help_text": "Type: `integer`, multiple: `False`, example: `3000`. " + }, + "gex_include_introns": { + "type": "boolean", + "description": "Whether or not to include intronic reads in counts.\nThis option does not apply to Fixed RNA Profiling analysis.\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "gex_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases,\nwhere N is the user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "gex_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases,\nwhere N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "gex_chemistry": { + "type": "string", + "description": "Assay configuration", + "help_text": "Type: `string`, multiple: `False`, default: `\"auto\"`, choices: ``auto`, `threeprime`, `fiveprime`, `SC3Pv1`, `SC3Pv2`, `SC3Pv3`, `SC3Pv3-polyA`, `SC3Pv4`, `SC3Pv4-polyA`, `SC3Pv3LT`, `SC3Pv3HT`, `SC3Pv3HT-polyA`, `SC5P-PE`, `SC5P-PE-v3`, `SC5P-R2`, `SC-FB`, `SC5P-R2-v3`, `SCP5-PE-v3`, `SC5PHT`, `MFRP`, `MFRP-R1`, `MFRP-RNA`, `MFRP-Ab`, `SFRP`, `MFRP-Ab-R2pos50`, `MFRP-RNA-R1`, `MFRP-Ab-R1`, `ARC-v1``. ", + "enum": [ + "auto", + "threeprime", + "fiveprime", + "SC3Pv1", + "SC3Pv2", + "SC3Pv3", + "SC3Pv3-polyA", + "SC3Pv4", + "SC3Pv4-polyA", + "SC3Pv3LT", + "SC3Pv3HT", + "SC3Pv3HT-polyA", + "SC5P-PE", + "SC5P-PE-v3", + "SC5P-R2", + "SC-FB", + "SC5P-R2-v3", + "SCP5-PE-v3", + "SC5PHT", + "MFRP", + "MFRP-R1", + "MFRP-RNA", + "MFRP-Ab", + "SFRP", + "MFRP-Ab-R2pos50", + "MFRP-RNA-R1", + "MFRP-Ab-R1", + "ARC-v1" + ], + "default": "auto" + } + } + }, + "vdj related parameters": { + "title": "VDJ related parameters", + "type": "object", + "description": "No description", + "properties": { + "vdj_reference": { + "type": "string", + "format": "path", + "description": "VDJ refence index built by Cell Ranger mkref.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference_vdj.tar.gz\"`. " + }, + "vdj_inner_enrichment_primers": { + "type": "string", + "format": "path", + "description": "V(D)J Immune Profiling libraries: if inner enrichment primers other than those provided \nin the 10x Genomics kits are used, they need to be specified here as a\ntext file with one primer per line.\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"enrichment_primers.txt\"`. " + }, + "vdj_r1_length": { + "type": "integer", + "description": "Limit the length of the input Read 1 sequence of V(D)J libraries to the first N bases, where N is the user-supplied value.\nNote that the length includes the Barcode and UMI sequences so do not set this below 26.\n", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "vdj_r2_length": { + "type": "integer", + "description": "Limit the length of the input Read 2 sequence of V(D)J libraries to the first N bases, where N is a user-supplied value", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "3' cell multiplexing parameters (cellplex multiplexing)": { + "title": "3' Cell multiplexing parameters (CellPlex Multiplexing)", + "type": "object", + "description": "No description", + "properties": { + "cell_multiplex_oligo_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Cell Multiplexing oligo IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + }, + "min_assignment_confidence": { + "type": "number", + "description": "The minimum estimated likelihood to call a sample as tagged with a Cell Multiplexing Oligo (CMO) instead of \"Unassigned\".\nUsers may wish to tolerate a higher rate of mis-assignment in order to obtain more singlets to include in their analysis,\nor a lower rate of mis-assignment at the cost of obtaining fewer singlets.\n", + "help_text": "Type: `double`, multiple: `False`. " + }, + "cmo_set": { + "type": "string", + "format": "path", + "description": "Path to a custom CMO set CSV file, declaring CMO constructs and associated barcodes", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "barcode_sample_assignment": { + "type": "string", + "format": "path", + "description": "Path to a barcode-sample assignment CSV file that specifies the barcodes that belong to each sample.\n", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + } + } + }, + "hashtag multiplexing parameters": { + "title": "Hashtag multiplexing parameters", + "type": "object", + "description": "No description", + "properties": { + "hashtag_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The hashtag IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "on-chip multiplexing parameters": { + "title": "On-chip multiplexing parameters", + "type": "object", + "description": "No description", + "properties": { + "ocm_barcode_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The OCM barcode IDs used to multiplex this sample", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "flex multiplexing paramaters": { + "title": "Flex multiplexing paramaters", + "type": "object", + "description": "No description", + "properties": { + "probe_set": { + "type": "string", + "format": "path", + "description": "A probe set reference CSV file", + "help_text": "Type: `file`, multiple: `False`, direction: `input`. " + }, + "filter_probes": { + "type": "boolean", + "description": "If 'false', include all non-deprecated probes listed in the probe set reference CSV file.\nIf 'true' or not set, probes that are predicted to have off-target activity to homologous genes are excluded from analysis.\nNot filtering will result in UMI counts from all non-deprecated probes,\nincluding those with predicted off-target activity, to be used in the analysis.\nProbes whose ID is prefixed with DEPRECATED are always excluded from the analysis.\n", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "probe_barcode_ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The Fixed RNA Probe Barcode ID used for this sample, and for multiplex GEX + Antibody Capture libraries,\nthe corresponding Antibody Multiplexing Barcode IDs", + "help_text": "Type: `string`, multiple: `True`. " + }, + "emptydrops_minimum_umis": { + "type": "integer", + "description": "For singleplex Flex experiments, use this option to adjust the UMI cutoff during the second step of cell calling.\nCell Ranger will still perform the full cell calling process but will only evaluate barcodes with UMIs above\nthe threshold you specify.\n", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "antigen capture (beam) libary arguments": { + "title": "Antigen Capture (BEAM) libary arguments", + "type": "object", + "description": "These arguments are recommended if an Antigen Capture (BEAM) library is present. \nIt is needed to calculate the antigen specificity score.\n", + "properties": { + "control_id": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A user-defined ID for any negative controls used in the T/BCR Antigen Capture assay", + "help_text": "Type: `string`, multiple: `True`. " + }, + "mhc_allele": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The MHC allele for TCR Antigen Capture libraries", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "general arguments": { + "title": "General arguments", + "type": "object", + "description": "These arguments are applicable to all library types.\n", + "properties": { + "check_library_compatibility": { + "type": "boolean", + "description": "Optional", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/input files" + }, + { + "$ref": "#/$defs/feature type-specific input files" + }, + { + "$ref": "#/$defs/library arguments" + }, + { + "$ref": "#/$defs/sample parameters" + }, + { + "$ref": "#/$defs/feature barcode library specific arguments" + }, + { + "$ref": "#/$defs/gene expression arguments" + }, + { + "$ref": "#/$defs/vdj related parameters" + }, + { + "$ref": "#/$defs/3' cell multiplexing parameters (cellplex multiplexing)" + }, + { + "$ref": "#/$defs/hashtag multiplexing parameters" + }, + { + "$ref": "#/$defs/on-chip multiplexing parameters" + }, + { + "$ref": "#/$defs/flex multiplexing paramaters" + }, + { + "$ref": "#/$defs/antigen capture (beam) libary arguments" + }, + { + "$ref": "#/$defs/general arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/cellranger_multi/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/utils/integration_tests.config b/target/nextflow/workflows/ingestion/cellranger_multi/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels.config b/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels_ci.config b/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_multi/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/.config.vsh.yaml b/target/nextflow/workflows/ingestion/cellranger_postprocessing/.config.vsh.yaml new file mode 100644 index 00000000..2d4661e9 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/.config.vsh.yaml @@ -0,0 +1,287 @@ +name: "cellranger_postprocessing" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Input h5mu file created by running Cell Ranger and converting its\ + \ output to h5mu." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The converted h5mu file." + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Correction arguments" + arguments: + - type: "boolean_true" + name: "--perform_correction" + description: "Whether or not to run CellBender to perform count correction." + info: null + direction: "input" + - type: "integer" + name: "--cellbender_epochs" + description: "Number of epochs to run CellBender for." + info: null + default: + - 150 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Filtering arguments" + arguments: + - type: "integer" + name: "--min_genes" + description: "Minimum number of counts required for a cell to pass filtering." + info: null + example: + - 100 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_counts" + description: "Minimum number of genes expressed required for a cell to pass filtering." + info: null + example: + - 1000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Post-processing Cell Ranger datasets." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + name: "Cell Ranger post-processing" + test_dependencies: + - name: "from_10xh5_to_h5mu" + namespace: "convert" + - name: "cellranger_postprocessing_test" + namespace: "test_workflows/ingestion" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "correction/cellbender_remove_background" + repository: + type: "local" +- name: "filter/filter_with_counts" + repository: + type: "local" +- name: "filter/subset_h5mu" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/cellranger_postprocessing/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/cellranger_postprocessing" + executable: "target/nextflow/workflows/ingestion/cellranger_postprocessing/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/correction/cellbender_remove_background" + - "target/nextflow/filter/filter_with_counts" + - "target/nextflow/filter/subset_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/main.nf b/target/nextflow/workflows/ingestion/cellranger_postprocessing/main.nf new file mode 100644 index 00000000..e8abe3a2 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/main.nf @@ -0,0 +1,3618 @@ +// cellranger_postprocessing main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cellranger_postprocessing", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Input h5mu file created by running Cell Ranger and converting its output to h5mu.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The converted h5mu file.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Correction arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--perform_correction", + "description" : "Whether or not to run CellBender to perform count correction.", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--cellbender_epochs", + "description" : "Number of epochs to run CellBender for.", + "default" : [ + 150 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_genes", + "description" : "Minimum number of counts required for a cell to pass filtering.", + "example" : [ + 100 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of genes expressed required for a cell to pass filtering.", + "example" : [ + 1000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Post-processing Cell Ranger datasets.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "name" : "Cell Ranger post-processing", + "test_dependencies" : [ + { + "name" : "from_10xh5_to_h5mu", + "namespace" : "convert" + }, + { + "name" : "cellranger_postprocessing_test", + "namespace" : "test_workflows/ingestion" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "correction/cellbender_remove_background", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/filter_with_counts", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/subset_h5mu", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/cellranger_postprocessing/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/cellranger_postprocessing", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cellbender_remove_background } from "${meta.resources_dir}/../../../../nextflow/correction/cellbender_remove_background/main.nf" +include { filter_with_counts } from "${meta.resources_dir}/../../../../nextflow/filter/filter_with_counts/main.nf" +include { subset_h5mu } from "${meta.resources_dir}/../../../../nextflow/filter/subset_h5mu/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map{id, state -> + def output_state + if (!workflow.stubRun) { + assert (state.perform_correction || state.min_genes != null || state.min_counts != null): + "Either perform_correct, min_genes or min_counts should be specified!" + output_state = state + } else { + output_state = state + ["perform_correction": true, "min_genes": 1, "min_counts": 1] + } + [id, output_state] + } + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // perform correction if so desired + | cellbender_remove_background.run( + runIf: {id, state -> state.perform_correction}, + fromState: { id, state -> + [ + input: state.input, + epochs: state.cellbender_epochs, + output_layer: "cellbender_corrected", + output_compression: "gzip", + output: state.workflow_output, + ] + }, + toState: { id, output, state -> + state + ["input": output.output, "layer": "cellbender_corrected"] + } + ) + | filter_with_counts.run( + runIf: {id, state -> + state.min_genes != null || state.min_counts != null + }, + fromState: { id, state -> + [ + input: state.input, + min_genes: state.min_genes, + min_counts: state.min_counts, + layer: state.layer, + output_compression: "gzip", + do_subset: true, + output: state.workflow_output, + ] + }, + toState: [input: "output"] + ) + // Make sure to use the correct ouput file names, + // irrespective of which component(s) (or combinations of then) + // were run. The above components + // should put their output into 'input' + | map {id, state -> + [id, ["output": state.input]] + } + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow.config new file mode 100644 index 00000000..1094f065 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/cellranger_postprocessing' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Post-processing Cell Ranger datasets.' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_labels.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_schema.json b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_schema.json new file mode 100644 index 00000000..b09e10a2 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/nextflow_schema.json @@ -0,0 +1,106 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "cellranger_postprocessing", + "description": "Post-processing Cell Ranger datasets.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file created by running Cell Ranger and converting its output to h5mu.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The converted h5mu file.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + } + } + }, + "correction arguments": { + "title": "Correction arguments", + "type": "object", + "description": "No description", + "properties": { + "perform_correction": { + "type": "boolean", + "description": "Whether or not to run CellBender to perform count correction.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "cellbender_epochs": { + "type": "integer", + "description": "Number of epochs to run CellBender for.", + "help_text": "Type: `integer`, multiple: `False`, default: `150`. ", + "default": 150 + } + } + }, + "filtering arguments": { + "title": "Filtering arguments", + "type": "object", + "description": "No description", + "properties": { + "min_genes": { + "type": "integer", + "description": "Minimum number of counts required for a cell to pass filtering.", + "help_text": "Type: `integer`, multiple: `False`, example: `100`. " + }, + "min_counts": { + "type": "integer", + "description": "Minimum number of genes expressed required for a cell to pass filtering.", + "help_text": "Type: `integer`, multiple: `False`, example: `1000`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/correction arguments" + }, + { + "$ref": "#/$defs/filtering arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/integration_tests.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels_ci.config b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/cellranger_postprocessing/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/conversion/.config.vsh.yaml b/target/nextflow/workflows/ingestion/conversion/.config.vsh.yaml new file mode 100644 index 00000000..304e4e8c --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/.config.vsh.yaml @@ -0,0 +1,263 @@ +name: "conversion" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the sample." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_type" + alternatives: + - "-t" + description: "Type of the input file" + info: null + required: true + choices: + - "10xmtx" + - "10xh5" + - "h5ad" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Name or template for the output files." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Conversion from h5ad" + arguments: + - type: "string" + name: "--modality" + description: "Name of the modality where the h5ad is stored in the h5mu object." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline to convert different file formats to .h5mu." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + name: "Convert to MuData" + test_dependencies: + - name: "conversion_test" + namespace: "test_workflows/ingestion" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "convert/from_10xh5_to_h5mu" + repository: + type: "local" +- name: "convert/from_10xmtx_to_h5mu" + repository: + type: "local" +- name: "convert/from_h5ad_to_h5mu" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/conversion/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/conversion" + executable: "target/nextflow/workflows/ingestion/conversion/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/convert/from_10xh5_to_h5mu" + - "target/nextflow/convert/from_10xmtx_to_h5mu" + - "target/nextflow/convert/from_h5ad_to_h5mu" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/conversion/main.nf b/target/nextflow/workflows/ingestion/conversion/main.nf new file mode 100644 index 00000000..69985156 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/main.nf @@ -0,0 +1,3550 @@ +// conversion main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) +// * Dries De Maeyer (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "conversion", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the sample.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_type", + "alternatives" : [ + "-t" + ], + "description" : "Type of the input file", + "required" : true, + "choices" : [ + "10xmtx", + "10xh5", + "h5ad" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Name or template for the output files.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Conversion from h5ad", + "arguments" : [ + { + "type" : "string", + "name" : "--modality", + "description" : "Name of the modality where the h5ad is stored in the h5mu object.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline to convert different file formats to .h5mu.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "name" : "Convert to MuData", + "test_dependencies" : [ + { + "name" : "conversion_test", + "namespace" : "test_workflows/ingestion" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "convert/from_10xh5_to_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "convert/from_10xmtx_to_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "convert/from_h5ad_to_h5mu", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/conversion/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/conversion", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { from_10xh5_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_10xh5_to_h5mu/main.nf" +include { from_10xmtx_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_10xmtx_to_h5mu/main.nf" +include { from_h5ad_to_h5mu } from "${meta.resources_dir}/../../../../nextflow/convert/from_h5ad_to_h5mu/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | runEach( + components: [from_10xh5_to_h5mu, from_h5ad_to_h5mu, from_10xmtx_to_h5mu], + filter: { id, state, component -> + def componentNameMapper = [ + "10xh5": "from_10xh5_to_h5mu", + "10xmtx": "from_10xmtx_to_h5mu", + "h5ad": "from_h5ad_to_h5mu" + ] + componentNameMapper[state.input_type] == component.config.name + }, + fromState: { id, state, component -> + def passed_state = [ + input: state.input, + compression: "gzip", + output: state.output + ] + if (component.name == "from_h5ad_to_h5mu") { + passed_state.modality = state.modality + } + passed_state + }, + toState: ["output": "output"] + ) + | setState(["output": "output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/conversion/nextflow.config b/target/nextflow/workflows/ingestion/conversion/nextflow.config new file mode 100644 index 00000000..ce40df85 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/conversion' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline to convert different file formats to .h5mu.' + author = 'Dries Schaumont, Dries De Maeyer' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/conversion/nextflow_labels.config b/target/nextflow/workflows/ingestion/conversion/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/conversion/nextflow_schema.json b/target/nextflow/workflows/ingestion/conversion/nextflow_schema.json new file mode 100644 index 00000000..ec81d794 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/nextflow_schema.json @@ -0,0 +1,92 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "conversion", + "description": "A pipeline to convert different file formats to .h5mu.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "input_type": { + "type": "string", + "description": "Type of the input file", + "help_text": "Type: `string`, multiple: `False`, required, choices: ``10xmtx`, `10xh5`, `h5ad``. ", + "enum": [ + "10xmtx", + "10xh5", + "h5ad" + ] + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Name or template for the output files.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "conversion from h5ad": { + "title": "Conversion from h5ad", + "type": "object", + "description": "No description", + "properties": { + "modality": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Name of the modality where the h5ad is stored in the h5mu object.", + "help_text": "Type: `string`, multiple: `True`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/conversion from h5ad" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/conversion/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/conversion/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/conversion/utils/integration_tests.config b/target/nextflow/workflows/ingestion/conversion/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/conversion/utils/labels.config b/target/nextflow/workflows/ingestion/conversion/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/conversion/utils/labels_ci.config b/target/nextflow/workflows/ingestion/conversion/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/conversion/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/demux/.config.vsh.yaml b/target/nextflow/workflows/ingestion/demux/.config.vsh.yaml new file mode 100644 index 00000000..be294014 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/.config.vsh.yaml @@ -0,0 +1,348 @@ +name: "demux" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +- name: "Marijke Van Moerbeke" + roles: + - "author" + info: + role: "Contributor" + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Samuel D'Souza" + roles: + - "author" + info: + role: "Contributor" + links: + github: "srdsam" + linkedin: "samuel-d-souza-887023150/" + organizations: + - name: "Chan Zuckerberg Biohub" + href: "https://www.czbiohub.org" + role: "Data Engineer" +- name: "Robrecht Cannoodt" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Input run directory" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--sample_sheet" + alternatives: + - "-s" + description: "Pointer to the sample sheet" + info: null + example: + - "bcl_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--demultiplexer" + description: "The multiplexer to use, one of bclconvert or mkfastq" + info: null + default: + - "bcl2fastq" + required: false + choices: + - "bclconvert" + - "bcl2fastq" + - "mkfastq" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--ignore_missing" + description: "Should the demultiplexer ignore missing entities (filter, ...)" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_fastq" + description: "Output directory containig fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_fastqc" + description: "Reports directory produced by FastQC" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_multiqc" + description: "Reports directory produced by MultiQC" + info: null + example: + - "reports_dir" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Convert `.bcl` files to `.fastq` files using bcl2fastq, bcl-convert\ + \ or Cell Ranger mkfastq.\n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "cellranger_tiny_bcl" +info: + name: "Demux" + short_description: "A generic pipeline for running bcl2fastq, bcl-convert or Cell\ + \ Ranger mkfastq." +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "demux/cellranger_mkfastq" + repository: + type: "local" +- name: "demux/bcl_convert" + repository: + type: "local" +- name: "demux/bcl2fastq" + repository: + type: "local" +- name: "qc/fastqc" + repository: + type: "local" +- name: "qc/multiqc" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/demux/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/demux" + executable: "target/nextflow/workflows/ingestion/demux/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/demux/cellranger_mkfastq" + - "target/nextflow/demux/bcl_convert" + - "target/nextflow/demux/bcl2fastq" + - "target/nextflow/qc/fastqc" + - "target/nextflow/qc/multiqc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/demux/main.nf b/target/nextflow/workflows/ingestion/demux/main.nf new file mode 100644 index 00000000..6f6c678c --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/main.nf @@ -0,0 +1,3699 @@ +// demux main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Toni Verbeiren (author, maintainer) +// * Marijke Van Moerbeke (author) +// * Angela Oliveira Pisco (author) +// * Samuel D'Souza (author) +// * Robrecht Cannoodt (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "demux", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + }, + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Samuel D'Souza", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "srdsam", + "linkedin" : "samuel-d-souza-887023150/" + }, + "organizations" : [ + { + "name" : "Chan Zuckerberg Biohub", + "href" : "https://www.czbiohub.org", + "role" : "Data Engineer" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Input run directory", + "example" : [ + "bcl_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--sample_sheet", + "alternatives" : [ + "-s" + ], + "description" : "Pointer to the sample sheet", + "example" : [ + "bcl_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--demultiplexer", + "description" : "The multiplexer to use, one of bclconvert or mkfastq", + "default" : [ + "bcl2fastq" + ], + "required" : false, + "choices" : [ + "bclconvert", + "bcl2fastq", + "mkfastq" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--ignore_missing", + "description" : "Should the demultiplexer ignore missing entities (filter, ...)", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_fastq", + "description" : "Output directory containig fastq files", + "example" : [ + "fastq_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_fastqc", + "description" : "Reports directory produced by FastQC", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_multiqc", + "description" : "Reports directory produced by MultiQC", + "example" : [ + "reports_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Convert `.bcl` files to `.fastq` files using bcl2fastq, bcl-convert or Cell Ranger mkfastq.\n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/cellranger_tiny_bcl" + } + ], + "info" : { + "name" : "Demux", + "short_description" : "A generic pipeline for running bcl2fastq, bcl-convert or Cell Ranger mkfastq." + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "demux/cellranger_mkfastq", + "repository" : { + "type" : "local" + } + }, + { + "name" : "demux/bcl_convert", + "repository" : { + "type" : "local" + } + }, + { + "name" : "demux/bcl2fastq", + "repository" : { + "type" : "local" + } + }, + { + "name" : "qc/fastqc", + "repository" : { + "type" : "local" + } + }, + { + "name" : "qc/multiqc", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/demux/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/demux", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cellranger_mkfastq } from "${meta.resources_dir}/../../../../nextflow/demux/cellranger_mkfastq/main.nf" +include { bcl_convert } from "${meta.resources_dir}/../../../../nextflow/demux/bcl_convert/main.nf" +include { bcl2fastq } from "${meta.resources_dir}/../../../../nextflow/demux/bcl2fastq/main.nf" +include { fastqc } from "${meta.resources_dir}/../../../../nextflow/qc/fastqc/main.nf" +include { multiqc } from "${meta.resources_dir}/../../../../nextflow/qc/multiqc/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // run the demultiplexers + | runEach( + components: [cellranger_mkfastq, bcl_convert, bcl2fastq], + filter: { id, state, component -> + def funcNameMapper = [ + "bclconvert": "bcl_convert", + "bcl2fastq": "bcl2fastq", + "mkfastq": "cellranger_mkfastq" + ] + funcNameMapper[state.demultiplexer] == component.config.name + }, + fromState: { id, state, component -> + def data = [ + input: state.input, + sample_sheet: state.sample_sheet, + reports: null // disable reports so they end up in the output dir + ] + if (component.config.name== "bcl2fastq") { + data.ignore_missing = state.ignore_missing + } + data + }, + toState: [ + "input": "output", + "output_fastq": "output" + ] + ) + + // run fastqc + | fastqc.run( + fromState: [ + "input": "input", + "output": "output_fastqc" + ], + args: [mode: "dir"], + toState: [ + "output_fastqc": "output", + "input": "output" + ] + ) + + // run multiqc + | multiqc.run( + fromState: { id, state -> + [ + "input": [state.input], + "output": state.output_multiqc + ] + }, + toState: ["output_multiqc": "output"] + ) + // subset state to the outputs + | setState(["output_fastq", "output_fastqc", "output_multiqc"]) + + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/demux/nextflow.config b/target/nextflow/workflows/ingestion/demux/nextflow.config new file mode 100644 index 00000000..6a28b40b --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/demux' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Convert `.bcl` files to `.fastq` files using bcl2fastq, bcl-convert or Cell Ranger mkfastq.\n' + author = 'Toni Verbeiren, Marijke Van Moerbeke, Angela Oliveira Pisco, Samuel D\'Souza, Robrecht Cannoodt' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/demux/nextflow_labels.config b/target/nextflow/workflows/ingestion/demux/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/demux/nextflow_schema.json b/target/nextflow/workflows/ingestion/demux/nextflow_schema.json new file mode 100644 index 00000000..810f2b43 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/nextflow_schema.json @@ -0,0 +1,91 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "demux", + "description": "Convert `.bcl` files to `.fastq` files using bcl2fastq, bcl-convert or Cell Ranger mkfastq.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input run directory", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"bcl_dir\"`. " + }, + "sample_sheet": { + "type": "string", + "format": "path", + "exists": true, + "description": "Pointer to the sample sheet", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"bcl_dir\"`. " + }, + "demultiplexer": { + "type": "string", + "description": "The multiplexer to use, one of bclconvert or mkfastq", + "help_text": "Type: `string`, multiple: `False`, default: `\"bcl2fastq\"`, choices: ``bclconvert`, `bcl2fastq`, `mkfastq``. ", + "enum": [ + "bclconvert", + "bcl2fastq", + "mkfastq" + ], + "default": "bcl2fastq" + }, + "ignore_missing": { + "type": "boolean", + "description": "Should the demultiplexer ignore missing entities (filter, ...)", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "output_fastq": { + "type": "string", + "format": "path", + "description": "Output directory containig fastq files", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_fastq\"`, direction: `output`, example: `\"fastq_dir\"`. ", + "default": "$id.$key.output_fastq" + }, + "output_fastqc": { + "type": "string", + "format": "path", + "description": "Reports directory produced by FastQC", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_fastqc\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.output_fastqc" + }, + "output_multiqc": { + "type": "string", + "format": "path", + "description": "Reports directory produced by MultiQC", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_multiqc\"`, direction: `output`, example: `\"reports_dir\"`. ", + "default": "$id.$key.output_multiqc" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/ingestion/demux/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/demux/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/demux/utils/integration_tests.config b/target/nextflow/workflows/ingestion/demux/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/demux/utils/labels.config b/target/nextflow/workflows/ingestion/demux/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/demux/utils/labels_ci.config b/target/nextflow/workflows/ingestion/demux/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/demux/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/ingestion/make_reference/.config.vsh.yaml b/target/nextflow/workflows/ingestion/make_reference/.config.vsh.yaml new file mode 100644 index 00000000..d9bc3e88 --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/.config.vsh.yaml @@ -0,0 +1,450 @@ +name: "make_reference" +namespace: "workflows/ingestion" +version: "main" +authors: +- name: "Angela Oliveira Pisco" + roles: + - "author" + info: + role: "Contributor" + links: + github: "aopisco" + orcid: "0000-0003-0142-2355" + linkedin: "aopisco" + organizations: + - name: "Insitro" + href: "https://insitro.com" + role: "Director of Computational Biology" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the reference." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genome_fasta" + description: "Reference genome fasta." + info: null + example: + - "https:/ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--transcriptome_gtf" + description: "Reference transcriptome annotation." + info: null + example: + - "https:/ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--ercc" + description: "ERCC sequence and annotation file." + info: null + example: + - "https:/assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "STAR Settings" + arguments: + - type: "integer" + name: "--star_genome_sa_index_nbases" + description: "Length (bases) of the SA pre-indexing string. Typically between\ + \ 10 and 15.\nLonger strings will use much more memory, but allow faster searches.\ + \ For small\ngenomes, the parameter {genomeSAindexNbases must be scaled down\ + \ to\nmin(14, log2(GenomeLength)/2 - 1).\n" + info: null + default: + - 14 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "BD Rhapsody Settings" + arguments: + - type: "string" + name: "--bdrhap_mitochondrial_contigs" + description: "Names of the Mitochondrial contigs in the provided Reference Genome.\ + \ Fragments originating from contigs other than these are\nidentified as 'nuclear\ + \ fragments' in the ATACseq analysis pipeline.\n" + info: null + default: + - "chrM" + - "chrMT" + - "M" + - "MT" + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--bdrhap_filtering_off" + description: "By default the input Transcript Annotation files are filtered based\ + \ on the gene_type/gene_biotype attribute. Only features \nhaving the following\ + \ attribute values are kept:\n\n - protein_coding\n - lncRNA \n - IG_LV_gene\n\ + \ - IG_V_gene\n - IG_V_pseudogene\n - IG_D_gene\n - IG_J_gene\n - IG_J_pseudogene\n\ + \ - IG_C_gene\n - IG_C_pseudogene\n - TR_V_gene\n - TR_V_pseudogene\n -\ + \ TR_D_gene\n - TR_J_gene\n - TR_J_pseudogene\n - TR_C_gene\n\n If you have\ + \ already pre-filtered the input Annotation files and/or wish to turn-off the\ + \ filtering, please set this option to True.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--bdrhap_wta_only_index" + description: "Build a WTA only index, otherwise builds a WTA + ATAC index." + info: null + direction: "input" + - type: "string" + name: "--bdrhap_extra_star_params" + description: "Additional parameters to pass to STAR when building the genome index.\ + \ Specify exactly like how you would on the command line." + info: null + example: + - "--limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Cellranger ARC options" + arguments: + - type: "file" + name: "--motifs_file" + description: "Path to file containing transcription factor motifs in JASPAR format." + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--non_nuclear_contigs" + description: "Name(s) of contig(s) that do not have any chromatin structure, for\ + \ example, \nmitochondria or plastids. These contigs are excluded from peak\ + \ calling since\nthe entire contig will be \"open\" due to a lack of chromatin\ + \ structure.\nLeave empty if there are no such contigs.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "string" + name: "--target" + description: "Which reference indices to generate." + info: null + default: + - "star" + required: false + choices: + - "cellranger" + - "cellranger_arc" + - "bd_rhapsody" + - "star" + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output_fasta" + description: "Output genome sequence fasta." + info: null + example: + - "genome_sequence.fa.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_gtf" + description: "Output transcriptome annotation gtf." + info: null + example: + - "transcriptome_annotation.gtf.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_cellranger" + description: "Output index" + info: null + example: + - "cellranger_index.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_cellranger_arc" + description: "Output index" + info: null + example: + - "cellranger_index_arc.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_bd_rhapsody" + description: "Output index" + info: null + example: + - "bdrhap_index.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_star" + description: "Output index" + info: null + example: + - "star_index.tar.gz" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Arguments" + arguments: + - type: "string" + name: "--subset_regex" + description: "Will subset the reference chromosomes using the given regex." + info: null + example: + - "(ERCC-00002|chr1)" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Build a transcriptomics reference into one of many formats.\n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "reference_gencodev41_chr1" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "reference/make_reference" + alias: "make_reference_component" + repository: + type: "local" +- name: "reference/build_bdrhap_reference" + repository: + type: "local" +- name: "reference/build_star_reference" + repository: + type: "local" +- name: "reference/build_cellranger_reference" + repository: + type: "local" +- name: "reference/build_cellranger_arc_reference" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/ingestion/make_reference/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/ingestion/make_reference" + executable: "target/nextflow/workflows/ingestion/make_reference/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/reference/make_reference" + - "target/nextflow/reference/build_bdrhap_reference" + - "target/nextflow/reference/build_star_reference" + - "target/nextflow/reference/build_cellranger_reference" + - "target/nextflow/reference/build_cellranger_arc_reference" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/ingestion/make_reference/main.nf b/target/nextflow/workflows/ingestion/make_reference/main.nf new file mode 100644 index 00000000..617b50ca --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/main.nf @@ -0,0 +1,3829 @@ +// make_reference main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Angela Oliveira Pisco (author) +// * Robrecht Cannoodt (author, maintainer) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "make_reference", + "namespace" : "workflows/ingestion", + "version" : "main", + "authors" : [ + { + "name" : "Angela Oliveira Pisco", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "github" : "aopisco", + "orcid" : "0000-0003-0142-2355", + "linkedin" : "aopisco" + }, + "organizations" : [ + { + "name" : "Insitro", + "href" : "https://insitro.com", + "role" : "Director of Computational Biology" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the reference.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genome_fasta", + "description" : "Reference genome fasta.", + "example" : [ + "https:/ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--transcriptome_gtf", + "description" : "Reference transcriptome annotation.", + "example" : [ + "https:/ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--ercc", + "description" : "ERCC sequence and annotation file.", + "example" : [ + "https:/assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "STAR Settings", + "arguments" : [ + { + "type" : "integer", + "name" : "--star_genome_sa_index_nbases", + "description" : "Length (bases) of the SA pre-indexing string. Typically between 10 and 15.\nLonger strings will use much more memory, but allow faster searches. For small\ngenomes, the parameter {genomeSAindexNbases must be scaled down to\nmin(14, log2(GenomeLength)/2 - 1).\n", + "default" : [ + 14 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "BD Rhapsody Settings", + "arguments" : [ + { + "type" : "string", + "name" : "--bdrhap_mitochondrial_contigs", + "description" : "Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are\nidentified as 'nuclear fragments' in the ATACseq analysis pipeline.\n", + "default" : [ + "chrM", + "chrMT", + "M", + "MT" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--bdrhap_filtering_off", + "description" : "By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features \nhaving the following attribute values are kept:\n\n - protein_coding\n - lncRNA \n - IG_LV_gene\n - IG_V_gene\n - IG_V_pseudogene\n - IG_D_gene\n - IG_J_gene\n - IG_J_pseudogene\n - IG_C_gene\n - IG_C_pseudogene\n - TR_V_gene\n - TR_V_pseudogene\n - TR_D_gene\n - TR_J_gene\n - TR_J_pseudogene\n - TR_C_gene\n\n If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--bdrhap_wta_only_index", + "description" : "Build a WTA only index, otherwise builds a WTA + ATAC index.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--bdrhap_extra_star_params", + "description" : "Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line.", + "example" : [ + "--limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Cellranger ARC options", + "arguments" : [ + { + "type" : "file", + "name" : "--motifs_file", + "description" : "Path to file containing transcription factor motifs in JASPAR format.", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--non_nuclear_contigs", + "description" : "Name(s) of contig(s) that do not have any chromatin structure, for example, \nmitochondria or plastids. These contigs are excluded from peak calling since\nthe entire contig will be \\"open\\" due to a lack of chromatin structure.\nLeave empty if there are no such contigs.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "string", + "name" : "--target", + "description" : "Which reference indices to generate.", + "default" : [ + "star" + ], + "required" : false, + "choices" : [ + "cellranger", + "cellranger_arc", + "bd_rhapsody", + "star" + ], + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_fasta", + "description" : "Output genome sequence fasta.", + "example" : [ + "genome_sequence.fa.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_gtf", + "description" : "Output transcriptome annotation gtf.", + "example" : [ + "transcriptome_annotation.gtf.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_cellranger", + "description" : "Output index", + "example" : [ + "cellranger_index.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_cellranger_arc", + "description" : "Output index", + "example" : [ + "cellranger_index_arc.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_bd_rhapsody", + "description" : "Output index", + "example" : [ + "bdrhap_index.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_star", + "description" : "Output index", + "example" : [ + "star_index.tar.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--subset_regex", + "description" : "Will subset the reference chromosomes using the given regex.", + "example" : [ + "(ERCC-00002|chr1)" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Build a transcriptomics reference into one of many formats.\n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/reference_gencodev41_chr1" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "reference/make_reference", + "alias" : "make_reference_component", + "repository" : { + "type" : "local" + } + }, + { + "name" : "reference/build_bdrhap_reference", + "repository" : { + "type" : "local" + } + }, + { + "name" : "reference/build_star_reference", + "repository" : { + "type" : "local" + } + }, + { + "name" : "reference/build_cellranger_reference", + "repository" : { + "type" : "local" + } + }, + { + "name" : "reference/build_cellranger_arc_reference", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/ingestion/make_reference/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/ingestion/make_reference", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { make_reference as make_reference_component_viashalias } from "${meta.resources_dir}/../../../../nextflow/reference/make_reference/main.nf" +make_reference_component = make_reference_component_viashalias.run(key: "make_reference_component") +include { build_bdrhap_reference } from "${meta.resources_dir}/../../../../nextflow/reference/build_bdrhap_reference/main.nf" +include { build_star_reference } from "${meta.resources_dir}/../../../../nextflow/reference/build_star_reference/main.nf" +include { build_cellranger_reference } from "${meta.resources_dir}/../../../../nextflow/reference/build_cellranger_reference/main.nf" +include { build_cellranger_arc_reference } from "${meta.resources_dir}/../../../../nextflow/reference/build_cellranger_arc_reference/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map{ id, state -> + // remove all fields starting with 'output_' + def newState = state.findAll{k, v -> !k.startsWith("output_")} + // make sure target is present + newState.target = newState.target ?: [] + [id, newState] + } + | make_reference_component.run( + fromState: [ + "input": "input", + "genome_fasta": "genome_fasta", + "transcriptome_gtf": "transcriptome_gtf", + "ercc": "ercc", + "output_fasta": "output_fasta", + "output_gtf": "output_gtf", + "subset_regex": "subset_regex" + ], + toState: [ + "output_fasta": "output_fasta", + "output_gtf": "output_gtf" + ] + ) + | build_cellranger_arc_reference.run( + runIf: { id, state -> + state.target.contains("cellranger_arc") + }, + fromState: [ + "genome_fasta": "output_fasta", + "annotation_gtf": "output_gtf", + "output": "output_cellranger_arc", + "motifs_file": "motifs_file", + "non_nuclear_contigs": "non_nuclear_contigs", + ], + toState: [ + "output_cellranger_arc": "output" + ], + ) + | build_cellranger_reference.run( + runIf: { id, state -> + state.target.contains("cellranger") + }, + fromState: [ + genome_fasta: "output_fasta", + transcriptome_gtf: "output_gtf" + ], + toState: [ + output_cellranger: "output" + ] + ) + | build_star_reference.run( + runIf: { id, state -> + state.target.contains("star") + }, + fromState: [ + genome_fasta: "output_fasta", + transcriptome_gtf: "output_gtf", + genomeSAindexNbases: "star_genome_sa_index_nbases" + ], + toState: [ + output_star: "output" + ] + ) + | build_bdrhap_reference.run( + runIf: { id, state -> + state.target.contains("bd_rhapsody") + }, + fromState: [ + genome_fasta: "output_fasta", + gtf: "output_gtf", + mitochondrial_contigs: "bdrhap_mitochondrial_contigs", + filtering_off: "bdrhap_filtering_off", + wta_only_index: "bdrhap_wta_only_index", + rna_only_index: "bdrhap_rna_only_index" + ], + toState: [ + output_bd_rhapsody: "reference_archive" + ] + ) + | setState([ + "output_fasta", + "output_gtf", + "output_cellranger", + "output_star", + "output_bd_rhapsody", + "output_cellranger_arc", + ]) + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/ingestion/make_reference/nextflow.config b/target/nextflow/workflows/ingestion/make_reference/nextflow.config new file mode 100644 index 00000000..962283cf --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/ingestion/make_reference' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Build a transcriptomics reference into one of many formats.\n' + author = 'Angela Oliveira Pisco, Robrecht Cannoodt, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/ingestion/make_reference/nextflow_labels.config b/target/nextflow/workflows/ingestion/make_reference/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/make_reference/nextflow_schema.json b/target/nextflow/workflows/ingestion/make_reference/nextflow_schema.json new file mode 100644 index 00000000..e69de29b diff --git a/target/nextflow/workflows/ingestion/make_reference/utils/errorstrat_ignore.config b/target/nextflow/workflows/ingestion/make_reference/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/make_reference/utils/integration_tests.config b/target/nextflow/workflows/ingestion/make_reference/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/ingestion/make_reference/utils/labels.config b/target/nextflow/workflows/ingestion/make_reference/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/ingestion/make_reference/utils/labels_ci.config b/target/nextflow/workflows/ingestion/make_reference/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/ingestion/make_reference/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/integration/bbknn_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/bbknn_leiden/.config.vsh.yaml new file mode 100644 index 00000000..4509fb0e --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/.config.vsh.yaml @@ -0,0 +1,389 @@ +name: "bbknn_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Mauro Saporita" + roles: + - "author" + info: + role: "Contributor" + links: + email: "maurosaporita@gmail.com" + github: "mauro-saporita" + linkedin: "mauro-saporita-930b06a5" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Lead Nextflow Developer" +- name: "Povilas Gibas" + roles: + - "author" + info: + role: "Contributor" + links: + email: "povilasgibas@gmail.com" + github: "PoGibas" + linkedin: "povilas-gibas" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Bioinformatician" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Bbknn" + arguments: + - type: "string" + name: "--obsm_input" + description: "The dimensionality reduction in `.obsm` to use for neighbour detection.\ + \ Defaults to X_pca." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch" + description: ".obs column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--uns_output" + description: "Mandatory .uns slot to store various neighbor output objects." + info: null + default: + - "bbknn_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "bbknn_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "bbknn_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_neighbors_within_batch" + description: "How many top neighbours to report for each batch; total number of\ + \ neighbours in the initial k-nearest-neighbours computation will be this number\ + \ times the number of batches." + info: null + default: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_pcs" + description: "How many dimensions (in case of PCA, principal components) to use\ + \ in the analysis." + info: null + default: + - 50 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_trim" + description: "Trim the neighbours of each cell to these many top connectivities.\ + \ May help with population independence and improve the tidiness of clustering.\ + \ The lower the value the more independent the individual populations, at the\ + \ cost of more conserved batch effect. If `None` (default), sets the parameter\ + \ value automatically to 10 times `neighbors_within_batch` times the number\ + \ of batches. Set to 0 to skip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering options" + arguments: + - type: "string" + name: "--obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'. \n" + info: null + default: + - "bbknn_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "UMAP options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_leiden_bbknn_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run bbknn followed by leiden clustering and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "cluster/leiden" + repository: + type: "local" +- name: "dimred/umap" + repository: + type: "local" +- name: "neighbors/bbknn" + repository: + type: "local" +- name: "metadata/move_obsm_to_obs" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/bbknn_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/bbknn_leiden" + executable: "target/nextflow/workflows/integration/bbknn_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/cluster/leiden" + - "target/nextflow/dimred/umap" + - "target/nextflow/neighbors/bbknn" + - "target/nextflow/metadata/move_obsm_to_obs" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/bbknn_leiden/main.nf b/target/nextflow/workflows/integration/bbknn_leiden/main.nf new file mode 100644 index 00000000..f42b2bf5 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/main.nf @@ -0,0 +1,3731 @@ +// bbknn_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Mauro Saporita (author) +// * Povilas Gibas (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "bbknn_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Mauro Saporita", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "maurosaporita@gmail.com", + "github" : "mauro-saporita", + "linkedin" : "mauro-saporita-930b06a5" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Lead Nextflow Developer" + } + ] + } + }, + { + "name" : "Povilas Gibas", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "povilasgibas@gmail.com", + "github" : "PoGibas", + "linkedin" : "povilas-gibas" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Bioinformatician" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Bbknn", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_input", + "description" : "The dimensionality reduction in `.obsm` to use for neighbour detection. Defaults to X_pca.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch", + "description" : ".obs column name discriminating between your batches.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--uns_output", + "description" : "Mandatory .uns slot to store various neighbor output objects.", + "default" : [ + "bbknn_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "bbknn_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "bbknn_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_neighbors_within_batch", + "description" : "How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches.", + "default" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_pcs", + "description" : "How many dimensions (in case of PCA, principal components) to use in the analysis.", + "default" : [ + 50 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_trim", + "description" : "Trim the neighbours of each cell to these many top connectivities. May help with population independence and improve the tidiness of clustering. The lower the value the more independent the individual populations, at the cost of more conserved batch effect. If `None` (default), sets the parameter value automatically to 10 times `neighbors_within_batch` times the number of batches. Set to 0 to skip.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'. \n", + "default" : [ + "bbknn_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "UMAP options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_leiden_bbknn_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run bbknn followed by leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "cluster/leiden", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dimred/umap", + "repository" : { + "type" : "local" + } + }, + { + "name" : "neighbors/bbknn", + "repository" : { + "type" : "local" + } + }, + { + "name" : "metadata/move_obsm_to_obs", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/bbknn_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/bbknn_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { leiden } from "${meta.resources_dir}/../../../../nextflow/cluster/leiden/main.nf" +include { umap } from "${meta.resources_dir}/../../../../nextflow/dimred/umap/main.nf" +include { bbknn } from "${meta.resources_dir}/../../../../nextflow/neighbors/bbknn/main.nf" +include { move_obsm_to_obs } from "${meta.resources_dir}/../../../../nextflow/metadata/move_obsm_to_obs/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + bbknn_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // compute bbknn graph + | bbknn.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_input", + "obs_batch": "obs_batch", + "uns_output": "uns_output", + "obsp_distances": "obsp_distances", + "obsp_connectivities": "obsp_connectivities", + "n_neighbors_within_batch": "n_neighbors_within_batch", + "n_pcs": "n_pcs", + "n_trim": "n_trim", + ], + toState: [ + "input": "output" + ] + ) + with_leiden_ch = bbknn_ch + | filter{id, state -> state.leiden_resolution} + // run leiden on the bbknn graph + | leiden.run( + fromState: [ + "input": "input", + "obsp_connectivities": "obsp_connectivities", + "obsm_name": "obs_cluster", + "resolution": "leiden_resolution", + "modality": "modality" + ], + toState: [ + "input": "output" + ] + ) + // move obsm leiden cluster dataframe to obs + | move_obsm_to_obs.run( + fromState: + [ + "input": "input", + "obsm_key": "obs_cluster", + "modality": "modality", + ], + toState: ["input": "output"] + ) + + without_leiden_ch = bbknn_ch + | filter{id, state -> !state.leiden_resolution} + + output_ch = with_leiden_ch.mix(without_leiden_ch) + // run umap on the bbknn graph + | umap.run( + fromState: { id, state -> + [ + "input": state.input, + "uns_neighbors": state.uns_output, + "obsm_output": state.obsm_umap, + "modality": state.modality, + "output": state.workflow_output, + "output_compression": "gzip" + ] + }, + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/bbknn_leiden/nextflow.config b/target/nextflow/workflows/integration/bbknn_leiden/nextflow.config new file mode 100644 index 00000000..70c6d221 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/bbknn_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run bbknn followed by leiden clustering and run umap on the result.' + author = 'Mauro Saporita, Povilas Gibas' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/bbknn_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/bbknn_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/bbknn_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/bbknn_leiden/nextflow_schema.json new file mode 100644 index 00000000..7bc1abb4 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/nextflow_schema.json @@ -0,0 +1,176 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "bbknn_leiden", + "description": "Run bbknn followed by leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "bbknn": { + "title": "Bbknn", + "type": "object", + "description": "No description", + "properties": { + "obsm_input": { + "type": "string", + "description": "The dimensionality reduction in `.obsm` to use for neighbour detection", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obs_batch": { + "type": "string", + "description": ".obs column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "uns_output": { + "type": "string", + "description": "Mandatory .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"bbknn_integration_neighbors\"`. ", + "default": "bbknn_integration_neighbors" + }, + "obsp_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"bbknn_integration_distances\"`. ", + "default": "bbknn_integration_distances" + }, + "obsp_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"bbknn_integration_connectivities\"`. ", + "default": "bbknn_integration_connectivities" + }, + "n_neighbors_within_batch": { + "type": "integer", + "description": "How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches.", + "help_text": "Type: `integer`, multiple: `False`, default: `3`. ", + "default": 3 + }, + "n_pcs": { + "type": "integer", + "description": "How many dimensions (in case of PCA, principal components) to use in the analysis.", + "help_text": "Type: `integer`, multiple: `False`, default: `50`. ", + "default": 50 + }, + "n_trim": { + "type": "integer", + "description": "Trim the neighbours of each cell to these many top connectivities", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "clustering options": { + "title": "Clustering options", + "type": "object", + "description": "No description", + "properties": { + "obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"bbknn_integration_leiden\"`. ", + "default": "bbknn_integration_leiden" + }, + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "UMAP options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_leiden_bbknn_umap\"`. ", + "default": "X_leiden_bbknn_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/bbknn" + }, + { + "$ref": "#/$defs/clustering options" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/bbknn_leiden/utils/errorstrat_ignore.config b/target/nextflow/workflows/integration/bbknn_leiden/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/integration/bbknn_leiden/utils/integration_tests.config b/target/nextflow/workflows/integration/bbknn_leiden/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/integration/bbknn_leiden/utils/labels.config b/target/nextflow/workflows/integration/bbknn_leiden/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/bbknn_leiden/utils/labels_ci.config b/target/nextflow/workflows/integration/bbknn_leiden/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/integration/bbknn_leiden/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/integration/harmony_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/harmony_leiden/.config.vsh.yaml new file mode 100644 index 00000000..91f1fff7 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/.config.vsh.yaml @@ -0,0 +1,359 @@ +name: "harmony_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation" + arguments: + - type: "string" + name: "--uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "harmonypy_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "harmonypy_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "harmonypy_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Harmony integration options" + arguments: + - type: "string" + name: "--embedding" + description: "Embedding to use as input" + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_integrated" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_pca_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_covariates" + description: "The .obs field(s) that define the covariate(s) to regress out." + info: null + example: + - "batch" + - "sample" + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "double" + name: "--theta" + description: "Diversity clustering penalty parameter. Can be set as a single value\ + \ for all batch observations or as multiple values, one for each observation\ + \ in the batches defined by --obs_covariates. theta=0 does not encourage any\ + \ diversity. Larger values of theta\nresult in more diverse clusters.\"\n" + info: null + default: + - 2.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Clustering options" + arguments: + - type: "string" + name: "--obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'.\n" + info: null + default: + - "harmony_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_leiden_harmony_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run harmony integration followed by neighbour calculations, leiden clustering\ + \ and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "integrate/harmonypy" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/harmony_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/harmony_leiden" + executable: "target/nextflow/workflows/integration/harmony_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/integrate/harmonypy" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/harmony_leiden/main.nf b/target/nextflow/workflows/integration/harmony_leiden/main.nf new file mode 100644 index 00000000..e79a08a4 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/main.nf @@ -0,0 +1,3658 @@ +// harmony_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "harmony_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation", + "arguments" : [ + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "harmonypy_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "harmonypy_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "harmonypy_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Harmony integration options", + "arguments" : [ + { + "type" : "string", + "name" : "--embedding", + "description" : "Embedding to use as input", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_integrated", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_pca_integrated" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_covariates", + "description" : "The .obs field(s) that define the covariate(s) to regress out.", + "example" : [ + "batch", + "sample" + ], + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--theta", + "description" : "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta\nresult in more diverse clusters.\\"\n", + "default" : [ + 2.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'.\n", + "default" : [ + "harmony_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_leiden_harmony_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run harmony integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "integrate/harmonypy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/harmony_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/harmony_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { harmonypy } from "${meta.resources_dir}/../../../../nextflow/integrate/harmonypy/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // run harmonypy + | harmonypy.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "embedding", + "obs_covariates": "obs_covariates", + "obsm_output": "obsm_integrated", + "theta": "theta" + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_integrated", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/harmony_leiden/nextflow.config b/target/nextflow/workflows/integration/harmony_leiden/nextflow.config new file mode 100644 index 00000000..17a7d910 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/harmony_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run harmony integration followed by neighbour calculations, leiden clustering and run umap on the result.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/harmony_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/harmony_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/harmony_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/harmony_leiden/nextflow_schema.json new file mode 100644 index 00000000..1cb993ba --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/nextflow_schema.json @@ -0,0 +1,188 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "harmony_leiden", + "description": "Run harmony integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "neighbour calculation": { + "title": "Neighbour calculation", + "type": "object", + "description": "No description", + "properties": { + "uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"harmonypy_integration_neighbors\"`. ", + "default": "harmonypy_integration_neighbors" + }, + "obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"harmonypy_integration_distances\"`. ", + "default": "harmonypy_integration_distances" + }, + "obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"harmonypy_integration_connectivities\"`. ", + "default": "harmonypy_integration_connectivities" + } + } + }, + "harmony integration options": { + "title": "Harmony integration options", + "type": "object", + "description": "No description", + "properties": { + "embedding": { + "type": "string", + "description": "Embedding to use as input", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obsm_integrated": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca_integrated\"`. ", + "default": "X_pca_integrated" + }, + "obs_covariates": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The .obs field(s) that define the covariate(s) to regress out.", + "help_text": "Type: `string`, multiple: `True`, required, example: `[\"batch\";\"sample\"]`. " + }, + "theta": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Diversity clustering penalty parameter", + "help_text": "Type: `double`, multiple: `True`, default: `[2.0]`. ", + "default": [ + 2.0 + ] + } + } + }, + "clustering options": { + "title": "Clustering options", + "type": "object", + "description": "No description", + "properties": { + "obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"harmony_integration_leiden\"`. ", + "default": "harmony_integration_leiden" + }, + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_leiden_harmony_umap\"`. ", + "default": "X_leiden_harmony_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/neighbour calculation" + }, + { + "$ref": "#/$defs/harmony integration options" + }, + { + "$ref": "#/$defs/clustering options" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/harmony_leiden/utils/errorstrat_ignore.config b/target/nextflow/workflows/integration/harmony_leiden/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/integration/harmony_leiden/utils/integration_tests.config b/target/nextflow/workflows/integration/harmony_leiden/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/integration/harmony_leiden/utils/labels.config b/target/nextflow/workflows/integration/harmony_leiden/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/harmony_leiden/utils/labels_ci.config b/target/nextflow/workflows/integration/harmony_leiden/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/integration/harmony_leiden/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/integration/scanorama_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/scanorama_leiden/.config.vsh.yaml new file mode 100644 index 00000000..7e8b66ca --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/.config.vsh.yaml @@ -0,0 +1,411 @@ +name: "scanorama_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Mauro Saporita" + roles: + - "author" + info: + role: "Contributor" + links: + email: "maurosaporita@gmail.com" + github: "mauro-saporita" + linkedin: "mauro-saporita-930b06a5" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Lead Nextflow Developer" +- name: "Povilas Gibas" + roles: + - "author" + info: + role: "Contributor" + links: + email: "povilasgibas@gmail.com" + github: "PoGibas" + linkedin: "povilas-gibas" + organizations: + - name: "Ardigen" + href: "https://ardigen.com" + role: "Bioinformatician" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation" + arguments: + - type: "string" + name: "--uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "scanorama_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "scanorama_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "scanorama_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Scanorama integration options" + arguments: + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: ".obsm slot that points to embedding to run scanorama on." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "The name of the field in adata.obsm where the integrated embeddings\ + \ will be stored after running this function. Defaults to X_scanorama." + info: null + default: + - "X_scanorama" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--knn" + description: "Number of nearest neighbors to use for matching." + info: null + default: + - 20 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size used in the alignment vector computation. Useful\ + \ when integrating very large (>100k samples) datasets. Set to large value that\ + \ runs within available memory." + info: null + default: + - 5000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--sigma" + description: "Correction smoothing parameter on Gaussian kernel." + info: null + default: + - 15.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--approx" + description: "Use approximate nearest neighbors with Python annoy; greatly speeds\ + \ up matching runtime." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--alpha" + description: "Alignment score minimum cutoff" + info: null + default: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering options" + arguments: + - type: "string" + name: "--obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the\nresolutions specified\ + \ in '--leiden_resolution'.\n" + info: null + default: + - "scanorama_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_leiden_scanorama_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run scanorama integration followed by neighbour calculations, leiden\ + \ clustering and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "integrate/scanorama" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/scanorama_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/scanorama_leiden" + executable: "target/nextflow/workflows/integration/scanorama_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/integrate/scanorama" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/scanorama_leiden/main.nf b/target/nextflow/workflows/integration/scanorama_leiden/main.nf new file mode 100644 index 00000000..ad9b1501 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/main.nf @@ -0,0 +1,3729 @@ +// scanorama_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Mauro Saporita (author) +// * Povilas Gibas (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scanorama_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Mauro Saporita", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "maurosaporita@gmail.com", + "github" : "mauro-saporita", + "linkedin" : "mauro-saporita-930b06a5" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Lead Nextflow Developer" + } + ] + } + }, + { + "name" : "Povilas Gibas", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "links" : { + "email" : "povilasgibas@gmail.com", + "github" : "PoGibas", + "linkedin" : "povilas-gibas" + }, + "organizations" : [ + { + "name" : "Ardigen", + "href" : "https://ardigen.com", + "role" : "Bioinformatician" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation", + "arguments" : [ + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "scanorama_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "scanorama_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "scanorama_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Scanorama integration options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_batch", + "description" : "Column name discriminating between your batches.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : ".obsm slot that points to embedding to run scanorama on.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "The name of the field in adata.obsm where the integrated embeddings will be stored after running this function. Defaults to X_scanorama.", + "default" : [ + "X_scanorama" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--knn", + "description" : "Number of nearest neighbors to use for matching.", + "default" : [ + 20 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory.", + "default" : [ + 5000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--sigma", + "description" : "Correction smoothing parameter on Gaussian kernel.", + "default" : [ + 15.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--approx", + "description" : "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--alpha", + "description" : "Alignment score minimum cutoff", + "default" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the\nresolutions specified in '--leiden_resolution'.\n", + "default" : [ + "scanorama_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_leiden_scanorama_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run scanorama integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "integrate/scanorama", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/scanorama_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/scanorama_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { scanorama } from "${meta.resources_dir}/../../../../nextflow/integrate/scanorama/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | scanorama.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_input", + "obs_batch": "obs_batch", + "obsm_output": "obsm_output", + "modality": "modality", + "batch_size": "batch_size", + "sigma": "sigma", + "approx": "approx", + "alpha": "alpha", + "knn": "knn", + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_output", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/scanorama_leiden/nextflow.config b/target/nextflow/workflows/integration/scanorama_leiden/nextflow.config new file mode 100644 index 00000000..1ac06f08 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/scanorama_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run scanorama integration followed by neighbour calculations, leiden clustering and run umap on the result.' + author = 'Mauro Saporita, Povilas Gibas' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/scanorama_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/scanorama_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/scanorama_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/scanorama_leiden/nextflow_schema.json new file mode 100644 index 00000000..ae01265d --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/nextflow_schema.json @@ -0,0 +1,205 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scanorama_leiden", + "description": "Run scanorama integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "neighbour calculation": { + "title": "Neighbour calculation", + "type": "object", + "description": "No description", + "properties": { + "uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanorama_integration_neighbors\"`. ", + "default": "scanorama_integration_neighbors" + }, + "obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanorama_integration_distances\"`. ", + "default": "scanorama_integration_distances" + }, + "obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanorama_integration_connectivities\"`. ", + "default": "scanorama_integration_connectivities" + } + } + }, + "scanorama integration options": { + "title": "Scanorama integration options", + "type": "object", + "description": "No description", + "properties": { + "obs_batch": { + "type": "string", + "description": "Column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "obsm_input": { + "type": "string", + "description": ".obsm slot that points to embedding to run scanorama on.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "obsm_output": { + "type": "string", + "description": "The name of the field in adata.obsm where the integrated embeddings will be stored after running this function", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scanorama\"`. ", + "default": "X_scanorama" + }, + "knn": { + "type": "integer", + "description": "Number of nearest neighbors to use for matching.", + "help_text": "Type: `integer`, multiple: `False`, default: `20`. ", + "default": 20 + }, + "batch_size": { + "type": "integer", + "description": "The batch size used in the alignment vector computation", + "help_text": "Type: `integer`, multiple: `False`, default: `5000`. ", + "default": 5000 + }, + "sigma": { + "type": "number", + "description": "Correction smoothing parameter on Gaussian kernel.", + "help_text": "Type: `double`, multiple: `False`, default: `15.0`. ", + "default": 15.0 + }, + "approx": { + "type": "boolean", + "description": "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "alpha": { + "type": "number", + "description": "Alignment score minimum cutoff", + "help_text": "Type: `double`, multiple: `False`, default: `0.1`. ", + "default": 0.1 + } + } + }, + "clustering options": { + "title": "Clustering options", + "type": "object", + "description": "No description", + "properties": { + "obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"scanorama_integration_leiden\"`. ", + "default": "scanorama_integration_leiden" + }, + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_leiden_scanorama_umap\"`. ", + "default": "X_leiden_scanorama_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/neighbour calculation" + }, + { + "$ref": "#/$defs/scanorama integration options" + }, + { + "$ref": "#/$defs/clustering options" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/scanorama_leiden/utils/errorstrat_ignore.config b/target/nextflow/workflows/integration/scanorama_leiden/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/integration/scanorama_leiden/utils/integration_tests.config b/target/nextflow/workflows/integration/scanorama_leiden/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/integration/scanorama_leiden/utils/labels.config b/target/nextflow/workflows/integration/scanorama_leiden/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/scanorama_leiden/utils/labels_ci.config b/target/nextflow/workflows/integration/scanorama_leiden/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/integration/scanorama_leiden/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/integration/scgpt_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/scgpt_leiden/.config.vsh.yaml new file mode 100644 index 00000000..09651e01 --- /dev/null +++ b/target/nextflow/workflows/integration/scgpt_leiden/.config.vsh.yaml @@ -0,0 +1,463 @@ +name: "scgpt_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + - "author" + info: + role: "Core Team Member" + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Elizabeth Mlynarski" + roles: + - "author" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Principal Scientist Computational Genomics" +- name: "Weiwei Schultz" + roles: + - "contributor" + info: + role: "Contributor" + organizations: + - name: "Janssen R&D US" + role: "Associate Director Data Sciences" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the input file." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality from the input MuData file to process.\n" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--input_layer" + description: "The layer of the input dataset to process if .X is not to be used.\ + \ Should contain log normalized counts.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_gene_names" + description: "The name of the adata var column containing gene names; when no\ + \ gene_name_layer is provided, the var index will be used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_batch_label" + description: "The name of the adata obs column containing the batch labels.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Model" + arguments: + - type: "file" + name: "--model" + description: "Path to scGPT model file.\n" + info: null + example: + - "resources_test/scgpt/best_model.pt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_vocab" + description: "Path to scGPT model vocabulary file.\n" + info: null + example: + - "resources_test/scgpt/vocab.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--model_config" + description: "Path to scGPT model config file.\n" + info: null + example: + - "args.json" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--finetuned_checkpoints_key" + description: "Key in the model file containing the pretrained checkpoints. Only\ + \ relevant for fine-tuned models.\n" + info: null + example: + - "model_state_dict" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Output file path" + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_integrated" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scgpt" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Padding arguments" + arguments: + - type: "string" + name: "--pad_token" + description: "Token used for padding.\n" + info: null + default: + - "" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--pad_value" + description: "The value of the padding token.\n" + info: null + default: + - -2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "HVG subset arguments" + arguments: + - type: "integer" + name: "--n_hvg" + description: "Number of highly variable genes to subset for.\n" + info: null + default: + - 1200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--hvg_flavor" + description: "Method to be used for identifying highly variable genes. \nNote\ + \ that the default for this workflow (`cell_ranger`) is not the default method\ + \ for scanpy hvg detection (`seurat`).\n" + info: null + default: + - "cell_ranger" + required: false + choices: + - "cell_ranger" + - "seurat" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Tokenization arguments" + arguments: + - type: "integer" + name: "--max_seq_len" + description: "The maximum sequence length of the tokenized data. Defaults to the\ + \ number of features if not provided.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Embedding arguments" + arguments: + - type: "boolean" + name: "--dsbn" + description: "Apply domain-specific batch normalization\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--batch_size" + description: "The batch size to be used for embedding inference.\n" + info: null + default: + - 64 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Binning arguments" + arguments: + - type: "integer" + name: "--n_input_bins" + description: "The number of bins to discretize the data into; When no value is\ + \ provided, data won't be binned.\n" + info: null + default: + - 51 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--seed" + description: "Seed for random number generation used for binning. If not set,\ + \ no seed is used.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering arguments" + arguments: + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run scGPT integration (cell embedding generation) followed by neighbour\ + \ calculations, leiden clustering and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "scgpt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "scgpt/cross_check_genes" + repository: + type: "local" +- name: "scgpt/binning" + repository: + type: "local" +- name: "feature_annotation/highly_variable_features_scanpy" + repository: + type: "local" +- name: "scgpt/pad_tokenize" + repository: + type: "local" +- name: "scgpt/embedding" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/scgpt_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/scgpt_leiden" + executable: "target/nextflow/workflows/integration/scgpt_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/scgpt/cross_check_genes" + - "target/nextflow/scgpt/binning" + - "target/nextflow/feature_annotation/highly_variable_features_scanpy" + - "target/nextflow/scgpt/pad_tokenize" + - "target/nextflow/scgpt/embedding" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/scgpt_leiden/main.nf b/target/nextflow/workflows/integration/scgpt_leiden/main.nf new file mode 100644 index 00000000..04d4e5f2 --- /dev/null +++ b/target/nextflow/workflows/integration/scgpt_leiden/main.nf @@ -0,0 +1,3902 @@ +// scgpt_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer, author) +// * Elizabeth Mlynarski (author) +// * Weiwei Schultz (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scgpt_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer", + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Elizabeth Mlynarski", + "roles" : [ + "author" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Principal Scientist Computational Genomics" + } + ] + } + }, + { + "name" : "Weiwei Schultz", + "roles" : [ + "contributor" + ], + "info" : { + "role" : "Contributor", + "organizations" : [ + { + "name" : "Janssen R&D US", + "role" : "Associate Director Data Sciences" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the input file.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality from the input MuData file to process.\n", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--input_layer", + "description" : "The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_gene_names", + "description" : "The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_batch_label", + "description" : "The name of the adata obs column containing the batch labels.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Model", + "arguments" : [ + { + "type" : "file", + "name" : "--model", + "description" : "Path to scGPT model file.\n", + "example" : [ + "resources_test/scgpt/best_model.pt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_vocab", + "description" : "Path to scGPT model vocabulary file.\n", + "example" : [ + "resources_test/scgpt/vocab.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--model_config", + "description" : "Path to scGPT model config file.\n", + "example" : [ + "args.json" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--finetuned_checkpoints_key", + "description" : "Key in the model file containing the pretrained checkpoints. Only relevant for fine-tuned models.\n", + "example" : [ + "model_state_dict" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Output file path", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_integrated", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_scgpt" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Padding arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--pad_token", + "description" : "Token used for padding.\n", + "default" : [ + "" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--pad_value", + "description" : "The value of the padding token.\n", + "default" : [ + -2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "HVG subset arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_hvg", + "description" : "Number of highly variable genes to subset for.\n", + "default" : [ + 1200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--hvg_flavor", + "description" : "Method to be used for identifying highly variable genes. \nNote that the default for this workflow (`cell_ranger`) is not the default method for scanpy hvg detection (`seurat`).\n", + "default" : [ + "cell_ranger" + ], + "required" : false, + "choices" : [ + "cell_ranger", + "seurat" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Tokenization arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--max_seq_len", + "description" : "The maximum sequence length of the tokenized data. Defaults to the number of features if not provided.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Embedding arguments", + "arguments" : [ + { + "type" : "boolean", + "name" : "--dsbn", + "description" : "Apply domain-specific batch normalization\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--batch_size", + "description" : "The batch size to be used for embedding inference.\n", + "default" : [ + 64 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Binning arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--n_input_bins", + "description" : "The number of bins to discretize the data into; When no value is provided, data won't be binned.\n", + "default" : [ + 51 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--seed", + "description" : "Seed for random number generation used for binning. If not set, no seed is used.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering arguments", + "arguments" : [ + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/scgpt" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "scgpt/cross_check_genes", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/binning", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/highly_variable_features_scanpy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/pad_tokenize", + "repository" : { + "type" : "local" + } + }, + { + "name" : "scgpt/embedding", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/scgpt_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/scgpt_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cross_check_genes } from "${meta.resources_dir}/../../../../nextflow/scgpt/cross_check_genes/main.nf" +include { binning } from "${meta.resources_dir}/../../../../nextflow/scgpt/binning/main.nf" +include { highly_variable_features_scanpy } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" +include { pad_tokenize } from "${meta.resources_dir}/../../../../nextflow/scgpt/pad_tokenize/main.nf" +include { embedding } from "${meta.resources_dir}/../../../../nextflow/scgpt/embedding/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Annotates the mudata object with highly variable genes. + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "layer": "input_layer", + "modality": "modality", + "n_top_features": "n_hvg", + "flavor": "hvg_flavor" + ], + args: ["var_name_filter": "scgpt_filter_with_hvg"], + toState: ["input": "output"] + ) + // Check whether the genes are part of the provided vocabulary. + | cross_check_genes.run( + fromState: [ + "input": "input", + "modality": "modality", + "vocab_file": "model_vocab", + "input_var_gene_names": "var_gene_names", + "output": "output", + "pad_token": "pad_token" + ], + args: [ + "var_input": "scgpt_filter_with_hvg", + "output_var_filter": "scgpt_cross_checked_genes" + ], + toState: [ + "input": "output" + ] + ) + // Bins the data into a fixed number of bins. + | binning.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "n_input_bins": "n_input_bins", + "output": "output" + ], + args: [ + "output_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes" + ], + toState: [ + "input": "output" + ] + ) + // Padding and tokenization of gene count values. + | pad_tokenize.run( + fromState: [ + "input": "input", + "modality": "modality", + "model_vocab": "model_vocab", + "var_gene_names": "var_gene_names", + "pad_token": "pad_token", + "pad_value": "pad_value", + "max_seq_len": "max_seq_len", + "output": "output" + ], + args: [ + "input_obsm_binned_counts": "binned_counts", + "var_input": "scgpt_cross_checked_genes", + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask" + ], + toState: [ + "input": "output" + ] + ) + // Generation of cell embedings from the tokenized gene counts values. + | embedding.run( + fromState: [ + "input": "input", + "modality": "modality", + "model": "model", + "model_vocab": "model_vocab", + "model_config": "model_config", + "var_gene_names": "var_gene_names", + "obs_batch_label": "obs_batch_label", + "pad_token": "pad_token", + "pad_value": "pad_value", + "dsbn": "dsbn", + "batch_size": "batch_size", + "obsm_embeddings": "obsm_integrated", + "finetuned_checkpoints_key": "finetuned_checkpoints_key", + "output": "output" + ], + args: [ + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask" + ], + toState: [ + "input": "output" + ] + ) + // Calculation of neighbors, leiden clustering and UMAP. + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_integrated", + "modality": "modality", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "output": "workflow_output", + "leiden_resolution": "leiden_resolution", + "obsm_umap": "obsm_integrated", + ], + toState: [ + "output": "output" + ], + args: [ + "uns_neighbors": "scGPT_integration_neighbors", + "obsp_neighbor_distances": "scGPT_integration_distances", + "obsp_neighbor_connectivities": "scGPT_integration_connectivities", + "obs_cluster": "scGPT_integration_leiden", + "obsm_umap": "X_scGPT_umap" + ] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/scgpt_leiden/nextflow.config b/target/nextflow/workflows/integration/scgpt_leiden/nextflow.config new file mode 100644 index 00000000..2a753241 --- /dev/null +++ b/target/nextflow/workflows/integration/scgpt_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/scgpt_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result.' + author = 'Dorien Roosen, Elizabeth Mlynarski, Weiwei Schultz' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/scgpt_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/scgpt_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/scgpt_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/scgpt_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/scgpt_leiden/nextflow_schema.json new file mode 100644 index 00000000..a17146d7 --- /dev/null +++ b/target/nextflow/workflows/integration/scgpt_leiden/nextflow_schema.json @@ -0,0 +1,254 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scgpt_leiden", + "description": "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the input file.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality from the input MuData file to process.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "input_layer": { + "type": "string", + "description": "The layer of the input dataset to process if .X is not to be used", + "help_text": "Type: `string`, multiple: `False`. " + }, + "var_gene_names": { + "type": "string", + "description": "The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_batch_label": { + "type": "string", + "description": "The name of the adata obs column containing the batch labels.\n", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Output file path", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "obsm_integrated": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scgpt\"`. ", + "default": "X_scgpt" + } + } + }, + "model": { + "title": "Model", + "type": "object", + "description": "No description", + "properties": { + "model": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"resources_test/scgpt/best_model.pt\"`. " + }, + "model_vocab": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model vocabulary file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"resources_test/scgpt/vocab.json\"`. " + }, + "model_config": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to scGPT model config file.\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"args.json\"`. " + }, + "finetuned_checkpoints_key": { + "type": "string", + "description": "Key in the model file containing the pretrained checkpoints", + "help_text": "Type: `string`, multiple: `False`, example: `\"model_state_dict\"`. " + } + } + }, + "padding arguments": { + "title": "Padding arguments", + "type": "object", + "description": "No description", + "properties": { + "pad_token": { + "type": "string", + "description": "Token used for padding.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"\"`. ", + "default": "" + }, + "pad_value": { + "type": "integer", + "description": "The value of the padding token.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `-2`. ", + "default": -2 + } + } + }, + "hvg subset arguments": { + "title": "HVG subset arguments", + "type": "object", + "description": "No description", + "properties": { + "n_hvg": { + "type": "integer", + "description": "Number of highly variable genes to subset for.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `1200`. ", + "default": 1200 + }, + "hvg_flavor": { + "type": "string", + "description": "Method to be used for identifying highly variable genes", + "help_text": "Type: `string`, multiple: `False`, default: `\"cell_ranger\"`, choices: ``cell_ranger`, `seurat``. ", + "enum": [ + "cell_ranger", + "seurat" + ], + "default": "cell_ranger" + } + } + }, + "tokenization arguments": { + "title": "Tokenization arguments", + "type": "object", + "description": "No description", + "properties": { + "max_seq_len": { + "type": "integer", + "description": "The maximum sequence length of the tokenized data", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "embedding arguments": { + "title": "Embedding arguments", + "type": "object", + "description": "No description", + "properties": { + "dsbn": { + "type": "boolean", + "description": "Apply domain-specific batch normalization\n", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "batch_size": { + "type": "integer", + "description": "The batch size to be used for embedding inference.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `64`. ", + "default": 64 + } + } + }, + "binning arguments": { + "title": "Binning arguments", + "type": "object", + "description": "No description", + "properties": { + "n_input_bins": { + "type": "integer", + "description": "The number of bins to discretize the data into; When no value is provided, data won't be binned.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `51`. ", + "default": 51 + }, + "seed": { + "type": "integer", + "description": "Seed for random number generation used for binning", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "clustering arguments": { + "title": "Clustering arguments", + "type": "object", + "description": "No description", + "properties": { + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/model" + }, + { + "$ref": "#/$defs/padding arguments" + }, + { + "$ref": "#/$defs/hvg subset arguments" + }, + { + "$ref": "#/$defs/tokenization arguments" + }, + { + "$ref": "#/$defs/embedding arguments" + }, + { + "$ref": "#/$defs/binning arguments" + }, + { + "$ref": "#/$defs/clustering arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/scvi_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/scvi_leiden/.config.vsh.yaml new file mode 100644 index 00000000..43a84adc --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/.config.vsh.yaml @@ -0,0 +1,438 @@ +name: "scvi_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_model" + description: "Folder where the state of the trained model will be saved to." + info: null + example: + - "output_dir" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation" + arguments: + - type: "string" + name: "--uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "scvi_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "scvi_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "scvi_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Scvi integration options" + arguments: + - type: "string" + name: "--obs_batch" + description: "Column name discriminating between your batches." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_output" + description: "In which .obsm slot to store the resulting integrated embedding." + info: null + default: + - "X_scvi_integrated" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_input" + description: ".var column containing highly variable genes. By default, do not\ + \ subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--early_stopping" + description: "Whether to perform early stopping with respect to the validation\ + \ set." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--early_stopping_monitor" + description: "Metric logged during validation set epoch." + info: null + default: + - "elbo_validation" + required: false + choices: + - "elbo_validation" + - "reconstruction_loss_validation" + - "kl_local_validation" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--early_stopping_patience" + description: "Number of validation epochs with no improvement after which training\ + \ will be stopped." + info: null + default: + - 45 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--early_stopping_min_delta" + description: "Minimum change in the monitored quantity to qualify as an improvement,\ + \ i.e. an absolute change of less than min_delta, will count as no improvement." + info: null + default: + - 0.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset, defaults to (20000 / number\ + \ of cells) * 400 or 400; whichever is smallest." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--reduce_lr_on_plateau" + description: "Whether to monitor validation loss and reduce learning rate when\ + \ validation set `lr_scheduler_metric` plateaus." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_factor" + description: "Factor to reduce learning rate." + info: null + default: + - 0.6 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--lr_patience" + description: "Number of epochs with no improvement after which learning rate will\ + \ be reduced." + info: null + default: + - 30.0 + required: false + min: 0.0 + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering options" + arguments: + - type: "string" + name: "--obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'.\n" + info: null + default: + - "scvi_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_scvi_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run scvi integration followed by neighbour calculations, leiden clustering\ + \ and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "integrate/scvi" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/scvi_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/scvi_leiden" + executable: "target/nextflow/workflows/integration/scvi_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/integrate/scvi" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/scvi_leiden/main.nf b/target/nextflow/workflows/integration/scvi_leiden/main.nf new file mode 100644 index 00000000..efff87ff --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/main.nf @@ -0,0 +1,3752 @@ +// scvi_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "scvi_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_model", + "description" : "Folder where the state of the trained model will be saved to.", + "example" : [ + "output_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation", + "arguments" : [ + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "scvi_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "scvi_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "scvi_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Scvi integration options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_batch", + "description" : "Column name discriminating between your batches.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_output", + "description" : "In which .obsm slot to store the resulting integrated embedding.", + "default" : [ + "X_scvi_integrated" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : ".var column containing highly variable genes. By default, do not subset genes.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--early_stopping", + "description" : "Whether to perform early stopping with respect to the validation set.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--early_stopping_monitor", + "description" : "Metric logged during validation set epoch.", + "default" : [ + "elbo_validation" + ], + "required" : false, + "choices" : [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--early_stopping_patience", + "description" : "Number of validation epochs with no improvement after which training will be stopped.", + "default" : [ + 45 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--early_stopping_min_delta", + "description" : "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.", + "default" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--reduce_lr_on_plateau", + "description" : "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_factor", + "description" : "Factor to reduce learning rate.", + "default" : [ + 0.6 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--lr_patience", + "description" : "Number of epochs with no improvement after which learning rate will be reduced.", + "default" : [ + 30.0 + ], + "required" : false, + "min" : 0.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'.\n", + "default" : [ + "scvi_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_scvi_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run scvi integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "integrate/scvi", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/scvi_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/scvi_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { scvi } from "${meta.resources_dir}/../../../../nextflow/integrate/scvi/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | scvi.run( + fromState: [ + "input": "input", + "obs_batch": "obs_batch", + "obsm_output": "obsm_output", + "var_input": "var_input", + "early_stopping": "early_stopping", + "early_stopping_monitor": "early_stopping_monitor", + "early_stopping_patience": "early_stopping_patience", + "early_stopping_min_delta": "early_stopping_min_delta", + "max_epochs": "max_epochs", + "reduce_lr_on_plateau": "reduce_lr_on_plateau", + "lr_factor": "lr_factor", + "lr_patience": "lr_patience", + "output_model": "output_model", + "modality": "modality", + "input_layer": "layer", + ], + toState: [ + "input": "output", + "output_model": "output_model" + ], + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "obsm_output", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "leiden_resolution": "leiden_resolution", + "obs_cluster": "obs_cluster", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"] + ) + | setState(["output", "output_model"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/scvi_leiden/nextflow.config b/target/nextflow/workflows/integration/scvi_leiden/nextflow.config new file mode 100644 index 00000000..5f429823 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/scvi_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run scvi integration followed by neighbour calculations, leiden clustering and run umap on the result.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/scvi_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/scvi_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/scvi_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/scvi_leiden/nextflow_schema.json new file mode 100644 index 00000000..97694ba7 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/nextflow_schema.json @@ -0,0 +1,230 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "scvi_leiden", + "description": "Run scvi integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "output_model": { + "type": "string", + "format": "path", + "description": "Folder where the state of the trained model will be saved to.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_model\"`, direction: `output`, example: `\"output_dir\"`. ", + "default": "$id.$key.output_model" + } + } + }, + "neighbour calculation": { + "title": "Neighbour calculation", + "type": "object", + "description": "No description", + "properties": { + "uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scvi_integration_neighbors\"`. ", + "default": "scvi_integration_neighbors" + }, + "obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scvi_integration_distances\"`. ", + "default": "scvi_integration_distances" + }, + "obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scvi_integration_connectivities\"`. ", + "default": "scvi_integration_connectivities" + } + } + }, + "scvi integration options": { + "title": "Scvi integration options", + "type": "object", + "description": "No description", + "properties": { + "obs_batch": { + "type": "string", + "description": "Column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the resulting integrated embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scvi_integrated\"`. ", + "default": "X_scvi_integrated" + }, + "var_input": { + "type": "string", + "description": ".var column containing highly variable genes", + "help_text": "Type: `string`, multiple: `False`. " + }, + "early_stopping": { + "type": "boolean", + "description": "Whether to perform early stopping with respect to the validation set.", + "help_text": "Type: `boolean`, multiple: `False`. " + }, + "early_stopping_monitor": { + "type": "string", + "description": "Metric logged during validation set epoch.", + "help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ", + "enum": [ + "elbo_validation", + "reconstruction_loss_validation", + "kl_local_validation" + ], + "default": "elbo_validation" + }, + "early_stopping_patience": { + "type": "integer", + "description": "Number of validation epochs with no improvement after which training will be stopped.", + "help_text": "Type: `integer`, multiple: `False`, default: `45`. ", + "default": 45 + }, + "early_stopping_min_delta": { + "type": "number", + "description": "Minimum change in the monitored quantity to qualify as an improvement, i.e", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "reduce_lr_on_plateau": { + "type": "boolean", + "description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "lr_factor": { + "type": "number", + "description": "Factor to reduce learning rate.", + "help_text": "Type: `double`, multiple: `False`, default: `0.6`. ", + "default": 0.6 + }, + "lr_patience": { + "type": "number", + "description": "Number of epochs with no improvement after which learning rate will be reduced.", + "help_text": "Type: `double`, multiple: `False`, default: `30.0`. ", + "default": 30.0 + } + } + }, + "clustering options": { + "title": "Clustering options", + "type": "object", + "description": "No description", + "properties": { + "obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"scvi_integration_leiden\"`. ", + "default": "scvi_integration_leiden" + }, + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_scvi_umap\"`. ", + "default": "X_scvi_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/neighbour calculation" + }, + { + "$ref": "#/$defs/scvi integration options" + }, + { + "$ref": "#/$defs/clustering options" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/scvi_leiden/utils/errorstrat_ignore.config b/target/nextflow/workflows/integration/scvi_leiden/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/integration/scvi_leiden/utils/integration_tests.config b/target/nextflow/workflows/integration/scvi_leiden/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/integration/scvi_leiden/utils/labels.config b/target/nextflow/workflows/integration/scvi_leiden/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/scvi_leiden/utils/labels_ci.config b/target/nextflow/workflows/integration/scvi_leiden/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/integration/scvi_leiden/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/integration/totalvi_leiden/.config.vsh.yaml b/target/nextflow/workflows/integration/totalvi_leiden/.config.vsh.yaml new file mode 100644 index 00000000..a94f3499 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/.config.vsh.yaml @@ -0,0 +1,514 @@ +name: "totalvi_leiden" +namespace: "workflows/integration" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_modality" + description: "Which modality to process." + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference" + alternatives: + - "-r" + description: "Input h5mu file with reference data to train the TOTALVI model." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--reference_model_path" + description: "Directory with the reference model. If not exists, trained model\ + \ will be saved there" + info: null + default: + - "totalvi_model_reference" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--query_model_path" + description: "Directory, where the query model will be saved" + info: null + default: + - "totalvi_model_query" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "General TotalVI Options" + arguments: + - type: "string" + name: "--obs_batch" + description: ".Obs column name discriminating between your batches." + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_epochs" + description: "Number of passes through the dataset" + info: null + default: + - 400 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_query_epochs" + description: "Number of passes through the dataset, when fine-tuning model for\ + \ query" + info: null + default: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--weight_decay" + description: "Weight decay, when fine-tuning model for query" + info: null + default: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--force_retrain" + description: "If true, retrain the model and save it to reference_model_path" + info: null + direction: "input" + - type: "string" + name: "--var_input" + description: "Boolean .var column to subset data with (e.g. containing highly\ + \ variable genes). By default, do not subset genes." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "TotalVI integration options RNA" + arguments: + - type: "string" + name: "--rna_reference_modality" + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_obsm_output" + description: "In which .obsm slot to store the normalized RNA from TOTALVI." + info: null + default: + - "X_totalvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "TotalVI integration options ADT" + arguments: + - type: "string" + name: "--prot_reference_modality" + description: "Name of the modality containing proteins in the reference" + info: null + default: + - "prot" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_obsm_output" + description: "In which .obsm slot to store the normalized protein data from TOTALVI." + info: null + default: + - "X_totalvi" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation RNA" + arguments: + - type: "string" + name: "--rna_uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "totalvi_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "totalvi_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "totalvi_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation ADT" + arguments: + - type: "string" + name: "--prot_uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "totalvi_integration_neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "totalvi_integration_distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "totalvi_integration_connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering options RNA" + arguments: + - type: "string" + name: "--rna_obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'.\n" + info: null + default: + - "totalvi_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Clustering options ADT" + arguments: + - type: "string" + name: "--prot_obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'.\n" + info: null + default: + - "totalvi_integration_leiden" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--prot_leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_totalvi_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run totalVI integration followed by neighbour calculations, leiden clustering\ + \ and run umap on the result." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3_mms.h5mu" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "integrate/totalvi" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/integration/totalvi_leiden/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/integration/totalvi_leiden" + executable: "target/nextflow/workflows/integration/totalvi_leiden/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/integrate/totalvi" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/integration/totalvi_leiden/main.nf b/target/nextflow/workflows/integration/totalvi_leiden/main.nf new file mode 100644 index 00000000..5d6566af --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/main.nf @@ -0,0 +1,3874 @@ +// totalvi_leiden main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "totalvi_leiden", + "namespace" : "workflows/integration", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_modality", + "description" : "Which modality to process.", + "default" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference", + "alternatives" : [ + "-r" + ], + "description" : "Input h5mu file with reference data to train the TOTALVI model.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reference_model_path", + "description" : "Directory with the reference model. If not exists, trained model will be saved there", + "default" : [ + "totalvi_model_reference" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--query_model_path", + "description" : "Directory, where the query model will be saved", + "default" : [ + "totalvi_model_query" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "General TotalVI Options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_batch", + "description" : ".Obs column name discriminating between your batches.", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_epochs", + "description" : "Number of passes through the dataset", + "default" : [ + 400 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_query_epochs", + "description" : "Number of passes through the dataset, when fine-tuning model for query", + "default" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--weight_decay", + "description" : "Weight decay, when fine-tuning model for query", + "default" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--force_retrain", + "description" : "If true, retrain the model and save it to reference_model_path", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--var_input", + "description" : "Boolean .var column to subset data with (e.g. containing highly variable genes). By default, do not subset genes.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "TotalVI integration options RNA", + "arguments" : [ + { + "type" : "string", + "name" : "--rna_reference_modality", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_obsm_output", + "description" : "In which .obsm slot to store the normalized RNA from TOTALVI.", + "default" : [ + "X_totalvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "TotalVI integration options ADT", + "arguments" : [ + { + "type" : "string", + "name" : "--prot_reference_modality", + "description" : "Name of the modality containing proteins in the reference", + "default" : [ + "prot" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_obsm_output", + "description" : "In which .obsm slot to store the normalized protein data from TOTALVI.", + "default" : [ + "X_totalvi" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation RNA", + "arguments" : [ + { + "type" : "string", + "name" : "--rna_uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "totalvi_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "totalvi_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "totalvi_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation ADT", + "arguments" : [ + { + "type" : "string", + "name" : "--prot_uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "totalvi_integration_neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "totalvi_integration_distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "totalvi_integration_connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options RNA", + "arguments" : [ + { + "type" : "string", + "name" : "--rna_obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'.\n", + "default" : [ + "totalvi_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options ADT", + "arguments" : [ + { + "type" : "string", + "name" : "--prot_obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'.\n", + "default" : [ + "totalvi_integration_leiden" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--prot_leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_totalvi_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run totalVI integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "integrate/totalvi", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/integration/totalvi_leiden/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/integration/totalvi_leiden", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { totalvi } from "${meta.resources_dir}/../../../../nextflow/integrate/totalvi/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Avoid conflict with other output arguments + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | totalvi.run( + fromState: [ + "input": "input", + "input_layer": "layer", + "obs_batch": "obs_batch", + "query_modality": "modality", + "query_proteins_modality": "prot_modality", + "query_model_path": "query_model_path", + "obsm_normalized_rna_output": "rna_obsm_output", + "obsm_normalized_protein_output": "prot_obsm_output", + "reference_model_path": "reference_model_path", + "reference_modality": "rna_reference_modality", + "reference_proteins_modality": "prot_reference_modality", + "var_input": "var_input", + "force_retrain": "force_retrain", + "weight_decay": "weight_decay", + "max_epochs": "max_epochs", + "max_query_epochs": "max_query_epochs", + "reference": "reference" + ], + toState: [ + "input": "output", + "query_model_path": "query_model_path", + "reference_model_path": "reference_model_path", + ] + ) + | neighbors_leiden_umap.run( // For gene expression + key: "rna_neighbors_leiden_umap", + fromState: [ + "input": "input", + "modality": "modality", + "obsm_input": "rna_obsm_output", + "uns_neighbors": "rna_uns_neighbors", + "obsp_neighbor_distances": "rna_obsp_neighbor_distances", + "obsp_neighbor_connectivities": "rna_obsp_neighbor_connectivities", + "obs_cluster": "rna_obs_cluster", + "leiden_resolution": "rna_leiden_resolution", + "uns_neighbors": "rna_uns_neighbors", + "obsm_umap": "obsm_umap", + ], + toState: ["input": "output"], + ) + | neighbors_leiden_umap.run( // For ADT + key: "adt_neighbors_leiden_umap", + fromState: [ + "input": "input", + "modality": "prot_modality", + "obsm_input": "prot_obsm_output", + "uns_neighbors": "prot_uns_neighbors", + "obsp_neighbor_distances": "prot_obsp_neighbor_distances", + "obsp_neighbor_connectivities": "prot_obsp_neighbor_connectivities", + "obs_cluster": "prot_obs_cluster", + "leiden_resolution": "prot_leiden_resolution", + "uns_neighbors": "prot_uns_neighbors", + "obsm_umap": "obsm_umap", + "output": "workflow_output", + ], + toState: ["output": "output"], + ) + | setState(["output", "reference_model_path", "query_model_path"]) + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/integration/totalvi_leiden/nextflow.config b/target/nextflow/workflows/integration/totalvi_leiden/nextflow.config new file mode 100644 index 00000000..ec663cb0 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/integration/totalvi_leiden' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run totalVI integration followed by neighbour calculations, leiden clustering and run umap on the result.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/integration/totalvi_leiden/nextflow_labels.config b/target/nextflow/workflows/integration/totalvi_leiden/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/totalvi_leiden/nextflow_schema.json b/target/nextflow/workflows/integration/totalvi_leiden/nextflow_schema.json new file mode 100644 index 00000000..501d3338 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/nextflow_schema.json @@ -0,0 +1,317 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "totalvi_leiden", + "description": "Run totalVI integration followed by neighbour calculations, leiden clustering and run umap on the result.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "prot_modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot\"`. ", + "default": "prot" + }, + "reference": { + "type": "string", + "format": "path", + "exists": true, + "description": "Input h5mu file with reference data to train the TOTALVI model.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + }, + "reference_model_path": { + "type": "string", + "format": "path", + "description": "Directory with the reference model", + "help_text": "Type: `file`, multiple: `False`, default: `\"totalvi_model_reference\"`, direction: `output`. ", + "default": "totalvi_model_reference" + }, + "query_model_path": { + "type": "string", + "format": "path", + "description": "Directory, where the query model will be saved", + "help_text": "Type: `file`, multiple: `False`, default: `\"totalvi_model_query\"`, direction: `output`. ", + "default": "totalvi_model_query" + } + } + }, + "general totalvi options": { + "title": "General TotalVI Options", + "type": "object", + "description": "No description", + "properties": { + "obs_batch": { + "type": "string", + "description": ".Obs column name discriminating between your batches.", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "max_epochs": { + "type": "integer", + "description": "Number of passes through the dataset", + "help_text": "Type: `integer`, multiple: `False`, default: `400`. ", + "default": 400 + }, + "max_query_epochs": { + "type": "integer", + "description": "Number of passes through the dataset, when fine-tuning model for query", + "help_text": "Type: `integer`, multiple: `False`, default: `200`. ", + "default": 200 + }, + "weight_decay": { + "type": "number", + "description": "Weight decay, when fine-tuning model for query", + "help_text": "Type: `double`, multiple: `False`, default: `0.0`. ", + "default": 0.0 + }, + "force_retrain": { + "type": "boolean", + "description": "If true, retrain the model and save it to reference_model_path", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "var_input": { + "type": "string", + "description": "Boolean .var column to subset data with (e.g", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "totalvi integration options rna": { + "title": "TotalVI integration options RNA", + "type": "object", + "description": "No description", + "properties": { + "rna_reference_modality": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "rna_obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the normalized RNA from TOTALVI.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_totalvi\"`. ", + "default": "X_totalvi" + } + } + }, + "totalvi integration options adt": { + "title": "TotalVI integration options ADT", + "type": "object", + "description": "No description", + "properties": { + "prot_reference_modality": { + "type": "string", + "description": "Name of the modality containing proteins in the reference", + "help_text": "Type: `string`, multiple: `False`, default: `\"prot\"`. ", + "default": "prot" + }, + "prot_obsm_output": { + "type": "string", + "description": "In which .obsm slot to store the normalized protein data from TOTALVI.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_totalvi\"`. ", + "default": "X_totalvi" + } + } + }, + "neighbour calculation rna": { + "title": "Neighbour calculation RNA", + "type": "object", + "description": "No description", + "properties": { + "rna_uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_neighbors\"`. ", + "default": "totalvi_integration_neighbors" + }, + "rna_obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_distances\"`. ", + "default": "totalvi_integration_distances" + }, + "rna_obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_connectivities\"`. ", + "default": "totalvi_integration_connectivities" + } + } + }, + "neighbour calculation adt": { + "title": "Neighbour calculation ADT", + "type": "object", + "description": "No description", + "properties": { + "prot_uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_neighbors\"`. ", + "default": "totalvi_integration_neighbors" + }, + "prot_obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_distances\"`. ", + "default": "totalvi_integration_distances" + }, + "prot_obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_connectivities\"`. ", + "default": "totalvi_integration_connectivities" + } + } + }, + "clustering options rna": { + "title": "Clustering options RNA", + "type": "object", + "description": "No description", + "properties": { + "rna_obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_leiden\"`. ", + "default": "totalvi_integration_leiden" + }, + "rna_leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "clustering options adt": { + "title": "Clustering options ADT", + "type": "object", + "description": "No description", + "properties": { + "prot_obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`, default: `\"totalvi_integration_leiden\"`. ", + "default": "totalvi_integration_leiden" + }, + "prot_leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_totalvi_umap\"`. ", + "default": "X_totalvi_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/general totalvi options" + }, + { + "$ref": "#/$defs/totalvi integration options rna" + }, + { + "$ref": "#/$defs/totalvi integration options adt" + }, + { + "$ref": "#/$defs/neighbour calculation rna" + }, + { + "$ref": "#/$defs/neighbour calculation adt" + }, + { + "$ref": "#/$defs/clustering options rna" + }, + { + "$ref": "#/$defs/clustering options adt" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/integration/totalvi_leiden/utils/errorstrat_ignore.config b/target/nextflow/workflows/integration/totalvi_leiden/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/integration/totalvi_leiden/utils/integration_tests.config b/target/nextflow/workflows/integration/totalvi_leiden/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/integration/totalvi_leiden/utils/labels.config b/target/nextflow/workflows/integration/totalvi_leiden/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/integration/totalvi_leiden/utils/labels_ci.config b/target/nextflow/workflows/integration/totalvi_leiden/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/integration/totalvi_leiden/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/.config.vsh.yaml b/target/nextflow/workflows/multiomics/dimensionality_reduction/.config.vsh.yaml new file mode 100644 index 00000000..d3c89c61 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/.config.vsh.yaml @@ -0,0 +1,330 @@ +name: "dimensionality_reduction" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "use specified layer for expression values instead of the .X object\ + \ from the modality." + info: null + default: + - "log_normalized" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "PCA options" + arguments: + - type: "string" + name: "--obsm_pca" + description: "In which .obsm slot to store the resulting PCA embedding." + info: null + default: + - "X_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_pca_feature_selection" + description: "Column name in .var matrix that will be used to select which genes\ + \ to run the PCA on." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pca_loadings_varm_output" + description: "Name of the .varm key where the PCA loadings are stored.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pca_variance_uns_output" + description: "Name of the .uns key where the variance and variance ratio will\ + \ be stored as a map.\nThe map will contain two keys: variance and variance_ratio\ + \ respectively.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--pca_overwrite" + description: "Allow overwriting slots for PCA output." + info: null + direction: "input" +- name: "Neighbour calculation" + arguments: + - type: "string" + name: "--uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + default: + - "neighbors" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + default: + - "distances" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + default: + - "connectivities" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding." + info: null + default: + - "X_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Run calculations that output information required for most integration\ + \ methods: PCA, nearest neighbour and UMAP." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "concat_test_data" +info: + test_dependencies: + - name: "dimensionality_reduction_test" + namespace: "test_workflows/multiomics" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "dimred/pca" + repository: + type: "local" +- name: "workflows/multiomics/neighbors_leiden_umap" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/dimensionality_reduction/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/multiomics/dimensionality_reduction" + executable: "target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/dimred/pca" + - "target/nextflow/workflows/multiomics/neighbors_leiden_umap" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf b/target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf new file mode 100644 index 00000000..d7779308 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/main.nf @@ -0,0 +1,3627 @@ +// dimensionality_reduction main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "dimensionality_reduction", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "use specified layer for expression values instead of the .X object from the modality.", + "default" : [ + "log_normalized" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "PCA options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_pca", + "description" : "In which .obsm slot to store the resulting PCA embedding.", + "default" : [ + "X_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_pca_feature_selection", + "description" : "Column name in .var matrix that will be used to select which genes to run the PCA on.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pca_loadings_varm_output", + "description" : "Name of the .varm key where the PCA loadings are stored.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pca_variance_uns_output", + "description" : "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--pca_overwrite", + "description" : "Allow overwriting slots for PCA output.", + "direction" : "input" + } + ] + }, + { + "name" : "Neighbour calculation", + "arguments" : [ + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "default" : [ + "neighbors" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "default" : [ + "distances" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "default" : [ + "connectivities" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.", + "default" : [ + "X_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Run calculations that output information required for most integration methods: PCA, nearest neighbour and UMAP.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "dimensionality_reduction_test", + "namespace" : "test_workflows/multiomics" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "dimred/pca", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/neighbors_leiden_umap", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/dimensionality_reduction/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/multiomics/dimensionality_reduction", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { pca } from "${meta.resources_dir}/../../../../nextflow/dimred/pca/main.nf" +include { neighbors_leiden_umap } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | pca.run( + fromState: [ + "input": "input", + "obsm_output": "obsm_pca", + "var_input": "var_pca_feature_selection", + "modality": "modality", + "overwrite": "pca_overwrite", + "layer": "layer", + "varm_output": "pca_loadings_varm_output", + "uns_output": "pca_variance_uns_output", + ], + toState: ["input": "output"] + ) + | neighbors_leiden_umap.run( + fromState: [ + "input": "input", + "obsm_input": "obsm_pca", + "modality": "modality", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_distances": "obsp_neighbor_distances", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + "output": "workflow_output", + "obsm_umap": "obsm_umap", + ], + toState: ["output": "output"], + args: [ + "leiden_resolution": [] // disable leiden + ] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow.config new file mode 100644 index 00000000..556bce5e --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/dimensionality_reduction' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Run calculations that output information required for most integration methods: PCA, nearest neighbour and UMAP.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_labels.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_schema.json b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_schema.json new file mode 100644 index 00000000..d2416018 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/nextflow_schema.json @@ -0,0 +1,157 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "dimensionality_reduction", + "description": "Run calculations that output information required for most integration methods: PCA, nearest neighbour and UMAP.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "use specified layer for expression values instead of the .X object from the modality.", + "help_text": "Type: `string`, multiple: `False`, default: `\"log_normalized\"`. ", + "default": "log_normalized" + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "pca options": { + "title": "PCA options", + "type": "object", + "description": "No description", + "properties": { + "obsm_pca": { + "type": "string", + "description": "In which .obsm slot to store the resulting PCA embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ", + "default": "X_pca" + }, + "var_pca_feature_selection": { + "type": "string", + "description": "Column name in .var matrix that will be used to select which genes to run the PCA on.", + "help_text": "Type: `string`, multiple: `False`. " + }, + "pca_loadings_varm_output": { + "type": "string", + "description": "Name of the .varm key where the PCA loadings are stored.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "pca_variance_uns_output": { + "type": "string", + "description": "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "pca_overwrite": { + "type": "boolean", + "description": "Allow overwriting slots for PCA output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "neighbour calculation": { + "title": "Neighbour calculation", + "type": "object", + "description": "No description", + "properties": { + "uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ", + "default": "neighbors" + }, + "obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"distances\"`. ", + "default": "distances" + }, + "obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ", + "default": "connectivities" + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.", + "help_text": "Type: `string`, multiple: `False`, default: `\"X_umap\"`. ", + "default": "X_umap" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/pca options" + }, + { + "$ref": "#/$defs/neighbour calculation" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/errorstrat_ignore.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/integration_tests.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels_ci.config b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/multiomics/dimensionality_reduction/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/.config.vsh.yaml b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/.config.vsh.yaml new file mode 100644 index 00000000..7c630ff5 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/.config.vsh.yaml @@ -0,0 +1,288 @@ +name: "neighbors_leiden_umap" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsm_input" + description: "The key of the embedding to use as input." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Neighbour calculation" + arguments: + - type: "string" + name: "--uns_neighbors" + description: "In which .uns slot to store various neighbor output objects." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_distances" + description: "In which .obsp slot to store the distance matrix between the resulting\ + \ neighbors." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obsp_neighbor_connectivities" + description: "In which .obsp slot to store the connectivities matrix between the\ + \ resulting neighbors." + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Clustering options" + arguments: + - type: "string" + name: "--obs_cluster" + description: "Prefix for the .obs keys under which to add the cluster labels.\ + \ Newly created columns in .obs will \nbe created from the specified value for\ + \ '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions\ + \ specified in '--leiden_resolution'.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--leiden_resolution" + description: "Control the coarseness of the clustering. Higher values lead to\ + \ more clusters." + info: null + default: + - 1.0 + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Umap options" + arguments: + - type: "string" + name: "--obsm_umap" + description: "In which .obsm slot to store the resulting UMAP embedding.\nWhen\ + \ not specified, UMAP will not be executed.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Performs neighborhood search, leiden clustering and run umap on an integrated\ + \ embedding." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "cluster/leiden" + repository: + type: "local" +- name: "dimred/umap" + repository: + type: "local" +- name: "neighbors/find_neighbors" + repository: + type: "local" +- name: "metadata/move_obsm_to_obs" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/neighbors_leiden_umap/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/multiomics/neighbors_leiden_umap" + executable: "target/nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/cluster/leiden" + - "target/nextflow/dimred/umap" + - "target/nextflow/neighbors/find_neighbors" + - "target/nextflow/metadata/move_obsm_to_obs" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf new file mode 100644 index 00000000..251a279a --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/main.nf @@ -0,0 +1,3594 @@ +// neighbors_leiden_umap main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "neighbors_leiden_umap", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsm_input", + "description" : "The key of the embedding to use as input.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Neighbour calculation", + "arguments" : [ + { + "type" : "string", + "name" : "--uns_neighbors", + "description" : "In which .uns slot to store various neighbor output objects.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_distances", + "description" : "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obsp_neighbor_connectivities", + "description" : "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Clustering options", + "arguments" : [ + { + "type" : "string", + "name" : "--obs_cluster", + "description" : "Prefix for the .obs keys under which to add the cluster labels. Newly created columns in .obs will \nbe created from the specified value for '--obs_cluster' suffixed with an underscore and one of the resolutions\nresolutions specified in '--leiden_resolution'.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--leiden_resolution", + "description" : "Control the coarseness of the clustering. Higher values lead to more clusters.", + "default" : [ + 1.0 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Umap options", + "arguments" : [ + { + "type" : "string", + "name" : "--obsm_umap", + "description" : "In which .obsm slot to store the resulting UMAP embedding.\nWhen not specified, UMAP will not be executed.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Performs neighborhood search, leiden clustering and run umap on an integrated embedding.", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "cluster/leiden", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dimred/umap", + "repository" : { + "type" : "local" + } + }, + { + "name" : "neighbors/find_neighbors", + "repository" : { + "type" : "local" + } + }, + { + "name" : "metadata/move_obsm_to_obs", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/neighbors_leiden_umap/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/multiomics/neighbors_leiden_umap", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { leiden } from "${meta.resources_dir}/../../../../nextflow/cluster/leiden/main.nf" +include { umap } from "${meta.resources_dir}/../../../../nextflow/dimred/umap/main.nf" +include { find_neighbors } from "${meta.resources_dir}/../../../../nextflow/neighbors/find_neighbors/main.nf" +include { move_obsm_to_obs } from "${meta.resources_dir}/../../../../nextflow/metadata/move_obsm_to_obs/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + if (!workflow.stubRun) { + assert (state.leiden_resolution.isEmpty() || state.obs_cluster?.trim()): + "When leiden_resolution is set, obs_cluster must also be defined." + } + [id, state] + } + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | find_neighbors.run( + fromState: [ + "input": "input", + "uns_output": "uns_neighbors", + "obsp_distances": "obsp_neighbor_distances", + "obsp_connectivities": "obsp_neighbor_connectivities", + "obsm_input": "obsm_input", + "output": "workflow_output", + "modality": "modality" + ], + toState: ["input": "output"] + ) + | leiden.run( + runIf: {id, state -> state.leiden_resolution}, + fromState: [ + "input": "input", + "obsp_connectivities": "obsp_neighbor_connectivities", + "obsm_name": "obs_cluster", + "resolution": "leiden_resolution", + "modality": "modality", + ], + toState: ["input": "output"] + ) + | move_obsm_to_obs.run( + runIf: {id, state -> state.leiden_resolution}, + fromState: [ + "input": "input", + "output": "workflow_output", + "obsm_key": "obs_cluster", + "modality": "modality", + ], + args: ["output_compression": "gzip"], + toState: ["input": "output"] + ) + | umap.run( + runIf: {id, state -> !state.obsm_umap?.trim()?.isEmpty()}, + fromState: [ + "input": "input", + "output": "workflow_output", + "uns_neighbors": "uns_neighbors", + "obsm_output": "obsm_umap", + "modality": "modality", + ], + args: ["output_compression": "gzip"], + toState: ["input": "output"] + ) + | setState(["output": "input"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow.config new file mode 100644 index 00000000..8a864bd4 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/neighbors_leiden_umap' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Performs neighborhood search, leiden clustering and run umap on an integrated embedding.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_labels.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_schema.json b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_schema.json new file mode 100644 index 00000000..5c8c8699 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/nextflow_schema.json @@ -0,0 +1,136 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "neighbors_leiden_umap", + "description": "Performs neighborhood search, leiden clustering and run umap on an integrated embedding.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "obsm_input": { + "type": "string", + "description": "The key of the embedding to use as input.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "neighbour calculation": { + "title": "Neighbour calculation", + "type": "object", + "description": "No description", + "properties": { + "uns_neighbors": { + "type": "string", + "description": "In which .uns slot to store various neighbor output objects.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "obsp_neighbor_distances": { + "type": "string", + "description": "In which .obsp slot to store the distance matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "obsp_neighbor_connectivities": { + "type": "string", + "description": "In which .obsp slot to store the connectivities matrix between the resulting neighbors.", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "clustering options": { + "title": "Clustering options", + "type": "object", + "description": "No description", + "properties": { + "obs_cluster": { + "type": "string", + "description": "Prefix for the .obs keys under which to add the cluster labels", + "help_text": "Type: `string`, multiple: `False`. " + }, + "leiden_resolution": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Control the coarseness of the clustering", + "help_text": "Type: `double`, multiple: `True`, default: `[1.0]`. ", + "default": [ + 1.0 + ] + } + } + }, + "umap options": { + "title": "Umap options", + "type": "object", + "description": "No description", + "properties": { + "obsm_umap": { + "type": "string", + "description": "In which .obsm slot to store the resulting UMAP embedding.\nWhen not specified, UMAP will not be executed.\n", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/neighbour calculation" + }, + { + "$ref": "#/$defs/clustering options" + }, + { + "$ref": "#/$defs/umap options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/errorstrat_ignore.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/integration_tests.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels_ci.config b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/multiomics/neighbors_leiden_umap/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/multiomics/process_batches/.config.vsh.yaml b/target/nextflow/workflows/multiomics/process_batches/.config.vsh.yaml new file mode 100644 index 00000000..d451d464 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/.config.vsh.yaml @@ -0,0 +1,437 @@ +name: "process_batches" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the sample." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--rna_layer" + description: "Input layer for the gene expression modality. If not specified,\ + \ .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_layer" + description: "Input layer for the antibody capture modality. If not specified,\ + \ .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Highly variable features detection" + arguments: + - type: "string" + name: "--highly_variable_features_var_output" + alternatives: + - "--filter_with_hvg_var_output" + description: "In which .var slot to store a boolean array corresponding to the\ + \ highly variable genes." + info: null + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--highly_variable_features_obs_batch_key" + alternatives: + - "--filter_with_hvg_obs_batch_key" + description: "If specified, highly-variable genes are selected within each batch\ + \ separately and merged. This simple \nprocess avoids the selection of batch-specific\ + \ genes and acts as a lightweight batch correction method.\n" + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "QC metrics calculation options" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes.\n" + info: null + example: + - "ercc,highly_variable" + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + default: + - 50 + - 100 + - 200 + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: "," +- name: "PCA options" + arguments: + - type: "boolean_true" + name: "--pca_overwrite" + description: "Allow overwriting slots for PCA output." + info: null + direction: "input" +- name: "CLR options" + arguments: + - type: "integer" + name: "--clr_axis" + description: "Axis to perform the CLR transformation on." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA Scaling options" + description: "Options for enabling scaling of the log-normalized data to unit variance\ + \ and zero mean.\nThe scaled data will be output a different layer and representation\ + \ with reduced dimensions\nwill be created and stored in addition to the non-scaled\ + \ data.\n" + arguments: + - type: "boolean_true" + name: "--rna_enable_scaling" + description: "Enable scaling for the RNA modality." + info: null + direction: "input" + - type: "string" + name: "--rna_scaling_output_layer" + description: "Output layer where the scaled log-normalized data will be stored." + info: null + default: + - "scaled" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_obsm_output" + description: "Name of the .obsm key where the PCA representation of the log-normalized\n\ + and scaled data is stored.\n" + info: null + default: + - "scaled_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_loadings_varm_output" + description: "Name of the .varm key where the PCA loadings of the log-normalized\ + \ and scaled\ndata is stored.\n" + info: null + default: + - "scaled_pca_loadings" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_variance_uns_output" + description: "Name of the .uns key where the variance and variance ratio will\ + \ be stored as a map.\nThe map will contain two keys: variance and variance_ratio\ + \ respectively.\n" + info: null + default: + - "scaled_pca_variance" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_umap_obsm_output" + description: "Name of the .obsm key where the UMAP representation of the log-normalized\ + \ and scaled data is stored." + info: null + default: + - "scaled_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified,\ + \ do not clip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--rna_scaling_zero_center" + description: "If set, omit zero-centering variables, which allows to handle sparse\ + \ input efficiently.\"" + info: null + direction: "input" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "This workflow serves as an entrypoint into the 'full_pipeline' in order\ + \ to\nre-run the multisample processing and the integration setup. An input .h5mu\ + \ file will \nfirst be split in order to run the multisample processing per modality.\ + \ Next, the modalities\nare merged again and the integration setup pipeline is executed.\ + \ Please note that this workflow\nassumes that samples from multiple pipelines are\ + \ already concatenated. \n" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "concat_test_data" +- type: "file" + path: "10x_5k_anticmv" +info: + test_dependencies: + - name: "workflow_test" + namespace: "test_workflows/multiomics/process_batches" + - name: "workflow_test2" + namespace: "test_workflows/multiomics/process_batches" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "dataflow/merge" + repository: + type: "local" +- name: "workflows/multiomics/split_modalities" + alias: "split_modalities_workflow" + repository: + type: "local" +- name: "workflows/prot/prot_multisample" + repository: + type: "local" +- name: "workflows/rna/rna_multisample" + repository: + type: "local" +- name: "workflows/multiomics/dimensionality_reduction" + alias: "dimensionality_reduction_rna" + repository: + type: "local" +- name: "workflows/multiomics/dimensionality_reduction" + alias: "dimensionality_reduction_prot" + repository: + type: "local" +- name: "workflows/multiomics/dimensionality_reduction" + alias: "dimensionality_reduction_scaling_rna" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/process_batches/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/multiomics/process_batches" + executable: "target/nextflow/workflows/multiomics/process_batches/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/dataflow/merge" + - "target/_private/nextflow/workflows/multiomics/split_modalities" + - "target/nextflow/workflows/prot/prot_multisample" + - "target/nextflow/workflows/rna/rna_multisample" + - "target/nextflow/workflows/multiomics/dimensionality_reduction" + - "target/nextflow/workflows/multiomics/dimensionality_reduction" + - "target/nextflow/workflows/multiomics/dimensionality_reduction" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/multiomics/process_batches/main.nf b/target/nextflow/workflows/multiomics/process_batches/main.nf new file mode 100644 index 00000000..aa1ebffd --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/main.nf @@ -0,0 +1,3942 @@ +// process_batches main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "process_batches", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the sample.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_layer", + "description" : "Input layer for the gene expression modality. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_layer", + "description" : "Input layer for the antibody capture modality. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Highly variable features detection", + "arguments" : [ + { + "type" : "string", + "name" : "--highly_variable_features_var_output", + "alternatives" : [ + "--filter_with_hvg_var_output" + ], + "description" : "In which .var slot to store a boolean array corresponding to the highly variable genes.", + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--highly_variable_features_obs_batch_key", + "alternatives" : [ + "--filter_with_hvg_obs_batch_key" + ], + "description" : "If specified, highly-variable genes are selected within each batch separately and merged. This simple \nprocess avoids the selection of batch-specific genes and acts as a lightweight batch correction method.\n", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "QC metrics calculation options", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "example" : [ + "ercc,highly_variable" + ], + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "default" : [ + 50, + 100, + 200, + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + } + ] + }, + { + "name" : "PCA options", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--pca_overwrite", + "description" : "Allow overwriting slots for PCA output.", + "direction" : "input" + } + ] + }, + { + "name" : "CLR options", + "arguments" : [ + { + "type" : "integer", + "name" : "--clr_axis", + "description" : "Axis to perform the CLR transformation on.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "RNA Scaling options", + "description" : "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--rna_enable_scaling", + "description" : "Enable scaling for the RNA modality.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--rna_scaling_output_layer", + "description" : "Output layer where the scaled log-normalized data will be stored.", + "default" : [ + "scaled" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_obsm_output", + "description" : "Name of the .obsm key where the PCA representation of the log-normalized\nand scaled data is stored.\n", + "default" : [ + "scaled_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_loadings_varm_output", + "description" : "Name of the .varm key where the PCA loadings of the log-normalized and scaled\ndata is stored.\n", + "default" : [ + "scaled_pca_loadings" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_variance_uns_output", + "description" : "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "default" : [ + "scaled_pca_variance" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_umap_obsm_output", + "description" : "Name of the .obsm key where the UMAP representation of the log-normalized and scaled data is stored.", + "default" : [ + "scaled_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_scaling_max_value", + "description" : "Clip (truncate) data to this value after scaling. If not specified, do not clip.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_false", + "name" : "--rna_scaling_zero_center", + "description" : "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\\"", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "This workflow serves as an entrypoint into the 'full_pipeline' in order to\nre-run the multisample processing and the integration setup. An input .h5mu file will \nfirst be split in order to run the multisample processing per modality. Next, the modalities\nare merged again and the integration setup pipeline is executed. Please note that this workflow\nassumes that samples from multiple pipelines are already concatenated. \n", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_anticmv" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "workflow_test", + "namespace" : "test_workflows/multiomics/process_batches" + }, + { + "name" : "workflow_test2", + "namespace" : "test_workflows/multiomics/process_batches" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "dataflow/merge", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_modalities", + "alias" : "split_modalities_workflow", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/prot/prot_multisample", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/rna/rna_multisample", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/dimensionality_reduction", + "alias" : "dimensionality_reduction_rna", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/dimensionality_reduction", + "alias" : "dimensionality_reduction_prot", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/dimensionality_reduction", + "alias" : "dimensionality_reduction_scaling_rna", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/process_batches/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/multiomics/process_batches", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { merge } from "${meta.resources_dir}/../../../../nextflow/dataflow/merge/main.nf" +include { split_modalities as split_modalities_workflow_viashalias } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_modalities/main.nf" +split_modalities_workflow = split_modalities_workflow_viashalias.run(key: "split_modalities_workflow") +include { prot_multisample } from "${meta.resources_dir}/../../../../nextflow/workflows/prot/prot_multisample/main.nf" +include { rna_multisample } from "${meta.resources_dir}/../../../../nextflow/workflows/rna/rna_multisample/main.nf" +include { dimensionality_reduction as dimensionality_reduction_rna_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/dimensionality_reduction/main.nf" +dimensionality_reduction_rna = dimensionality_reduction_rna_viashalias.run(key: "dimensionality_reduction_rna") +include { dimensionality_reduction as dimensionality_reduction_prot_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/dimensionality_reduction/main.nf" +dimensionality_reduction_prot = dimensionality_reduction_prot_viashalias.run(key: "dimensionality_reduction_prot") +include { dimensionality_reduction as dimensionality_reduction_scaling_rna_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/dimensionality_reduction/main.nf" +dimensionality_reduction_scaling_rna = dimensionality_reduction_scaling_rna_viashalias.run(key: "dimensionality_reduction_scaling_rna") + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + multisample_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // The input for this workflow can either be a list of unimodal files + // or a single multimodal file. To destingish between the two, the files will be split either way. + // For multiple unimodal files, the result before or after splitting is identical. + // In both cases, this workflow requires split files. + + // Split must be called on each item of the input list, so split it into multiple events with unique ids + // Unique ids are required to run a component + | flatMap {id, state -> + if (workflow.stubRun) { + return [[id, state]] + } + def newEvents = state.input.withIndex().collect{input_file, index -> + def newState = state + ["input": input_file, "original_id": id] + ["${id}_${index}", newState] + } + newEvents + } + | split_modalities_workflow.run( + fromState: {id, state -> + [ + "input": state.input, + "id": id + ] + }, + toState: [ + "output": "output", + "output_types": "output_types" + ] + ) + // gather the output from split_modalities_workflow + // by reading the output csv (the csv contains 1 line per output file) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + def new_id = state.original_id + "_${dat.name}" // Make a unique ID by appending the modality name. + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types", "original_id"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + + multisample_ch + | toSortedList() + | map{all_input -> + def ids = all_input.collect({it[0]}) + assert ids.clone().unique().size() == ids.size(): "Found duplicate modalities in the input." + } + + // + // Multisample processing + // + def multisample_arguments = [ + "rna": [ + "highly_variable_features_var_output": "highly_variable_features_var_output", + "highly_variable_features_obs_batch_key": "highly_variable_features_obs_batch_key", + "var_qc_metrics": "var_qc_metrics", + "top_n_vars": "top_n_vars", + "layer": "rna_layer", + "enable_scaling": "rna_enable_scaling", + "scaling_output_layer": "rna_scaling_output_layer", + "scaling_max_value": "rna_scaling_max_value", + "scaling_zero_center": "rna_scaling_zero_center", + ], + "prot": [ + "layer": "prot_layer", + "clr_axis": "clr_axis", + ] + ].asImmutable() + + multimodal_ch_known = multisample_ch + | runEach( + components: [rna_multisample, prot_multisample], + filter: { id, state, component -> + state.modality + "_multisample" == component.config.name + }, + fromState: { id, state, component -> + def newState = multisample_arguments.get(state.modality).collectEntries{key_, value_ -> + [key_, state[value_]] + } + newState + ["id": id, "input": state.input] + }, + toState: {id, output, state, component -> + def newState = state + ["input": output.output] + return newState + } + ) + + multimodal_ch_unknown = multisample_ch + | filter { id, state -> state.modality !in multisample_arguments.keySet() } + + multimodal_ch = multimodal_ch_unknown.mix(multimodal_ch_known) + // Remove arguments for multisample processing from state. + | map {id, state -> + def keysToRemove = multisample_arguments.inject([]){currentKeys, modality, stateMapping -> + def newKeys = currentKeys + stateMapping.values() + return newKeys + } + keysToRemove -= ["rna_enable_scaling", "rna_scaling_output_layer"] + def newState = state.findAll{it.key !in keysToRemove } + [id, newState] + } + | view {"After multisample processing: $it"} + + // + // Merging: joining the observations from all modalities together. Everything in 1 file. + // + + // Set the original IDs back into place to use them in groupTuple + | map {id, state -> + def newEvent = [state.id, state] + newEvent + } + // Group the modalities back together per input sample + | groupTuple(by: 0, sort: "hash") + | view {"After toSortedList: $it"} + | map { id, states -> + def new_input = states.collect{it.input} + def modalities = states.collect{it.modality}.unique() + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + [id, new_state + ["input": new_input, "modalities": modalities]] + } + | view {"Input merge channel: $it"} + | merge.run( + fromState: ["input": "input"], + toState: ["input": "output"], + ) + | view {"After merging processing: $it"} + + // Processing of multi-modal multisample MuData files. + // Performs calculations on samples that have *not* been integrated, + // and can be considered a "no-integration" workflow. + output_ch = [dimensionality_reduction_rna, dimensionality_reduction_scaling_rna, dimensionality_reduction_prot].inject(multimodal_ch){ channel_in, component -> + channel_out_integrated = channel_in + | component.run( + runIf: {id, state -> + def reg = state.rna_enable_scaling ? ~/^dimensionality_reduction_(scaling_)?/ : ~/^dimensionality_reduction_/ + def modality_to_check = component.name - reg + state.modalities.contains(modality_to_check) + }, + fromState: { id, state -> + def stateMappings = [ + "dimensionality_reduction_rna": + [ + "id": id, + "input": state.input, + "layer": "log_normalized", + "modality": "rna", + "var_pca_feature_selection": state.highly_variable_features_var_output, // run PCA on highly variable genes only + "pca_overwrite": state.pca_overwrite, + "output": state.workflow_output, + ], + "dimensionality_reduction_scaling_rna": + [ + "id": id, + "input": state.input, + "layer": state.rna_scaling_output_layer, + "modality": "rna", + "var_pca_feature_selection": state.highly_variable_features_var_output, // run PCA on highly variable genes only + "pca_overwrite": state.pca_overwrite, + // extra scaling args + "obsm_pca": state.rna_scaling_pca_obsm_output, + "pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output, + "pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output, + "pca_overwrite": state.pca_overwrite, + "obsm_umap": state.rna_scaling_umap_obsm_output, + "uns_neighbors": "neighbors_scaled", + "obsp_neighbor_connectivities": "connectivities_scaled", + "obsp_neighbor_distances": "distances_scaled", + "output": state.workflow_output, + ], + "dimensionality_reduction_prot": + [ + "id": id, + "input": state.input, + "layer": "clr", + "modality": "prot", + "pca_overwrite": state.pca_overwrite, + "output": state.workflow_output, + ] + ] + return stateMappings[component.name] + }, + toState: ["input": "output"] + ) + } + // At the end of the reduce statement, + // the `toState` closure put the output back into + // into the 'input' slot + | map {id, state -> + [id, ["output": state.input]] + } + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/multiomics/process_batches/nextflow.config b/target/nextflow/workflows/multiomics/process_batches/nextflow.config new file mode 100644 index 00000000..d6bc0458 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/process_batches' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'This workflow serves as an entrypoint into the \'full_pipeline\' in order to\nre-run the multisample processing and the integration setup. An input .h5mu file will \nfirst be split in order to run the multisample processing per modality. Next, the modalities\nare merged again and the integration setup pipeline is executed. Please note that this workflow\nassumes that samples from multiple pipelines are already concatenated. \n' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/multiomics/process_batches/nextflow_labels.config b/target/nextflow/workflows/multiomics/process_batches/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/process_batches/nextflow_schema.json b/target/nextflow/workflows/multiomics/process_batches/nextflow_schema.json new file mode 100644 index 00000000..c8313751 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/nextflow_schema.json @@ -0,0 +1,223 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "process_batches", + "description": "This workflow serves as an entrypoint into the 'full_pipeline' in order to\nre-run the multisample processing and the integration setup. An input .h5mu file will \nfirst be split in order to run the multisample processing per modality. Next, the modalities\nare merged again and the integration setup pipeline is executed. Please note that this workflow\nassumes that samples from multiple pipelines are already concatenated. \n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"input.h5mu\"]`. " + }, + "rna_layer": { + "type": "string", + "description": "Input layer for the gene expression modality", + "help_text": "Type: `string`, multiple: `False`. " + }, + "prot_layer": { + "type": "string", + "description": "Input layer for the antibody capture modality", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "highly variable features detection": { + "title": "Highly variable features detection", + "type": "object", + "description": "No description", + "properties": { + "highly_variable_features_var_output": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to the highly variable genes.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_hvg\"`. ", + "default": "filter_with_hvg" + }, + "highly_variable_features_obs_batch_key": { + "type": "string", + "description": "If specified, highly-variable genes are selected within each batch separately and merged", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + } + } + }, + "qc metrics calculation options": { + "title": "QC metrics calculation options", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "help_text": "Type: `string`, multiple: `True`, default: `[\"filter_with_hvg\"]`, example: `[\"ercc,highly_variable\"]`. ", + "default": [ + "filter_with_hvg" + ] + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`, default: `[50,100,200,500]`. ", + "default": [ + 50, + 100, + 200, + 500 + ] + } + } + }, + "pca options": { + "title": "PCA options", + "type": "object", + "description": "No description", + "properties": { + "pca_overwrite": { + "type": "boolean", + "description": "Allow overwriting slots for PCA output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "clr options": { + "title": "CLR options", + "type": "object", + "description": "No description", + "properties": { + "clr_axis": { + "type": "integer", + "description": "Axis to perform the CLR transformation on.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "rna scaling options": { + "title": "RNA Scaling options", + "type": "object", + "description": "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "properties": { + "rna_enable_scaling": { + "type": "boolean", + "description": "Enable scaling for the RNA modality.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "rna_scaling_output_layer": { + "type": "string", + "description": "Output layer where the scaled log-normalized data will be stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled\"`. ", + "default": "scaled" + }, + "rna_scaling_pca_obsm_output": { + "type": "string", + "description": "Name of the .obsm key where the PCA representation of the log-normalized\nand scaled data is stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca\"`. ", + "default": "scaled_pca" + }, + "rna_scaling_pca_loadings_varm_output": { + "type": "string", + "description": "Name of the .varm key where the PCA loadings of the log-normalized and scaled\ndata is stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca_loadings\"`. ", + "default": "scaled_pca_loadings" + }, + "rna_scaling_pca_variance_uns_output": { + "type": "string", + "description": "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca_variance\"`. ", + "default": "scaled_pca_variance" + }, + "rna_scaling_umap_obsm_output": { + "type": "string", + "description": "Name of the .obsm key where the UMAP representation of the log-normalized and scaled data is stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_umap\"`. ", + "default": "scaled_umap" + }, + "rna_scaling_max_value": { + "type": "number", + "description": "Clip (truncate) data to this value after scaling", + "help_text": "Type: `double`, multiple: `False`. " + }, + "rna_scaling_zero_center": { + "type": "boolean", + "description": "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\"", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/highly variable features detection" + }, + { + "$ref": "#/$defs/qc metrics calculation options" + }, + { + "$ref": "#/$defs/pca options" + }, + { + "$ref": "#/$defs/clr options" + }, + { + "$ref": "#/$defs/rna scaling options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/multiomics/process_batches/utils/errorstrat_ignore.config b/target/nextflow/workflows/multiomics/process_batches/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/process_batches/utils/integration_tests.config b/target/nextflow/workflows/multiomics/process_batches/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/process_batches/utils/labels.config b/target/nextflow/workflows/multiomics/process_batches/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/process_batches/utils/labels_ci.config b/target/nextflow/workflows/multiomics/process_batches/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_batches/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/multiomics/process_samples/.config.vsh.yaml b/target/nextflow/workflows/multiomics/process_samples/.config.vsh.yaml new file mode 100644 index 00000000..d9508516 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/.config.vsh.yaml @@ -0,0 +1,767 @@ +name: "process_samples" +namespace: "workflows/multiomics" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the sample." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_layer" + description: "Input layer for the gene expression modality. If not specified,\ + \ .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prot_layer" + description: "Input layer for the antibody capture modality. If not specified,\ + \ .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--gdo_layer" + description: "Input layer for the guide-derived oligonucleotide (GDO) data. If\ + \ not specified, .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Sample ID options" + description: "Options for adding the id to .obs on the MuData object. Having a sample\ + \ \nid present in a requirement of several components for this pipeline.\n" + arguments: + - type: "boolean" + name: "--add_id_to_obs" + description: "Add the value passed with --id to .obs." + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--add_id_obs_output" + description: ".Obs column to add the sample IDs to. Required and only used when\ + \ \n--add_id_to_obs is set to 'true'\n" + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean" + name: "--add_id_make_observation_keys_unique" + description: "Join the id to the .obs index (.obs_names). \nOnly used when --add_id_to_obs\ + \ is set to 'true'.\n" + info: null + default: + - true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA filtering options" + arguments: + - type: "integer" + name: "--rna_min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rna_max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rna_min_genes_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rna_max_genes_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--rna_min_cells_per_gene" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_min_fraction_mito" + description: "Minimum fraction of UMIs that are mitochondrial." + info: null + example: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_max_fraction_mito" + description: "Maximum fraction of UMIs that are mitochondrial." + info: null + example: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_min_fraction_ribo" + description: "Minimum fraction of UMIs that are mitochondrial." + info: null + example: + - 0.0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_max_fraction_ribo" + description: "Maximum fraction of UMIs that are mitochondrial." + info: null + example: + - 0.2 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_scrublet_doublet_detection" + description: "Skip the scrublet doublet detection step." + info: null + direction: "input" +- name: "CITE-seq filtering options" + arguments: + - type: "integer" + name: "--prot_min_counts" + description: "Minimum number of counts per cell." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--prot_max_counts" + description: "Minimum number of counts per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--prot_min_proteins_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--prot_max_proteins_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 100000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--prot_min_cells_per_protein" + description: "Minimum of non-zero values per protein." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "GDO filtering options" + arguments: + - type: "integer" + name: "--gdo_min_counts" + description: "Minimum number of counts per cell." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gdo_max_counts" + description: "Minimum number of counts per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gdo_min_guides_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gdo_max_guides_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 100000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--gdo_min_cells_per_guide" + description: "Minimum of non-zero values per guide." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Highly variable features detection" + arguments: + - type: "string" + name: "--highly_variable_features_var_output" + alternatives: + - "--filter_with_hvg_var_output" + description: "In which .var slot to store a boolean array corresponding to the\ + \ highly variable genes." + info: null + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--highly_variable_features_obs_batch_key" + alternatives: + - "--filter_with_hvg_obs_batch_key" + description: "If specified, highly-variable genes are selected within each batch\ + \ separately and merged. This simple \nprocess avoids the selection of batch-specific\ + \ genes and acts as a lightweight batch correction method.\n" + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - type: "string" + name: "--var_gene_names" + description: ".var column name to be used to detect mitochondrial/ribosomal genes\ + \ instead of .var_names (default if not set).\nGene names matching with the\ + \ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\ + \ be \nidentified as mitochondrial or ribosomal genes, respectively. \n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_mitochondrial_genes" + description: "In which .var slot to store a boolean array corresponding the mitochondrial\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_mitochondrial_fraction" + description: "When specified, write the fraction of counts originating from mitochondrial\ + \ genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified\ + \ name.\nRequires --var_name_mitochondrial_genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--mitochondrial_gene_regex" + description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\ + By default will detect human and mouse mitochondrial genes from a gene symbol.\n" + info: null + default: + - "^[mM][tT]-" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_ribosomal_genes" + description: "In which .var slot to store a boolean array corresponding the ribosomal\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_ribosomal_fraction" + description: "When specified, write the fraction of counts originating from ribosomal\ + \ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\ + \ name.\nRequires --var_name_ribosomal_genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--ribosomal_gene_regex" + description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\ + By default will detect human and mouse ribosomal genes from a gene symbol.\n" + info: null + default: + - "^[Mm]?[Rr][Pp][LlSs]" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "QC metrics calculation options" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes. Defaults to the combined values specified for\n--var_name_mitochondrial_genes\ + \ and --highly_variable_features_var_output.\n" + info: null + example: + - "ercc,highly_variable" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + default: + - 50 + - 100 + - 200 + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: "," +- name: "PCA options" + arguments: + - type: "boolean_true" + name: "--pca_overwrite" + description: "Allow overwriting slots for PCA output." + info: null + direction: "input" +- name: "CLR options" + arguments: + - type: "integer" + name: "--clr_axis" + description: "Axis to perform the CLR transformation on." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA Scaling options" + description: "Options for enabling scaling of the log-normalized data to unit variance\ + \ and zero mean.\nThe scaled data will be output a different layer and representation\ + \ with reduced dimensions\nwill be created and stored in addition to the non-scaled\ + \ data.\n" + arguments: + - type: "boolean_true" + name: "--rna_enable_scaling" + description: "Enable scaling for the RNA modality." + info: null + direction: "input" + - type: "string" + name: "--rna_scaling_output_layer" + description: "Output layer where the scaled log-normalized data will be stored." + info: null + default: + - "scaled" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_obsm_output" + description: "Name of the .obsm key where the PCA representation of the log-normalized\n\ + and scaled data is stored.\n" + info: null + default: + - "scaled_pca" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_loadings_varm_output" + description: "Name of the .varm key where the PCA loadings of the log-normalized\ + \ and scaled\ndata is stored.\n" + info: null + default: + - "scaled_pca_loadings" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_pca_variance_uns_output" + description: "Name of the .uns key where the variance and variance ratio will\ + \ be stored as a map.\nThe map will contain two keys: variance and variance_ratio\ + \ respectively.\n" + info: null + default: + - "scaled_pca_variance" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rna_scaling_umap_obsm_output" + description: "Name of the .obsm key where the UMAP representation of the log-normalized\ + \ and scaled data is stored." + info: null + default: + - "scaled_umap" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--rna_scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified,\ + \ do not clip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--rna_scaling_zero_center" + description: "If set, omit zero-centering variables, which allows to handle sparse\ + \ input efficiently.\"" + info: null + direction: "input" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline to analyse multiple multiomics samples." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf3" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf4" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf5" +- type: "file" + path: "concat_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +- type: "file" + path: "10x_5k_lung_crispr" +info: + test_dependencies: + - name: "move_layer" + namespace: "transform" + - name: "remove_modality" + namespace: "filter" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "metadata/add_id" + repository: + type: "local" +- name: "workflows/multiomics/split_modalities" + alias: "split_modalities_workflow" + repository: + type: "local" +- name: "dataflow/merge" + repository: + type: "local" +- name: "dataflow/concatenate_h5mu" + repository: + type: "local" +- name: "workflows/rna/rna_singlesample" + repository: + type: "local" +- name: "workflows/prot/prot_singlesample" + repository: + type: "local" +- name: "workflows/gdo/gdo_singlesample" + repository: + type: "local" +- name: "workflows/multiomics/process_batches" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/multiomics/process_samples/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/multiomics/process_samples" + executable: "target/nextflow/workflows/multiomics/process_samples/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/metadata/add_id" + - "target/_private/nextflow/workflows/multiomics/split_modalities" + - "target/nextflow/dataflow/merge" + - "target/nextflow/dataflow/concatenate_h5mu" + - "target/nextflow/workflows/rna/rna_singlesample" + - "target/nextflow/workflows/prot/prot_singlesample" + - "target/nextflow/workflows/gdo/gdo_singlesample" + - "target/nextflow/workflows/multiomics/process_batches" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/multiomics/process_samples/main.nf b/target/nextflow/workflows/multiomics/process_samples/main.nf new file mode 100644 index 00000000..e3cf17b5 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/main.nf @@ -0,0 +1,4366 @@ +// process_samples main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "process_samples", + "namespace" : "workflows/multiomics", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the sample.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_layer", + "description" : "Input layer for the gene expression modality. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prot_layer", + "description" : "Input layer for the antibody capture modality. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--gdo_layer", + "description" : "Input layer for the guide-derived oligonucleotide (GDO) data. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Sample ID options", + "description" : "Options for adding the id to .obs on the MuData object. Having a sample \nid present in a requirement of several components for this pipeline.\n", + "arguments" : [ + { + "type" : "boolean", + "name" : "--add_id_to_obs", + "description" : "Add the value passed with --id to .obs.", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--add_id_obs_output", + "description" : ".Obs column to add the sample IDs to. Required and only used when \n--add_id_to_obs is set to 'true'\n", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean", + "name" : "--add_id_make_observation_keys_unique", + "description" : "Join the id to the .obs index (.obs_names). \nOnly used when --add_id_to_obs is set to 'true'.\n", + "default" : [ + true + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "RNA filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--rna_min_counts", + "description" : "Minimum number of counts captured per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--rna_max_counts", + "description" : "Maximum number of counts captured per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--rna_min_genes_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--rna_max_genes_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 1500000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--rna_min_cells_per_gene", + "description" : "Minimum of non-zero values per gene.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_min_fraction_mito", + "description" : "Minimum fraction of UMIs that are mitochondrial.", + "example" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_max_fraction_mito", + "description" : "Maximum fraction of UMIs that are mitochondrial.", + "example" : [ + 0.2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_min_fraction_ribo", + "description" : "Minimum fraction of UMIs that are mitochondrial.", + "example" : [ + 0.0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_max_fraction_ribo", + "description" : "Maximum fraction of UMIs that are mitochondrial.", + "example" : [ + 0.2 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--skip_scrublet_doublet_detection", + "description" : "Skip the scrublet doublet detection step.", + "direction" : "input" + } + ] + }, + { + "name" : "CITE-seq filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--prot_min_counts", + "description" : "Minimum number of counts per cell.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--prot_max_counts", + "description" : "Minimum number of counts per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--prot_min_proteins_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--prot_max_proteins_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 100000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--prot_min_cells_per_protein", + "description" : "Minimum of non-zero values per protein.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "GDO filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--gdo_min_counts", + "description" : "Minimum number of counts per cell.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gdo_max_counts", + "description" : "Minimum number of counts per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gdo_min_guides_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gdo_max_guides_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 100000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--gdo_min_cells_per_guide", + "description" : "Minimum of non-zero values per guide.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Highly variable features detection", + "arguments" : [ + { + "type" : "string", + "name" : "--highly_variable_features_var_output", + "alternatives" : [ + "--filter_with_hvg_var_output" + ], + "description" : "In which .var slot to store a boolean array corresponding to the highly variable genes.", + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--highly_variable_features_obs_batch_key", + "alternatives" : [ + "--filter_with_hvg_obs_batch_key" + ], + "description" : "If specified, highly-variable genes are selected within each batch separately and merged. This simple \nprocess avoids the selection of batch-specific genes and acts as a lightweight batch correction method.\n", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Mitochondrial & Ribosomal Gene Detection", + "arguments" : [ + { + "type" : "string", + "name" : "--var_gene_names", + "description" : ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively. \n", + "example" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_mitochondrial_genes", + "description" : "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_mitochondrial_fraction", + "description" : "When specified, write the fraction of counts originating from mitochondrial genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified name.\nRequires --var_name_mitochondrial_genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--mitochondrial_gene_regex", + "description" : "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "default" : [ + "^[mM][tT]-" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_ribosomal_genes", + "description" : "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_ribosomal_fraction", + "description" : "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--ribosomal_gene_regex", + "description" : "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "default" : [ + "^[Mm]?[Rr][Pp][LlSs]" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "QC metrics calculation options", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes. Defaults to the combined values specified for\n--var_name_mitochondrial_genes and --highly_variable_features_var_output.\n", + "example" : [ + "ercc,highly_variable" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "default" : [ + 50, + 100, + 200, + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + } + ] + }, + { + "name" : "PCA options", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--pca_overwrite", + "description" : "Allow overwriting slots for PCA output.", + "direction" : "input" + } + ] + }, + { + "name" : "CLR options", + "arguments" : [ + { + "type" : "integer", + "name" : "--clr_axis", + "description" : "Axis to perform the CLR transformation on.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "RNA Scaling options", + "description" : "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--rna_enable_scaling", + "description" : "Enable scaling for the RNA modality.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--rna_scaling_output_layer", + "description" : "Output layer where the scaled log-normalized data will be stored.", + "default" : [ + "scaled" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_obsm_output", + "description" : "Name of the .obsm key where the PCA representation of the log-normalized\nand scaled data is stored.\n", + "default" : [ + "scaled_pca" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_loadings_varm_output", + "description" : "Name of the .varm key where the PCA loadings of the log-normalized and scaled\ndata is stored.\n", + "default" : [ + "scaled_pca_loadings" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_pca_variance_uns_output", + "description" : "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "default" : [ + "scaled_pca_variance" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rna_scaling_umap_obsm_output", + "description" : "Name of the .obsm key where the UMAP representation of the log-normalized and scaled data is stored.", + "default" : [ + "scaled_umap" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--rna_scaling_max_value", + "description" : "Clip (truncate) data to this value after scaling. If not specified, do not clip.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_false", + "name" : "--rna_scaling_zero_center", + "description" : "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\\"", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline to analyse multiple multiomics samples.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf3" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf4" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf5" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + }, + { + "type" : "file", + "path" : "/resources_test/10x_5k_lung_crispr" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "move_layer", + "namespace" : "transform" + }, + { + "name" : "remove_modality", + "namespace" : "filter" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "metadata/add_id", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/split_modalities", + "alias" : "split_modalities_workflow", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/merge", + "repository" : { + "type" : "local" + } + }, + { + "name" : "dataflow/concatenate_h5mu", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/rna/rna_singlesample", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/prot/prot_singlesample", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/gdo/gdo_singlesample", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/multiomics/process_batches", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/multiomics/process_samples/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/multiomics/process_samples", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { add_id } from "${meta.resources_dir}/../../../../nextflow/metadata/add_id/main.nf" +include { split_modalities as split_modalities_workflow_viashalias } from "${meta.resources_dir}/../../../../_private/nextflow/workflows/multiomics/split_modalities/main.nf" +split_modalities_workflow = split_modalities_workflow_viashalias.run(key: "split_modalities_workflow") +include { merge } from "${meta.resources_dir}/../../../../nextflow/dataflow/merge/main.nf" +include { concatenate_h5mu } from "${meta.resources_dir}/../../../../nextflow/dataflow/concatenate_h5mu/main.nf" +include { rna_singlesample } from "${meta.resources_dir}/../../../../nextflow/workflows/rna/rna_singlesample/main.nf" +include { prot_singlesample } from "${meta.resources_dir}/../../../../nextflow/workflows/prot/prot_singlesample/main.nf" +include { gdo_singlesample } from "${meta.resources_dir}/../../../../nextflow/workflows/gdo/gdo_singlesample/main.nf" +include { process_batches } from "${meta.resources_dir}/../../../../nextflow/workflows/multiomics/process_batches/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + modalities_ch = input_ch + // Make sure there is not conflict between the output from this workflow + // And the output from any of the components + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // If requested to be detected, make sure the mitochondrial and ribosomal genes + // are added to the input of the qc metrics calculation + | map {id, state -> + def var_qc_default = [state.highly_variable_features_var_output] + if (state.var_name_mitochondrial_genes) { + var_qc_default.add(state.var_name_mitochondrial_genes) + } + if (state.var_name_ribosomal_genes) { + var_qc_default.add(state.var_name_ribosomal_genes) + } + def newState = state + ["var_qc_metrics": var_qc_default.join(",")] + [id, newState] + } + // If requested, add the id of the events (samples) a column in .obs. + // Also allows to make .obs_names (the .obs index) unique, by prefixing the values with an unique id per .h5mu file. + // The latter is usefull to avoid duplicate observations during concatenation. + | add_id.run( + filter: {id, state -> state.add_id_to_obs }, + fromState: {id, state -> + def newState = [ + "input": state.input, + "input_id": id, + "make_observation_keys_unique": state.add_id_make_observation_keys_unique, + "obs_output": state.add_id_obs_output, + "add_id_to_obs": state.add_id_to_obs + ] + newState + }, + toState: {id, output, state -> + def keysToRemove = ["add_id_to_obs", "add_id_obs_output", "add_id_make_observation_keys_unique"] + def newState = state.findAll{it.key !in keysToRemove} + newState + ["input": output.output] + } + ) + | split_modalities_workflow.run( + fromState: {id, state -> + def newState = ["input": state.input, "id": id] + }, + toState: ["output": "output", "output_types": "output_types"] + ) + | flatMap {id, state -> + def outputDir = state.output + def types = readCsv(state.output_types.toUriString()) + + types.collect{ dat -> + // def new_id = id + "_" + dat.name + def new_id = id // it's okay because the channel will get split up anyways + def new_data = outputDir.resolve(dat.filename) + [ new_id, state + ["input": new_data, modality: dat.name]] + } + } + // Remove arguments from split modalities from state + | map {id, state -> + def keysToRemove = ["output_types"] + def newState = state.findAll{it.key !in keysToRemove} + [id, newState] + } + | view {"After splitting modalities: $it"} + + + // + // Singlesample processing + // + def singlesample_arguments = [ + "rna": [ + "min_counts": "rna_min_counts", + "max_counts": "rna_max_counts", + "min_genes_per_cell": "rna_min_genes_per_cell", + "max_genes_per_cell": "rna_max_genes_per_cell", + "min_cells_per_gene": "rna_min_cells_per_gene", + "min_fraction_mito": "rna_min_fraction_mito", + "max_fraction_mito": "rna_max_fraction_mito", + "var_name_mitochondrial_genes": "var_name_mitochondrial_genes", + "obs_name_mitochondrial_fraction": "obs_name_mitochondrial_fraction", + "var_gene_names": "var_gene_names", + "mitochondrial_gene_regex": "mitochondrial_gene_regex", + "min_fraction_ribo": "rna_min_fraction_ribo", + "max_fraction_ribo": "rna_max_fraction_ribo", + "var_name_ribosomal_genes": "var_name_ribosomal_genes", + "obs_name_ribosomal_fraction": "obs_name_ribosomal_fraction", + "ribosomal_gene_regex": "ribosomal_gene_regex", + "layer": "rna_layer", + "skip_scrublet_doublet_detection": "skip_scrublet_doublet_detection" + ], + "prot": [ + "min_counts": "prot_min_counts", + "max_counts": "prot_max_counts", + "min_proteins_per_cell": "prot_min_proteins_per_cell", + "max_proteins_per_cell": "prot_max_proteins_per_cell", + "min_cells_per_protein": "prot_min_cells_per_protein", + "layer": "prot_layer", + ], + "gdo": [ + "min_counts": "gdo_min_counts", + "max_counts": "gdo_max_counts", + "min_guides_per_cell": "gdo_min_guides_per_cell", + "max_guides_per_cell": "gdo_max_guides_per_cell", + "min_cells_per_guide": "gdo_min_cells_per_guide", + "layer": "gdo_layer", + ], + ].asImmutable() + + multisample_ch_known = modalities_ch + // run the singlesample processing + | runEach( + components: [rna_singlesample, prot_singlesample, gdo_singlesample], + filter: { id, state, component -> + state.modality + "_singlesample" == component.config.name + }, + fromState: { id, state, component -> + def newState = singlesample_arguments.get(state.modality).collectEntries{key_, value_ -> + [key_, state[value_]] + } + return newState + ["id": id, "input": state.input] + }, + toState: ["input": "output"], + ) + + multisample_ch_unknown = modalities_ch + | filter{id, state -> state.modality !in singlesample_arguments.keySet()} + + output_ch = multisample_ch_unknown.mix(multisample_ch_known) + // Remove arguments for singlesample processing from state. + | map {id, state -> + def keysToRemove = singlesample_arguments.inject([]){currentKeys, modality, stateMapping -> + currentKeys += stateMapping.values() + } + def allwayskeep = ["gdo_layer", "rna_layer", "prot_layer", "workflow_output"] + def newState = state.findAll{(it.key !in keysToRemove + ["id"]) || (it.key in allwayskeep)} + [id, newState] + } + | view {"After singlesample processing: $it"} + + // + // Concatenation: join observations across samples together per modality. + // + // Concatenate multiple single-sample unimodal MuData objects back into several multi-sample files. + // One multi-sample MuData file is created per modality. + // + | map { id, state -> // Put modality name in first element so that we can group on it + [state.modality, id, state] + } + | groupTuple(by: 0, sort: "hash") + | view {"After groupTuple: $it"} + | map { modality, old_ids, states -> + def new_id = "combined_$modality" + // keys in the new state that should not have a unique value across samples + def new_state_non_unique_values = [ + "input": states.collect{it.input}, + "input_id": old_ids, + "_meta": ["join_id": old_ids[0]] + ] + // Gather the keys from the different states, + // one state might contain more keys compared to another (so create a set) + def all_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input_id", "input", "_meta"]) + // Create the new state from the keys, values should be the same across samples + def new_state = all_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across samples. Argument name: $argument_name, \ + argument value: $argument_values" + // take the unique value from the set (there is only one) + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + def final_state = new_state_non_unique_values + new_state + [new_id, final_state] + } + | concatenate_h5mu.run( + fromState: [ + "input": "input", + "input_id": "input_id" + ], + toState: {id, output, state -> + def keysToRemove = ["input_id"] + def newState = state.findAll{it.key !in keysToRemove} + newState + ["input": output.output] + }, + ) + + | view {"After concatenation: $it"} + | toSortedList() + | map {modalities_states -> + def states = modalities_states.collect{it[1]} + def new_input = states.collect{it.input} + def join_id = states[0]._meta.join_id + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(["output", "input", "modality", "_meta"]) + def new_state = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + ["merged", new_state + ["input": new_input, "_meta": ["join_id": join_id]]] + } + | process_batches.run( + fromState: {id, state -> + [ + "id": id, + "input": state.input, + "output": state.workflow_output, + "highly_variable_features_var_output": state.highly_variable_features_var_output, + "highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key, + "var_qc_metrics": state.var_qc_metrics, + "top_n_vars": state.top_n_vars, + "pca_overwrite": state.pca_overwrite, + "rna_layer": state.rna_layer, + "prot_layer": state.prot_layer, + "clr_axis": state.clr_axis, + "rna_enable_scaling": state.rna_enable_scaling, + "rna_scaling_output_layer": state.rna_scaling_output_layer, + "rna_scaling_pca_obsm_output": state.rna_scaling_pca_obsm_output, + "rna_scaling_pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output, + "rna_scaling_pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output, + "rna_scaling_umap_obsm_output": state.rna_scaling_umap_obsm_output, + "rna_scaling_max_value": state.rna_scaling_max_value, + "rna_scaling_zero_center": state.rna_scaling_zero_center, + ] + }, + toState: {id, output, state -> + [ + "output": output.output, + "_meta": state._meta, + ] + } + ) + | view {"After process_batches: $it"} + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/multiomics/process_samples/nextflow.config b/target/nextflow/workflows/multiomics/process_samples/nextflow.config new file mode 100644 index 00000000..d7fb93b5 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/multiomics/process_samples' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline to analyse multiple multiomics samples.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/multiomics/process_samples/nextflow_labels.config b/target/nextflow/workflows/multiomics/process_samples/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/process_samples/nextflow_schema.json b/target/nextflow/workflows/multiomics/process_samples/nextflow_schema.json new file mode 100644 index 00000000..dab191ec --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/nextflow_schema.json @@ -0,0 +1,428 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "process_samples", + "description": "A pipeline to analyse multiple multiomics samples.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "rna_layer": { + "type": "string", + "description": "Input layer for the gene expression modality", + "help_text": "Type: `string`, multiple: `False`. " + }, + "prot_layer": { + "type": "string", + "description": "Input layer for the antibody capture modality", + "help_text": "Type: `string`, multiple: `False`. " + }, + "gdo_layer": { + "type": "string", + "description": "Input layer for the guide-derived oligonucleotide (GDO) data", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "sample id options": { + "title": "Sample ID options", + "type": "object", + "description": "Options for adding the id to .obs on the MuData object. Having a sample \nid present in a requirement of several components for this pipeline.\n", + "properties": { + "add_id_to_obs": { + "type": "boolean", + "description": "Add the value passed with --id to .obs.", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + }, + "add_id_obs_output": { + "type": "string", + "description": ".Obs column to add the sample IDs to", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "add_id_make_observation_keys_unique": { + "type": "boolean", + "description": "Join the id to the .obs index (.obs_names)", + "help_text": "Type: `boolean`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "rna filtering options": { + "title": "RNA filtering options", + "type": "object", + "description": "No description", + "properties": { + "rna_min_counts": { + "type": "integer", + "description": "Minimum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "rna_max_counts": { + "type": "integer", + "description": "Maximum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "rna_min_genes_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "rna_max_genes_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `1500000`. " + }, + "rna_min_cells_per_gene": { + "type": "integer", + "description": "Minimum of non-zero values per gene.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "rna_min_fraction_mito": { + "type": "number", + "description": "Minimum fraction of UMIs that are mitochondrial.", + "help_text": "Type: `double`, multiple: `False`, example: `0.0`. " + }, + "rna_max_fraction_mito": { + "type": "number", + "description": "Maximum fraction of UMIs that are mitochondrial.", + "help_text": "Type: `double`, multiple: `False`, example: `0.2`. " + }, + "rna_min_fraction_ribo": { + "type": "number", + "description": "Minimum fraction of UMIs that are mitochondrial.", + "help_text": "Type: `double`, multiple: `False`, example: `0.0`. " + }, + "rna_max_fraction_ribo": { + "type": "number", + "description": "Maximum fraction of UMIs that are mitochondrial.", + "help_text": "Type: `double`, multiple: `False`, example: `0.2`. " + }, + "skip_scrublet_doublet_detection": { + "type": "boolean", + "description": "Skip the scrublet doublet detection step.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "cite-seq filtering options": { + "title": "CITE-seq filtering options", + "type": "object", + "description": "No description", + "properties": { + "prot_min_counts": { + "type": "integer", + "description": "Minimum number of counts per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "prot_max_counts": { + "type": "integer", + "description": "Minimum number of counts per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "prot_min_proteins_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "prot_max_proteins_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `100000000`. " + }, + "prot_min_cells_per_protein": { + "type": "integer", + "description": "Minimum of non-zero values per protein.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + } + } + }, + "gdo filtering options": { + "title": "GDO filtering options", + "type": "object", + "description": "No description", + "properties": { + "gdo_min_counts": { + "type": "integer", + "description": "Minimum number of counts per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "gdo_max_counts": { + "type": "integer", + "description": "Minimum number of counts per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "gdo_min_guides_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "gdo_max_guides_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `100000000`. " + }, + "gdo_min_cells_per_guide": { + "type": "integer", + "description": "Minimum of non-zero values per guide.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + } + } + }, + "highly variable features detection": { + "title": "Highly variable features detection", + "type": "object", + "description": "No description", + "properties": { + "highly_variable_features_var_output": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to the highly variable genes.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_hvg\"`. ", + "default": "filter_with_hvg" + }, + "highly_variable_features_obs_batch_key": { + "type": "string", + "description": "If specified, highly-variable genes are selected within each batch separately and merged", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + } + } + }, + "mitochondrial & ribosomal gene detection": { + "title": "Mitochondrial & Ribosomal Gene Detection", + "type": "object", + "description": "No description", + "properties": { + "var_gene_names": { + "type": "string", + "description": ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_symbol\"`. " + }, + "var_name_mitochondrial_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_mitochondrial_fraction": { + "type": "string", + "description": "When specified, write the fraction of counts originating from mitochondrial genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified name.\nRequires --var_name_mitochondrial_genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "mitochondrial_gene_regex": { + "type": "string", + "description": "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[mM][tT]-\"`. ", + "default": "^[mM][tT]-" + }, + "var_name_ribosomal_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_ribosomal_fraction": { + "type": "string", + "description": "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "ribosomal_gene_regex": { + "type": "string", + "description": "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[Mm]?[Rr][Pp][LlSs]\"`. ", + "default": "^[Mm]?[Rr][Pp][LlSs]" + } + } + }, + "qc metrics calculation options": { + "title": "QC metrics calculation options", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes", + "help_text": "Type: `string`, multiple: `True`, example: `[\"ercc,highly_variable\"]`. " + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`, default: `[50,100,200,500]`. ", + "default": [ + 50, + 100, + 200, + 500 + ] + } + } + }, + "pca options": { + "title": "PCA options", + "type": "object", + "description": "No description", + "properties": { + "pca_overwrite": { + "type": "boolean", + "description": "Allow overwriting slots for PCA output.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "clr options": { + "title": "CLR options", + "type": "object", + "description": "No description", + "properties": { + "clr_axis": { + "type": "integer", + "description": "Axis to perform the CLR transformation on.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "rna scaling options": { + "title": "RNA Scaling options", + "type": "object", + "description": "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "properties": { + "rna_enable_scaling": { + "type": "boolean", + "description": "Enable scaling for the RNA modality.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "rna_scaling_output_layer": { + "type": "string", + "description": "Output layer where the scaled log-normalized data will be stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled\"`. ", + "default": "scaled" + }, + "rna_scaling_pca_obsm_output": { + "type": "string", + "description": "Name of the .obsm key where the PCA representation of the log-normalized\nand scaled data is stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca\"`. ", + "default": "scaled_pca" + }, + "rna_scaling_pca_loadings_varm_output": { + "type": "string", + "description": "Name of the .varm key where the PCA loadings of the log-normalized and scaled\ndata is stored.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca_loadings\"`. ", + "default": "scaled_pca_loadings" + }, + "rna_scaling_pca_variance_uns_output": { + "type": "string", + "description": "Name of the .uns key where the variance and variance ratio will be stored as a map.\nThe map will contain two keys: variance and variance_ratio respectively.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_pca_variance\"`. ", + "default": "scaled_pca_variance" + }, + "rna_scaling_umap_obsm_output": { + "type": "string", + "description": "Name of the .obsm key where the UMAP representation of the log-normalized and scaled data is stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled_umap\"`. ", + "default": "scaled_umap" + }, + "rna_scaling_max_value": { + "type": "number", + "description": "Clip (truncate) data to this value after scaling", + "help_text": "Type: `double`, multiple: `False`. " + }, + "rna_scaling_zero_center": { + "type": "boolean", + "description": "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\"", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/sample id options" + }, + { + "$ref": "#/$defs/rna filtering options" + }, + { + "$ref": "#/$defs/cite-seq filtering options" + }, + { + "$ref": "#/$defs/gdo filtering options" + }, + { + "$ref": "#/$defs/highly variable features detection" + }, + { + "$ref": "#/$defs/mitochondrial & ribosomal gene detection" + }, + { + "$ref": "#/$defs/qc metrics calculation options" + }, + { + "$ref": "#/$defs/pca options" + }, + { + "$ref": "#/$defs/clr options" + }, + { + "$ref": "#/$defs/rna scaling options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/multiomics/process_samples/utils/errorstrat_ignore.config b/target/nextflow/workflows/multiomics/process_samples/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/process_samples/utils/integration_tests.config b/target/nextflow/workflows/multiomics/process_samples/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/multiomics/process_samples/utils/labels.config b/target/nextflow/workflows/multiomics/process_samples/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/multiomics/process_samples/utils/labels_ci.config b/target/nextflow/workflows/multiomics/process_samples/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/multiomics/process_samples/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/prot/prot_multisample/.config.vsh.yaml b/target/nextflow/workflows/prot/prot_multisample/.config.vsh.yaml new file mode 100644 index 00000000..304cb6c3 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/.config.vsh.yaml @@ -0,0 +1,336 @@ +name: "prot_multisample" +namespace: "workflows/prot" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the concatenated file" + info: null + example: + - "concatenated" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the samples." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to use. If not specified, .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "QC metrics calculation options" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n" + info: null + example: + - "ercc,highly_variable" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + default: + - 50 + - 100 + - 200 + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "string" + name: "--output_obs_num_nonzero_vars" + description: "Name of column in .obs describing, for each observation, the number\ + \ of stored values\n(including explicit zeroes). In other words, the name of\ + \ the column that counts\nfor each row the number of columns that contain data.\n" + info: null + default: + - "num_nonzero_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_total_counts_vars" + description: "Name of the column for .obs describing, for each observation (row),\n\ + the sum of the stored values in the columns.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_num_nonzero_obs" + description: "Name of column describing, for each feature, the number of stored\ + \ values\n(including explicit zeroes). In other words, the name of the column\ + \ that counts\nfor each column the number of rows that contain data.\n" + info: null + default: + - "num_nonzero_obs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_total_counts_obs" + description: "Name of the column in .var describing, for each feature (column),\n\ + the sum of the stored values in the rows.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_obs_mean" + description: "Name of the column in .obs providing the mean of the values in each\ + \ row.\n" + info: null + default: + - "obs_mean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_pct_dropout" + description: "Name of the column in .obs providing for each feature the percentage\ + \ of\nobservations the feature does not appear on (i.e. is missing). Same as\ + \ `--output_var_num_nonzero_obs`\nbut percentage based.\n" + info: null + default: + - "pct_dropout" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "CLR arguments" + arguments: + - type: "integer" + name: "--clr_axis" + description: "Axis across which CLR is performed." + info: null + default: + - 0 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Processing unimodal multi-sample ADT data." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + image: "/images/concepts/fig_workflow_multiomics_adt_multisample.svg" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "transform/clr" + repository: + type: "local" +- name: "workflows/qc/qc" + alias: "prot_qc" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/prot/prot_multisample/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/prot/prot_multisample" + executable: "target/nextflow/workflows/prot/prot_multisample/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/transform/clr" + - "target/nextflow/workflows/qc/qc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/prot/prot_multisample/main.nf b/target/nextflow/workflows/prot/prot_multisample/main.nf new file mode 100644 index 00000000..870ae112 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/main.nf @@ -0,0 +1,3630 @@ +// prot_multisample main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "prot_multisample", + "namespace" : "workflows/prot", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the concatenated file", + "example" : [ + "concatenated" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the samples.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to use. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "QC metrics calculation options", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n", + "example" : [ + "ercc,highly_variable" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "default" : [ + 50, + 100, + 200, + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "string", + "name" : "--output_obs_num_nonzero_vars", + "description" : "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n", + "default" : [ + "num_nonzero_vars" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_total_counts_vars", + "description" : "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_num_nonzero_obs", + "description" : "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n", + "default" : [ + "num_nonzero_obs" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_total_counts_obs", + "description" : "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_obs_mean", + "description" : "Name of the column in .obs providing the mean of the values in each row.\n", + "default" : [ + "obs_mean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_pct_dropout", + "description" : "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n", + "default" : [ + "pct_dropout" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "CLR arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--clr_axis", + "description" : "Axis across which CLR is performed.", + "default" : [ + 0 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Processing unimodal multi-sample ADT data.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "image" : "/images/concepts/fig_workflow_multiomics_adt_multisample.svg" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "transform/clr", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/qc/qc", + "alias" : "prot_qc", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/prot/prot_multisample/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/prot/prot_multisample", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { clr } from "${meta.resources_dir}/../../../../nextflow/transform/clr/main.nf" +include { qc as prot_qc_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/qc/qc/main.nf" +prot_qc = prot_qc_viashalias.run(key: "prot_qc") + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | clr.run( + fromState: [ + "input": "input", + "input_layer": "layer", + "clr_axis": "clr_axis", + ], + toState: ["input": "output"], + args: [ + output_layer: "clr", + modality: "prot" + ] + ) + | prot_qc.run( + // TODO: remove when viash 0.8.3 is released + key: "prot_qc", + fromState: { id, state -> + def newState = [ + "id": id, + "output": state.workflow_output, + "input": state.input, + "top_n_vars": state.top_n_vars, + "var_qc_metrics": null, + "input_layer": state.layer, // Use the non-transformed layer + "modality": "prot", + "var_name_mitochondrial_genes": null, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "num_nonzeoutput_var_num_nonzero_obsro_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.pct_dropout + ] + newState + } + ) + | setState(["output"]) + + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/prot/prot_multisample/nextflow.config b/target/nextflow/workflows/prot/prot_multisample/nextflow.config new file mode 100644 index 00000000..9596f6d5 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/prot/prot_multisample' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Processing unimodal multi-sample ADT data.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/prot/prot_multisample/nextflow_labels.config b/target/nextflow/workflows/prot/prot_multisample/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/prot/prot_multisample/nextflow_schema.json b/target/nextflow/workflows/prot/prot_multisample/nextflow_schema.json new file mode 100644 index 00000000..64027416 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/nextflow_schema.json @@ -0,0 +1,153 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "prot_multisample", + "description": "Processing unimodal multi-sample ADT data.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the concatenated file", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"concatenated\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the samples.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "qc metrics calculation options": { + "title": "QC metrics calculation options", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes", + "help_text": "Type: `string`, multiple: `True`, example: `[\"ercc,highly_variable\"]`. " + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`, default: `[50,100,200,500]`. ", + "default": [ + 50, + 100, + 200, + 500 + ] + }, + "output_obs_num_nonzero_vars": { + "type": "string", + "description": "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_vars\"`. ", + "default": "num_nonzero_vars" + }, + "output_obs_total_counts_vars": { + "type": "string", + "description": "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_num_nonzero_obs": { + "type": "string", + "description": "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_obs\"`. ", + "default": "num_nonzero_obs" + }, + "output_var_total_counts_obs": { + "type": "string", + "description": "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_obs_mean": { + "type": "string", + "description": "Name of the column in .obs providing the mean of the values in each row.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"obs_mean\"`. ", + "default": "obs_mean" + }, + "output_var_pct_dropout": { + "type": "string", + "description": "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e", + "help_text": "Type: `string`, multiple: `False`, default: `\"pct_dropout\"`. ", + "default": "pct_dropout" + } + } + }, + "clr arguments": { + "title": "CLR arguments", + "type": "object", + "description": "No description", + "properties": { + "clr_axis": { + "type": "integer", + "description": "Axis across which CLR is performed.", + "help_text": "Type: `integer`, multiple: `False`, default: `0`. ", + "default": 0 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/qc metrics calculation options" + }, + { + "$ref": "#/$defs/clr arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/prot/prot_multisample/utils/errorstrat_ignore.config b/target/nextflow/workflows/prot/prot_multisample/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/prot/prot_multisample/utils/integration_tests.config b/target/nextflow/workflows/prot/prot_multisample/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/prot/prot_multisample/utils/labels.config b/target/nextflow/workflows/prot/prot_multisample/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/prot/prot_multisample/utils/labels_ci.config b/target/nextflow/workflows/prot/prot_multisample/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/prot/prot_multisample/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/prot/prot_singlesample/.config.vsh.yaml b/target/nextflow/workflows/prot/prot_singlesample/.config.vsh.yaml new file mode 100644 index 00000000..830383d5 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/.config.vsh.yaml @@ -0,0 +1,307 @@ +name: "prot_singlesample" +namespace: "workflows/prot" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to start from. By default, .X will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Filtering options" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_proteins_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_proteins_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_protein" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Processing unimodal single-sample CITE-seq data." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + image: "/images/concepts/fig_workflow_multiomics_adt_singlesample.svg" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "filter/filter_with_counts" + repository: + type: "local" +- name: "filter/do_filter" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/prot/prot_singlesample/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/prot/prot_singlesample" + executable: "target/nextflow/workflows/prot/prot_singlesample/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/filter/filter_with_counts" + - "target/nextflow/filter/do_filter" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/prot/prot_singlesample/main.nf b/target/nextflow/workflows/prot/prot_singlesample/main.nf new file mode 100644 index 00000000..27e2053c --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/main.nf @@ -0,0 +1,3623 @@ +// prot_singlesample main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Robrecht Cannoodt (author, maintainer) +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "prot_singlesample", + "namespace" : "workflows/prot", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to start from. By default, .X will be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of counts captured per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_counts", + "description" : "Maximum number of counts captured per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_proteins_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_proteins_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 1500000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_per_protein", + "description" : "Minimum of non-zero values per gene.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Processing unimodal single-sample CITE-seq data.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "image" : "/images/concepts/fig_workflow_multiomics_adt_singlesample.svg" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "filter/filter_with_counts", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/do_filter", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/prot/prot_singlesample/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/prot/prot_singlesample", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { filter_with_counts } from "${meta.resources_dir}/../../../../nextflow/filter/filter_with_counts/main.nf" +include { do_filter } from "${meta.resources_dir}/../../../../nextflow/filter/do_filter/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // filtering + | filter_with_counts.run( + key: "prot_filter_with_counts", + fromState: { id, state -> + def newState = [ + "input": state.input, + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_proteins_per_cell, + "max_genes_per_cell": state.max_proteins_per_cell, + "min_cells_per_gene": state.min_cells_per_protein, + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "modality": "prot", + "layer": state.layer, + ] + newState + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "prot_do_filter", + fromState : { id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def newState = [ + "input": state.input, + "obs_filter": "filter_with_counts", + "modality": "prot", + "var_filter": "filter_with_counts", + "output_compression": "gzip", + "output": state.workflow_output + ] + return newState + }, + toState: ["output": "output"], + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/prot/prot_singlesample/nextflow.config b/target/nextflow/workflows/prot/prot_singlesample/nextflow.config new file mode 100644 index 00000000..aeaa5b1b --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/prot/prot_singlesample' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Processing unimodal single-sample CITE-seq data.' + author = 'Dries De Maeyer, Robrecht Cannoodt, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/prot/prot_singlesample/nextflow_labels.config b/target/nextflow/workflows/prot/prot_singlesample/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/prot/prot_singlesample/nextflow_schema.json b/target/nextflow/workflows/prot/prot_singlesample/nextflow_schema.json new file mode 100644 index 00000000..988222c3 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/nextflow_schema.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "prot_singlesample", + "description": "Processing unimodal single-sample CITE-seq data.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "Input layer to start from", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "filtering options": { + "title": "Filtering options", + "type": "object", + "description": "No description", + "properties": { + "min_counts": { + "type": "integer", + "description": "Minimum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_counts": { + "type": "integer", + "description": "Maximum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "min_proteins_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_proteins_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `1500000`. " + }, + "min_cells_per_protein": { + "type": "integer", + "description": "Minimum of non-zero values per gene.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/filtering options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/prot/prot_singlesample/utils/errorstrat_ignore.config b/target/nextflow/workflows/prot/prot_singlesample/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/prot/prot_singlesample/utils/integration_tests.config b/target/nextflow/workflows/prot/prot_singlesample/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/prot/prot_singlesample/utils/labels.config b/target/nextflow/workflows/prot/prot_singlesample/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/prot/prot_singlesample/utils/labels_ci.config b/target/nextflow/workflows/prot/prot_singlesample/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/prot/prot_singlesample/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/qc/qc/.config.vsh.yaml b/target/nextflow/workflows/qc/qc/.config.vsh.yaml new file mode 100644 index 00000000..902055a1 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/.config.vsh.yaml @@ -0,0 +1,416 @@ +name: "qc" +namespace: "workflows/qc" +version: "main" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the sample." + info: null + example: + - "input.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Which modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Layer to calculate qc metrics for." + info: null + example: + - "raw_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - type: "string" + name: "--var_gene_names" + description: ".var column name to be used to detect mitochondrial/ribosomal genes\ + \ instead of .var_names (default if not set).\nGene names matching with the\ + \ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\ + \ be \nidentified as mitochondrial or ribosomal genes, respectively.\n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_mitochondrial_genes" + description: "In which .var slot to store a boolean array corresponding the mitochondrial\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_mitochondrial_fraction" + description: ".Obs slot to store the fraction of reads found to be mitochondrial.\ + \ Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--mitochondrial_gene_regex" + description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\ + By default will detect human and mouse mitochondrial genes from a gene symbol.\n" + info: null + default: + - "^[mM][tT]-" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_ribosomal_genes" + description: "In which .var slot to store a boolean array corresponding the ribosomal\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_ribosomal_fraction" + description: "When specified, write the fraction of counts originating from ribosomal\ + \ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\ + \ name.\nRequires --var_name_ribosomal_genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--ribosomal_gene_regex" + description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\ + By default will detect human and mouse ribosomal genes from a gene symbol.\n" + info: null + default: + - "^[Mm]?[Rr][Pp][LlSs]" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "QC metrics calculation options" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n" + info: null + example: + - "ercc,highly_variable" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + default: + - 50 + - 100 + - 200 + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "string" + name: "--output_obs_num_nonzero_vars" + description: "Name of column in .obs describing, for each observation, the number\ + \ of stored values\n(including explicit zeroes). In other words, the name of\ + \ the column that counts\nfor each row the number of columns that contain data.\n" + info: null + default: + - "num_nonzero_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_total_counts_vars" + description: "Name of the column for .obs describing, for each observation (row),\n\ + the sum of the stored values in the columns.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_num_nonzero_obs" + description: "Name of column describing, for each feature, the number of stored\ + \ values\n(including explicit zeroes). In other words, the name of the column\ + \ that counts\nfor each column the number of rows that contain data.\n" + info: null + default: + - "num_nonzero_obs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_total_counts_obs" + description: "Name of the column in .var describing, for each feature (column),\n\ + the sum of the stored values in the rows.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_obs_mean" + description: "Name of the column in .obs providing the mean of the values in each\ + \ row.\n" + info: null + default: + - "obs_mean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_pct_dropout" + description: "Name of the column in .obs providing for each feature the percentage\ + \ of\nobservations the feature does not appear on (i.e. is missing). Same as\ + \ `--output_var_num_nonzero_obs`\nbut percentage based.\n" + info: null + default: + - "pct_dropout" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "A pipeline to add basic qc statistics to a MuData " +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "concat_test_data" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + test_dependencies: + - name: "qc_test" + namespace: "test_workflows/qc" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "metadata/grep_annotation_column" + repository: + type: "local" +- name: "qc/calculate_qc_metrics" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/qc/qc/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/qc/qc" + executable: "target/nextflow/workflows/qc/qc/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/metadata/grep_annotation_column" + - "target/nextflow/qc/calculate_qc_metrics" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/qc/qc/main.nf b/target/nextflow/workflows/qc/qc/main.nf new file mode 100644 index 00000000..c282f008 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/main.nf @@ -0,0 +1,3767 @@ +// qc main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "qc", + "namespace" : "workflows/qc", + "version" : "main", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the sample.", + "example" : [ + "input.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Which modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Layer to calculate qc metrics for.", + "example" : [ + "raw_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Mitochondrial & Ribosomal Gene Detection", + "arguments" : [ + { + "type" : "string", + "name" : "--var_gene_names", + "description" : ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n", + "example" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_mitochondrial_genes", + "description" : "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_mitochondrial_fraction", + "description" : ".Obs slot to store the fraction of reads found to be mitochondrial. Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--mitochondrial_gene_regex", + "description" : "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "default" : [ + "^[mM][tT]-" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_ribosomal_genes", + "description" : "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_ribosomal_fraction", + "description" : "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--ribosomal_gene_regex", + "description" : "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "default" : [ + "^[Mm]?[Rr][Pp][LlSs]" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "QC metrics calculation options", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n", + "example" : [ + "ercc,highly_variable" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "default" : [ + 50, + 100, + 200, + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "string", + "name" : "--output_obs_num_nonzero_vars", + "description" : "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n", + "default" : [ + "num_nonzero_vars" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_total_counts_vars", + "description" : "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_num_nonzero_obs", + "description" : "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n", + "default" : [ + "num_nonzero_obs" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_total_counts_obs", + "description" : "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_obs_mean", + "description" : "Name of the column in .obs providing the mean of the values in each row.\n", + "default" : [ + "obs_mean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_pct_dropout", + "description" : "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n", + "default" : [ + "pct_dropout" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "A pipeline to add basic qc statistics to a MuData ", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "test_dependencies" : [ + { + "name" : "qc_test", + "namespace" : "test_workflows/qc" + } + ] + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "metadata/grep_annotation_column", + "repository" : { + "type" : "local" + } + }, + { + "name" : "qc/calculate_qc_metrics", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/qc/qc/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/qc/qc", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { grep_annotation_column } from "${meta.resources_dir}/../../../../nextflow/metadata/grep_annotation_column/main.nf" +include { calculate_qc_metrics } from "${meta.resources_dir}/../../../../nextflow/qc/calculate_qc_metrics/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Avoid conflict between output from component and output for this workflow + | map {id, state -> + assert state.output, "Output must be defined" + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Add default for var_qc_metrics component + | map {id, state -> + def var_qc_default = [] + // Remove the var_qc_metric argument from the state if its value is null (not specified) + def new_state = state.findAll { it.key != "var_qc_metrics" || it.value == null } + if (state.var_name_mitochondrial_genes) { + var_qc_default.add(state.var_name_mitochondrial_genes) + } + if (state.var_name_ribosomal_genes) { + var_qc_default.add(state.var_name_ribosomal_genes) + } + // Get the new state, but make sure to overwrite var_qc_metrics if the user has set it. + new_state = ["var_qc_metrics": var_qc_default.join(";")] + new_state + [id, new_state] + } + + | grep_annotation_column.run( + key: "grep_mitochondrial_genes", + runIf: { id, state -> + state.var_name_mitochondrial_genes + }, + fromState: { id, state -> + def stateMapping = [ + "input": state.input, + "modality": state.modality, + "input_column": state.var_gene_names, + "matrix": "var", + "output_match_column": state.var_name_mitochondrial_genes, + "regex_pattern": state.mitochondrial_gene_regex, + "input_layer": state.layer, + ] + stateMapping.output_fraction_column = state.obs_name_mitochondrial_fraction ? state.obs_name_mitochondrial_fraction: "fraction_$state.var_name_mitochondrial_genes" + return stateMapping + }, + toState: ["input": "output"] + ) + + | grep_annotation_column.run( + key: "grep_ribosomal_genes", + runIf: { id, state -> + state.var_name_ribosomal_genes + }, + fromState: { id, state -> + def stateMapping = [ + "input": state.input, + "modality": state.modality, + "input_column": state.var_gene_names, + "matrix": "var", + "output_match_column": state.var_name_ribosomal_genes, + "regex_pattern": state.ribosomal_gene_regex, + "input_layer": state.layer, + ] + stateMapping.output_fraction_column = state.obs_name_ribosomal_fraction ? state.obs_name_ribosomal_fraction: "fraction_$state.var_name_ribosomal_genes" + return stateMapping + }, + toState: ["input": "output"] + ) + + | calculate_qc_metrics.run( + fromState: { id, state -> + def newState = [ + "input": state.input, + "modality": state.modality, + "layer": state.layer, + // TODO: remove this workaround when Viash issue is resolved: + // 'top_n_vars': list(map(int, r''.split(';'))), + // ValueError: invalid literal for int() with base 10: '' + // See https://github.com/viash-io/viash/issues/619 + "top_n_vars": state.top_n_vars ? state.top_n_vars : null, + "var_qc_metrics_fill_na_value": state.var_qc_metrics_fill_na_value, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "output_var_num_nonzero_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.output_var_pct_dropout, + "output": state.workflow_output, + "compression": "gzip" + ] + if (state.var_qc_metrics) { + newState += ["var_qc_metrics": state.var_qc_metrics] + } + return newState + }, + toState: ["output": "output"] + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/qc/qc/nextflow.config b/target/nextflow/workflows/qc/qc/nextflow.config new file mode 100644 index 00000000..fe157978 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/qc/qc' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'A pipeline to add basic qc statistics to a MuData ' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/qc/qc/nextflow_labels.config b/target/nextflow/workflows/qc/qc/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/qc/qc/nextflow_schema.json b/target/nextflow/workflows/qc/qc/nextflow_schema.json new file mode 100644 index 00000000..82142b48 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/nextflow_schema.json @@ -0,0 +1,190 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "qc", + "description": "A pipeline to add basic qc statistics to a MuData ", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Which modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Layer to calculate qc metrics for.", + "help_text": "Type: `string`, multiple: `False`, example: `\"raw_counts\"`. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "mitochondrial & ribosomal gene detection": { + "title": "Mitochondrial & Ribosomal Gene Detection", + "type": "object", + "description": "No description", + "properties": { + "var_gene_names": { + "type": "string", + "description": ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_symbol\"`. " + }, + "var_name_mitochondrial_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_mitochondrial_fraction": { + "type": "string", + "description": ".Obs slot to store the fraction of reads found to be mitochondrial", + "help_text": "Type: `string`, multiple: `False`. " + }, + "mitochondrial_gene_regex": { + "type": "string", + "description": "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[mM][tT]-\"`. ", + "default": "^[mM][tT]-" + }, + "var_name_ribosomal_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_ribosomal_fraction": { + "type": "string", + "description": "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "ribosomal_gene_regex": { + "type": "string", + "description": "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[Mm]?[Rr][Pp][LlSs]\"`. ", + "default": "^[Mm]?[Rr][Pp][LlSs]" + } + } + }, + "qc metrics calculation options": { + "title": "QC metrics calculation options", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes", + "help_text": "Type: `string`, multiple: `True`, example: `[\"ercc,highly_variable\"]`. " + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`, default: `[50,100,200,500]`. ", + "default": [ + 50, + 100, + 200, + 500 + ] + }, + "output_obs_num_nonzero_vars": { + "type": "string", + "description": "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_vars\"`. ", + "default": "num_nonzero_vars" + }, + "output_obs_total_counts_vars": { + "type": "string", + "description": "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_num_nonzero_obs": { + "type": "string", + "description": "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_obs\"`. ", + "default": "num_nonzero_obs" + }, + "output_var_total_counts_obs": { + "type": "string", + "description": "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_obs_mean": { + "type": "string", + "description": "Name of the column in .obs providing the mean of the values in each row.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"obs_mean\"`. ", + "default": "obs_mean" + }, + "output_var_pct_dropout": { + "type": "string", + "description": "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e", + "help_text": "Type: `string`, multiple: `False`, default: `\"pct_dropout\"`. ", + "default": "pct_dropout" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/mitochondrial & ribosomal gene detection" + }, + { + "$ref": "#/$defs/qc metrics calculation options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/qc/qc/utils/errorstrat_ignore.config b/target/nextflow/workflows/qc/qc/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/qc/qc/utils/integration_tests.config b/target/nextflow/workflows/qc/qc/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/qc/qc/utils/labels.config b/target/nextflow/workflows/qc/qc/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/qc/qc/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/qc/qc/utils/labels_ci.config b/target/nextflow/workflows/qc/qc/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/qc/qc/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/rna/rna_multisample/.config.vsh.yaml b/target/nextflow/workflows/rna/rna_multisample/.config.vsh.yaml new file mode 100644 index 00000000..9578ba39 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/.config.vsh.yaml @@ -0,0 +1,485 @@ +name: "rna_multisample" +namespace: "workflows/rna" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "ID of the concatenated file" + info: null + example: + - "concatenated" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the samples." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--modality" + description: "Modality to process." + info: null + default: + - "rna" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to use. If not specified, .X is used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Filtering highly variable features" + arguments: + - type: "string" + name: "--highly_variable_features_var_output" + alternatives: + - "--filter_with_hvg_var_output" + description: "In which .var slot to store a boolean array corresponding to the\ + \ highly variable features." + info: null + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--highly_variable_features_obs_batch_key" + alternatives: + - "--filter_with_hvg_obs_batch_key" + description: "If specified, highly-variable features are selected within each\ + \ batch separately and merged. This simple \nprocess avoids the selection of\ + \ batch-specific features and acts as a lightweight batch correction method.\ + \ \nFor all flavors, featues are first sorted by how many batches they are highly\ + \ variable. For dispersion-based flavors \nties are broken by normalized dispersion.\ + \ If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank\ + \ based on within-batch normalized variance.\n" + info: null + default: + - "sample_id" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--highly_variable_features_flavor" + alternatives: + - "--filter_with_hvg_flavor" + description: "Choose the flavor for identifying highly variable features. For\ + \ the dispersion based methods\nin their default workflows, Seurat passes the\ + \ cutoffs whereas Cell Ranger passes n_top_features.\n" + info: null + default: + - "seurat" + required: false + choices: + - "seurat" + - "cell_ranger" + - "seurat_v3" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--highly_variable_features_n_top_features" + alternatives: + - "--filter_with_hvg_n_top_genes" + description: "Number of highly-variable features to keep. Mandatory if filter_with_hvg_flavor\ + \ is set to 'seurat_v3'." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "QC metrics calculation options" + arguments: + - type: "string" + name: "--var_qc_metrics" + description: "Keys to select a boolean (containing only True or False) column\ + \ from .var.\nFor each cell, calculate the proportion of total values for genes\ + \ which are labeled 'True', \ncompared to the total sum of the values for all\ + \ genes.\n" + info: null + example: + - "ercc,highly_variable" + default: + - "filter_with_hvg" + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "integer" + name: "--top_n_vars" + description: "Number of top vars to be used to calculate cumulative proportions.\n\ + If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\ + cumulative proportion to the 20th and 50th most expressed vars.\n" + info: null + default: + - 50 + - 100 + - 200 + - 500 + required: false + direction: "input" + multiple: true + multiple_sep: "," + - type: "string" + name: "--output_obs_num_nonzero_vars" + description: "Name of column in .obs describing, for each observation, the number\ + \ of stored values\n(including explicit zeroes). In other words, the name of\ + \ the column that counts\nfor each row the number of columns that contain data.\n" + info: null + default: + - "num_nonzero_vars" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_obs_total_counts_vars" + description: "Name of the column for .obs describing, for each observation (row),\n\ + the sum of the stored values in the columns.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_num_nonzero_obs" + description: "Name of column describing, for each feature, the number of stored\ + \ values\n(including explicit zeroes). In other words, the name of the column\ + \ that counts\nfor each column the number of rows that contain data.\n" + info: null + default: + - "num_nonzero_obs" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_total_counts_obs" + description: "Name of the column in .var describing, for each feature (column),\n\ + the sum of the stored values in the rows.\n" + info: null + default: + - "total_counts" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_obs_mean" + description: "Name of the column in .obs providing the mean of the values in each\ + \ row.\n" + info: null + default: + - "obs_mean" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--output_var_pct_dropout" + description: "Name of the column in .obs providing for each feature the percentage\ + \ of\nobservations the feature does not appear on (i.e. is missing). Same as\ + \ `--num_nonzero_obs`\nbut percentage based.\n" + info: null + default: + - "pct_dropout" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "RNA Scaling options" + description: "Options for enabling scaling of the log-normalized data to unit variance\ + \ and zero mean.\nThe scaled data will be output a different layer and representation\ + \ with reduced dimensions\nwill be created and stored in addition to the non-scaled\ + \ data.\n" + arguments: + - type: "boolean_true" + name: "--enable_scaling" + description: "Enable scaling for the RNA modality." + info: null + direction: "input" + - type: "string" + name: "--scaling_output_layer" + description: "Output layer where the scaled log-normalized data will be stored." + info: null + default: + - "scaled" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--scaling_max_value" + description: "Clip (truncate) data to this value after scaling. If not specified,\ + \ do not clip." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_false" + name: "--scaling_zero_center" + description: "If set, omit zero-centering variables, which allows to handle sparse\ + \ input efficiently.\"" + info: null + direction: "input" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Processing unimodal multi-sample RNA transcriptomics data." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "file" + path: "concat_test_data" +info: + image: "/images/concepts/fig_workflow_multiomics_rna_multisample.svg" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "transform/normalize_total" + repository: + type: "local" +- name: "transform/log1p" + repository: + type: "local" +- name: "feature_annotation/highly_variable_features_scanpy" + repository: + type: "local" +- name: "workflows/qc/qc" + alias: "rna_qc" + repository: + type: "local" +- name: "transform/delete_layer" + repository: + type: "local" +- name: "metadata/add_id" + repository: + type: "local" +- name: "transform/scale" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/rna/rna_multisample/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/rna/rna_multisample" + executable: "target/nextflow/workflows/rna/rna_multisample/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/transform/normalize_total" + - "target/nextflow/transform/log1p" + - "target/nextflow/feature_annotation/highly_variable_features_scanpy" + - "target/nextflow/workflows/qc/qc" + - "target/nextflow/transform/delete_layer" + - "target/nextflow/metadata/add_id" + - "target/nextflow/transform/scale" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/rna/rna_multisample/main.nf b/target/nextflow/workflows/rna/rna_multisample/main.nf new file mode 100644 index 00000000..470c4e29 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/main.nf @@ -0,0 +1,3865 @@ +// rna_multisample main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Robrecht Cannoodt (author, maintainer) +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "rna_multisample", + "namespace" : "workflows/rna", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the concatenated file", + "example" : [ + "concatenated" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the samples.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--modality", + "description" : "Modality to process.", + "default" : [ + "rna" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to use. If not specified, .X is used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering highly variable features", + "arguments" : [ + { + "type" : "string", + "name" : "--highly_variable_features_var_output", + "alternatives" : [ + "--filter_with_hvg_var_output" + ], + "description" : "In which .var slot to store a boolean array corresponding to the highly variable features.", + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--highly_variable_features_obs_batch_key", + "alternatives" : [ + "--filter_with_hvg_obs_batch_key" + ], + "description" : "If specified, highly-variable features are selected within each batch separately and merged. This simple \nprocess avoids the selection of batch-specific features and acts as a lightweight batch correction method. \nFor all flavors, featues are first sorted by how many batches they are highly variable. For dispersion-based flavors \nties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank based on within-batch normalized variance.\n", + "default" : [ + "sample_id" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--highly_variable_features_flavor", + "alternatives" : [ + "--filter_with_hvg_flavor" + ], + "description" : "Choose the flavor for identifying highly variable features. For the dispersion based methods\nin their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features.\n", + "default" : [ + "seurat" + ], + "required" : false, + "choices" : [ + "seurat", + "cell_ranger", + "seurat_v3" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--highly_variable_features_n_top_features", + "alternatives" : [ + "--filter_with_hvg_n_top_genes" + ], + "description" : "Number of highly-variable features to keep. Mandatory if filter_with_hvg_flavor is set to 'seurat_v3'.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "QC metrics calculation options", + "arguments" : [ + { + "type" : "string", + "name" : "--var_qc_metrics", + "description" : "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "example" : [ + "ercc,highly_variable" + ], + "default" : [ + "filter_with_hvg" + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "integer", + "name" : "--top_n_vars", + "description" : "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n", + "default" : [ + 50, + 100, + 200, + 500 + ], + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : "," + }, + { + "type" : "string", + "name" : "--output_obs_num_nonzero_vars", + "description" : "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n", + "default" : [ + "num_nonzero_vars" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_obs_total_counts_vars", + "description" : "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_num_nonzero_obs", + "description" : "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n", + "default" : [ + "num_nonzero_obs" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_total_counts_obs", + "description" : "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "default" : [ + "total_counts" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_obs_mean", + "description" : "Name of the column in .obs providing the mean of the values in each row.\n", + "default" : [ + "obs_mean" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--output_var_pct_dropout", + "description" : "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--num_nonzero_obs`\nbut percentage based.\n", + "default" : [ + "pct_dropout" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "RNA Scaling options", + "description" : "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--enable_scaling", + "description" : "Enable scaling for the RNA modality.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--scaling_output_layer", + "description" : "Output layer where the scaled log-normalized data will be stored.", + "default" : [ + "scaled" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--scaling_max_value", + "description" : "Clip (truncate) data to this value after scaling. If not specified, do not clip.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_false", + "name" : "--scaling_zero_center", + "description" : "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\\"", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Processing unimodal multi-sample RNA transcriptomics data.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "file", + "path" : "/resources_test/concat_test_data" + } + ], + "info" : { + "image" : "/images/concepts/fig_workflow_multiomics_rna_multisample.svg" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "transform/normalize_total", + "repository" : { + "type" : "local" + } + }, + { + "name" : "transform/log1p", + "repository" : { + "type" : "local" + } + }, + { + "name" : "feature_annotation/highly_variable_features_scanpy", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/qc/qc", + "alias" : "rna_qc", + "repository" : { + "type" : "local" + } + }, + { + "name" : "transform/delete_layer", + "repository" : { + "type" : "local" + } + }, + { + "name" : "metadata/add_id", + "repository" : { + "type" : "local" + } + }, + { + "name" : "transform/scale", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/rna/rna_multisample/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/rna/rna_multisample", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { normalize_total } from "${meta.resources_dir}/../../../../nextflow/transform/normalize_total/main.nf" +include { log1p } from "${meta.resources_dir}/../../../../nextflow/transform/log1p/main.nf" +include { highly_variable_features_scanpy } from "${meta.resources_dir}/../../../../nextflow/feature_annotation/highly_variable_features_scanpy/main.nf" +include { qc as rna_qc_viashalias } from "${meta.resources_dir}/../../../../nextflow/workflows/qc/qc/main.nf" +rna_qc = rna_qc_viashalias.run(key: "rna_qc") +include { delete_layer } from "${meta.resources_dir}/../../../../nextflow/transform/delete_layer/main.nf" +include { add_id } from "${meta.resources_dir}/../../../../nextflow/metadata/add_id/main.nf" +include { scale } from "${meta.resources_dir}/../../../../nextflow/transform/scale/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + | normalize_total.run( + fromState: { id, state -> + [ + "input": state.input, + "input_layer": state.layer, + "output_layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"], + ) + | log1p.run( + fromState: { id, state -> + [ + "input": state.input, + "output_layer": "log_normalized", + "input_layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"] + ) + | delete_layer.run( + fromState: {id, state -> + [ + "input": state.input, + "layer": "normalized", + "modality": state.modality + ] + }, + toState: ["input": "output"] + ) + | scale.run( + runIf: {id, state -> state.enable_scaling}, + fromState: {id, state -> + [ + "input": state.input, + "modality": state.modality, + "input_layer": "log_normalized", + "output_layer": state.scaling_output_layer, + "max_value": state.scaling_max_value, + "zero_center": state.scaling_zero_center, + ] + }, + toState: ["input": "output"], + ) + | highly_variable_features_scanpy.run( + fromState: {id, state -> + [ + "input": state.input, + "layer": "log_normalized", + "modality": state.modality, + "var_name_filter": state.highly_variable_features_var_output, + "n_top_features": state.highly_variable_features_n_top_features, + "flavor": state.highly_variable_features_flavor, + "obs_batch_key": state.highly_variable_features_obs_batch_key + ] + }, + toState: ["input": "output"], + ) + | rna_qc.run( + // TODO: remove when viash 0.8.3 is released + key: "rna_qc", + fromState: {id, state -> + [ + "id": id, + "input": state.input, + "output": state.workflow_output, + "layer": state.layer, // Use the non-transformed layer + "output_compression": "gzip", + "modality": state.modality, + "var_qc_metrics": state.var_qc_metrics, + "top_n_vars": state.top_n_vars, + "output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars, + "output_obs_total_counts_vars": state.output_obs_total_counts_vars, + "output_var_num_nonzero_obs": state.output_var_num_nonzero_obs, + "output_var_total_counts_obs": state.output_var_total_counts_obs, + "output_var_obs_mean": state.output_var_obs_mean, + "output_var_pct_dropout": state.output_var_pct_dropout + ] + }, + ) + | setState(["output"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/rna/rna_multisample/nextflow.config b/target/nextflow/workflows/rna/rna_multisample/nextflow.config new file mode 100644 index 00000000..f17e9883 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/rna/rna_multisample' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Processing unimodal multi-sample RNA transcriptomics data.' + author = 'Dries De Maeyer, Robrecht Cannoodt, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/rna/rna_multisample/nextflow_labels.config b/target/nextflow/workflows/rna/rna_multisample/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/rna/rna_multisample/nextflow_schema.json b/target/nextflow/workflows/rna/rna_multisample/nextflow_schema.json new file mode 100644 index 00000000..8d790f59 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/nextflow_schema.json @@ -0,0 +1,217 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "rna_multisample", + "description": "Processing unimodal multi-sample RNA transcriptomics data.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the concatenated file", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"concatenated\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the samples.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "modality": { + "type": "string", + "description": "Modality to process.", + "help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ", + "default": "rna" + }, + "layer": { + "type": "string", + "description": "Input layer to use", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "filtering highly variable features": { + "title": "Filtering highly variable features", + "type": "object", + "description": "No description", + "properties": { + "highly_variable_features_var_output": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding to the highly variable features.", + "help_text": "Type: `string`, multiple: `False`, default: `\"filter_with_hvg\"`. ", + "default": "filter_with_hvg" + }, + "highly_variable_features_obs_batch_key": { + "type": "string", + "description": "If specified, highly-variable features are selected within each batch separately and merged", + "help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ", + "default": "sample_id" + }, + "highly_variable_features_flavor": { + "type": "string", + "description": "Choose the flavor for identifying highly variable features", + "help_text": "Type: `string`, multiple: `False`, default: `\"seurat\"`, choices: ``seurat`, `cell_ranger`, `seurat_v3``. ", + "enum": [ + "seurat", + "cell_ranger", + "seurat_v3" + ], + "default": "seurat" + }, + "highly_variable_features_n_top_features": { + "type": "integer", + "description": "Number of highly-variable features to keep", + "help_text": "Type: `integer`, multiple: `False`. " + } + } + }, + "qc metrics calculation options": { + "title": "QC metrics calculation options", + "type": "object", + "description": "No description", + "properties": { + "var_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled 'True', \ncompared to the total sum of the values for all genes.\n", + "help_text": "Type: `string`, multiple: `True`, default: `[\"filter_with_hvg\"]`, example: `[\"ercc,highly_variable\"]`. ", + "default": [ + "filter_with_hvg" + ] + }, + "top_n_vars": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated", + "help_text": "Type: `integer`, multiple: `True`, default: `[50,100,200,500]`. ", + "default": [ + 50, + 100, + 200, + 500 + ] + }, + "output_obs_num_nonzero_vars": { + "type": "string", + "description": "Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_vars\"`. ", + "default": "num_nonzero_vars" + }, + "output_obs_total_counts_vars": { + "type": "string", + "description": "Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_num_nonzero_obs": { + "type": "string", + "description": "Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)", + "help_text": "Type: `string`, multiple: `False`, default: `\"num_nonzero_obs\"`. ", + "default": "num_nonzero_obs" + }, + "output_var_total_counts_obs": { + "type": "string", + "description": "Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"total_counts\"`. ", + "default": "total_counts" + }, + "output_var_obs_mean": { + "type": "string", + "description": "Name of the column in .obs providing the mean of the values in each row.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"obs_mean\"`. ", + "default": "obs_mean" + }, + "output_var_pct_dropout": { + "type": "string", + "description": "Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e", + "help_text": "Type: `string`, multiple: `False`, default: `\"pct_dropout\"`. ", + "default": "pct_dropout" + } + } + }, + "rna scaling options": { + "title": "RNA Scaling options", + "type": "object", + "description": "Options for enabling scaling of the log-normalized data to unit variance and zero mean.\nThe scaled data will be output a different layer and representation with reduced dimensions\nwill be created and stored in addition to the non-scaled data.\n", + "properties": { + "enable_scaling": { + "type": "boolean", + "description": "Enable scaling for the RNA modality.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "scaling_output_layer": { + "type": "string", + "description": "Output layer where the scaled log-normalized data will be stored.", + "help_text": "Type: `string`, multiple: `False`, default: `\"scaled\"`. ", + "default": "scaled" + }, + "scaling_max_value": { + "type": "number", + "description": "Clip (truncate) data to this value after scaling", + "help_text": "Type: `double`, multiple: `False`. " + }, + "scaling_zero_center": { + "type": "boolean", + "description": "If set, omit zero-centering variables, which allows to handle sparse input efficiently.\"", + "help_text": "Type: `boolean_false`, multiple: `False`, default: `true`. ", + "default": true + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/filtering highly variable features" + }, + { + "$ref": "#/$defs/qc metrics calculation options" + }, + { + "$ref": "#/$defs/rna scaling options" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/rna/rna_multisample/utils/errorstrat_ignore.config b/target/nextflow/workflows/rna/rna_multisample/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/rna/rna_multisample/utils/integration_tests.config b/target/nextflow/workflows/rna/rna_multisample/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/rna/rna_multisample/utils/labels.config b/target/nextflow/workflows/rna/rna_multisample/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/rna/rna_multisample/utils/labels_ci.config b/target/nextflow/workflows/rna/rna_multisample/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/rna/rna_multisample/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1 diff --git a/target/nextflow/workflows/rna/rna_singlesample/.config.vsh.yaml b/target/nextflow/workflows/rna/rna_singlesample/.config.vsh.yaml new file mode 100644 index 00000000..067e8132 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/.config.vsh.yaml @@ -0,0 +1,451 @@ +name: "rna_singlesample" +namespace: "workflows/rna" +version: "main" +authors: +- name: "Dries De Maeyer" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "ddemaeyer@gmail.com" + github: "ddemaeyer" + linkedin: "dries-de-maeyer-b46a814" + organizations: + - name: "Janssen Pharmaceuticals" + href: "https://www.janssen.com" + role: "Principal Scientist" +- name: "Robrecht Cannoodt" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + email: "robrecht@data-intuitive.com" + github: "rcannood" + orcid: "0000-0003-3641-729X" + linkedin: "robrechtcannoodt" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Science Engineer" + - name: "Open Problems" + href: "https://openproblems.bio" + role: "Core Member" +- name: "Dries Schaumont" + roles: + - "author" + info: + role: "Core Team Member" + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input" + arguments: + - type: "string" + name: "--id" + description: "ID of the sample." + info: null + example: + - "foo" + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input" + description: "Path to the sample." + info: null + example: + - "dataset.h5mu" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--layer" + description: "Input layer to start from. By default, .X will be used." + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output" + arguments: + - type: "file" + name: "--output" + description: "Destination path to the output." + info: null + example: + - "output.h5mu" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Filtering options" + arguments: + - type: "integer" + name: "--min_counts" + description: "Minimum number of counts captured per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_counts" + description: "Maximum number of counts captured per cell." + info: null + example: + - 5000000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_genes_per_cell" + description: "Minimum of non-zero values per cell." + info: null + example: + - 200 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--max_genes_per_cell" + description: "Maximum of non-zero values per cell." + info: null + example: + - 1500000 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--min_cells_per_gene" + description: "Minimum of non-zero values per gene." + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_fraction_mito" + description: "Minimum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction." + info: null + example: + - 0.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_fraction_mito" + description: "Maximum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction.\n" + info: null + example: + - 0.2 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--min_fraction_ribo" + description: "Minimum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction." + info: null + example: + - 0.0 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--max_fraction_ribo" + description: "Maximum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction.\n" + info: null + example: + - 0.2 + required: false + min: 0.0 + max: 1.0 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--skip_scrublet_doublet_detection" + description: "Skip the scrublet doublet detection step." + info: null + direction: "input" +- name: "Mitochondrial & Ribosomal Gene Detection" + arguments: + - type: "string" + name: "--var_gene_names" + description: ".var column name to be used to detect mitochondrial/ribosomal genes\ + \ instead of .var_names (default if not set).\nGene names matching with the\ + \ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\ + \ be \nidentified as mitochondrial or ribosomal genes, respectively. \n" + info: null + example: + - "gene_symbol" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_mitochondrial_genes" + description: "In which .var slot to store a boolean array corresponding the mitochondrial\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_mitochondrial_fraction" + description: "When specified, write the fraction of counts originating from mitochondrial\ + \ genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified\ + \ name.\nRequires --var_name_mitochondrial_genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--mitochondrial_gene_regex" + description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\ + By default will detect human and mouse mitochondrial genes from a gene symbol.\n" + info: null + default: + - "^[mM][tT]-" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--var_name_ribosomal_genes" + description: "In which .var slot to store a boolean array corresponding the ribosomal\ + \ genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--obs_name_ribosomal_fraction" + description: "When specified, write the fraction of counts originating from ribosomal\ + \ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\ + \ name.\nRequires --var_name_ribosomal_genes.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--ribosomal_gene_regex" + description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\ + By default will detect human and mouse ribosomal genes from a gene symbol.\n" + info: null + default: + - "^[Mm]?[Rr][Pp][LlSs]" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "utils" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +description: "Processing unimodal single-sample RNA transcriptomics data." +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +- type: "file" + path: "pbmc_1k_protein_v3" +info: + image: "/images/concepts/fig_workflow_multiomics_rna_singlesample.svg" +status: "enabled" +scope: + image: "public" + target: "public" +dependencies: +- name: "filter/filter_with_counts" + repository: + type: "local" +- name: "filter/filter_with_scrublet" + repository: + type: "local" +- name: "filter/do_filter" + repository: + type: "local" +- name: "filter/delimit_fraction" + repository: + type: "local" +- name: "workflows/qc/qc" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +build_info: + config: "src/workflows/rna/rna_singlesample/config.vsh.yaml" + runner: "nextflow" + engine: "native" + output: "target/nextflow/workflows/rna/rna_singlesample" + executable: "target/nextflow/workflows/rna/rna_singlesample/main.nf" + viash_version: "0.9.4" + git_commit: "173327cc5670aa8bd5cf473827de80b602c90092" + git_remote: "https://github.com/openpipelines-bio/openpipeline" + git_tag: "0.2.0-2055-g173327cc" + dependencies: + - "target/nextflow/filter/filter_with_counts" + - "target/nextflow/filter/filter_with_scrublet" + - "target/nextflow/filter/do_filter" + - "target/nextflow/filter/delimit_fraction" + - "target/nextflow/workflows/qc/qc" +package_config: + name: "openpipeline" + version: "main" + summary: "Best-practice workflows for single-cell multi-omics analyses.\n" + description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\ + \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\ + \nIn terms of workflows, the following has been made available, but keep in mind\ + \ that\nindividual tools and functionality can be executed as standalone components\ + \ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\ + \ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\ + \ processing: cell filtering and doublet detection.\n * Multisample processing:\ + \ Count transformation, normalization, QC metric calulations.\n * Integration:\ + \ Clustering, integration and batch correction using single and multimodal methods.\n\ + \ * Downstream analysis workflows\n" + info: + test_resources: + - type: "s3" + path: "s3://openpipelines-data" + dest: "resources_test" + nextflow_labels_ci: + - path: "src/workflows/utils/labels_ci.config" + description: "Adds the correct memory and CPU labels when running on the Viash\ + \ Hub CI." + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\ + .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\ + )'\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'main'" + keywords: + - "single-cell" + - "multimodal" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/openpipelines-bio/openpipeline" + docker_registry: "ghcr.io" + homepage: "https://openpipelines.bio" + documentation: "https://openpipelines.bio/fundamentals" + issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues" diff --git a/target/nextflow/workflows/rna/rna_singlesample/main.nf b/target/nextflow/workflows/rna/rna_singlesample/main.nf new file mode 100644 index 00000000..96a44488 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/main.nf @@ -0,0 +1,3923 @@ +// rna_singlesample main +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries De Maeyer (author) +// * Robrecht Cannoodt (author, maintainer) +// * Dries Schaumont (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "rna_singlesample", + "namespace" : "workflows/rna", + "version" : "main", + "authors" : [ + { + "name" : "Dries De Maeyer", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "ddemaeyer@gmail.com", + "github" : "ddemaeyer", + "linkedin" : "dries-de-maeyer-b46a814" + }, + "organizations" : [ + { + "name" : "Janssen Pharmaceuticals", + "href" : "https://www.janssen.com", + "role" : "Principal Scientist" + } + ] + } + }, + { + "name" : "Robrecht Cannoodt", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "robrecht@data-intuitive.com", + "github" : "rcannood", + "orcid" : "0000-0003-3641-729X", + "linkedin" : "robrechtcannoodt" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Science Engineer" + }, + { + "name" : "Open Problems", + "href" : "https://openproblems.bio", + "role" : "Core Member" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "author" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "ID of the sample.", + "example" : [ + "foo" + ], + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input", + "description" : "Path to the sample.", + "example" : [ + "dataset.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--layer", + "description" : "Input layer to start from. By default, .X will be used.", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "Destination path to the output.", + "example" : [ + "output.h5mu" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Filtering options", + "arguments" : [ + { + "type" : "integer", + "name" : "--min_counts", + "description" : "Minimum number of counts captured per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_counts", + "description" : "Maximum number of counts captured per cell.", + "example" : [ + 5000000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_genes_per_cell", + "description" : "Minimum of non-zero values per cell.", + "example" : [ + 200 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--max_genes_per_cell", + "description" : "Maximum of non-zero values per cell.", + "example" : [ + 1500000 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--min_cells_per_gene", + "description" : "Minimum of non-zero values per gene.", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_fraction_mito", + "description" : "Minimum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction.", + "example" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_fraction_mito", + "description" : "Maximum fraction of UMIs that are mitochondrial. Requires --obs_name_mitochondrial_fraction.\n", + "example" : [ + 0.2 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--min_fraction_ribo", + "description" : "Minimum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction.", + "example" : [ + 0.0 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--max_fraction_ribo", + "description" : "Maximum fraction of UMIs that are ribosomal. Requires --obs_name_ribosomal_fraction.\n", + "example" : [ + 0.2 + ], + "required" : false, + "min" : 0.0, + "max" : 1.0, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--skip_scrublet_doublet_detection", + "description" : "Skip the scrublet doublet detection step.", + "direction" : "input" + } + ] + }, + { + "name" : "Mitochondrial & Ribosomal Gene Detection", + "arguments" : [ + { + "type" : "string", + "name" : "--var_gene_names", + "description" : ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively. \n", + "example" : [ + "gene_symbol" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_mitochondrial_genes", + "description" : "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_mitochondrial_fraction", + "description" : "When specified, write the fraction of counts originating from mitochondrial genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified name.\nRequires --var_name_mitochondrial_genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--mitochondrial_gene_regex", + "description" : "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "default" : [ + "^[mM][tT]-" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--var_name_ribosomal_genes", + "description" : "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--obs_name_ribosomal_fraction", + "description" : "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--ribosomal_gene_regex", + "description" : "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "default" : [ + "^[Mm]?[Rr][Pp][LlSs]" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/" + }, + { + "type" : "file", + "path" : "/src/workflows/utils/labels.config", + "dest" : "nextflow_labels.config" + } + ], + "description" : "Processing unimodal single-sample RNA transcriptomics data.", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + }, + { + "type" : "file", + "path" : "/resources_test/pbmc_1k_protein_v3" + } + ], + "info" : { + "image" : "/images/concepts/fig_workflow_multiomics_rna_singlesample.svg" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "dependencies" : [ + { + "name" : "filter/filter_with_counts", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/filter_with_scrublet", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/do_filter", + "repository" : { + "type" : "local" + } + }, + { + "name" : "filter/delimit_fraction", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/qc/qc", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/rna/rna_singlesample/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native", + "output" : "/workdir/root/repo/target/nextflow/workflows/rna/rna_singlesample", + "viash_version" : "0.9.4", + "git_commit" : "173327cc5670aa8bd5cf473827de80b602c90092", + "git_remote" : "https://github.com/openpipelines-bio/openpipeline", + "git_tag" : "0.2.0-2055-g173327cc" + }, + "package_config" : { + "name" : "openpipeline", + "version" : "main", + "summary" : "Best-practice workflows for single-cell multi-omics analyses.\n", + "description" : "OpenPipelines are extensible single cell analysis pipelines for reproducible and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\nIn terms of workflows, the following has been made available, but keep in mind that\nindividual tools and functionality can be executed as standalone components as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n * Ingestion: Read mapping and generating a count matrix.\n * Single sample processing: cell filtering and doublet detection.\n * Multisample processing: Count transformation, normalization, QC metric calulations.\n * Integration: Clustering, integration and batch correction using single and multimodal methods.\n * Downstream analysis workflows\n", + "info" : { + "test_resources" : [ + { + "type" : "s3", + "path" : "s3://openpipelines-data", + "dest" : "resources_test" + } + ], + "nextflow_labels_ci" : [ + { + "path" : "src/workflows/utils/labels_ci.config", + "description" : "Adds the correct memory and CPU labels when running on the Viash Hub CI." + } + ] + }, + "viash_version" : "0.9.4", + "source" : "/workdir/root/repo/src", + "target" : "/workdir/root/repo/target", + "config_mods" : [ + ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n.runners[.type == 'nextflow'].config.script := 'includeConfig(\\"nextflow_labels.config\\")'\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'main'" + ], + "keywords" : [ + "single-cell", + "multimodal" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/openpipelines-bio/openpipeline", + "docker_registry" : "ghcr.io", + "homepage" : "https://openpipelines.bio", + "documentation" : "https://openpipelines.bio/fundamentals", + "issue_tracker" : "https://github.com/openpipelines-bio/openpipeline/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { filter_with_counts } from "${meta.resources_dir}/../../../../nextflow/filter/filter_with_counts/main.nf" +include { filter_with_scrublet } from "${meta.resources_dir}/../../../../nextflow/filter/filter_with_scrublet/main.nf" +include { do_filter } from "${meta.resources_dir}/../../../../nextflow/filter/do_filter/main.nf" +include { delimit_fraction } from "${meta.resources_dir}/../../../../nextflow/filter/delimit_fraction/main.nf" +include { qc } from "${meta.resources_dir}/../../../../nextflow/workflows/qc/qc/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Check for correctness of mitochondrial and ribosomal gene detection arguments + | map { id, state -> + def new_state = [:] + if (state.obs_name_mitochondrial_fraction && !state.var_name_mitochondrial_genes) { + throw new RuntimeException("Using --obs_name_mitochondrial_fraction requires --var_name_mitochondrial_genes.") + } + if (!state.obs_name_mitochondrial_fraction && state.var_name_mitochondrial_genes) { + new_state.obs_name_mitochondrial_fraction = "fraction_${state.var_name_mitochondrial_genes}" + } + if ((state.min_fraction_mito != null || state.max_fraction_mito != null) && !state.var_name_mitochondrial_genes) { + throw new RuntimeException("Enabling --min_fraction_mito or --max_fraction_mito requires --var_name_mitochondrial_genes.") + } + if (state.var_gene_names && !state.var_name_mitochondrial_genes && !state.var_name_ribosomal_genes) { + System.err.println("Warning: --var_gene_names is only required for mitochondrial gene detection and does nothing while \ + not also setting --var_name_mitochondrial_genes or --var_name_ribosomal_genes.") + } + if (state.mitochondrial_gene_regex && !state.var_name_mitochondrial_genes && !state.var_name_ribosomal_genes) { + System.err.println("Warning: --mitochondrial_gene_regex is only required for mitochondrial gene detection and does \ + nothing while not also setting --var_name_mitochondrial_genes or --var_name_ribosomal_genes.") + } + if (state.obs_name_ribosomal_fraction && !state.var_name_ribosomal_genes) { + throw new RuntimeException("Using --obs_name_ribosomal_fraction requires --var_name_ribosomal_genes.") + } + if (!state.obs_name_ribosomal_fraction && state.var_name_ribosomal_genes) { + new_state.obs_name_ribosomal_fraction = "fraction_${state.var_name_ribosomal_genes}" + } + if ((state.min_fraction_ribo != null || state.max_fraction_ribo != null) && !state.var_name_ribosomal_genes) { + throw new RuntimeException("Enabling --min_fraction_ribo or --max_fraction_ribo requires --var_name_ribosomal_genes.") + } + [id, state + new_state] + } + | qc.run( + fromState: { id, state -> + // The rna singlesample processing allows detecting mitochondrial genes and filtering based + // on the fraction of mitochondrial genes per cell + // This behaviour is optional based on the presence of var_name_mitochondrial_genes + // The behavior of other components must be tuned to this argument as well + def args = [ + "id": id, + "input": state.input, + // disable other qc metric calculations + // only mitochondrial gene detection is required at this point + "top_n_vars": [], + "output_obs_num_nonzero_vars": null, + "output_obs_total_counts_vars": null, + "output_var_num_nonzero_obs": null, + "output_var_total_counts_obs": null, + "output_var_obs_mean": null, + "output_var_pct_dropout": null, + "output": state.output, + "modality": "rna", + "layer": state.layer, + ] + + if (state.var_name_mitochondrial_genes) { + // Check if user has defined var columns to calculate metrics + def new_var_qc_metrics = state.var_qc_metrics != null ? state.var_qc_metrics : [] + assert new_var_qc_metrics instanceof List + // Add the mitochondrial genes var column to the columns to calculate statistics for if set. + new_var_qc_metrics = ((new_var_qc_metrics as Set) + [state.var_name_mitochondrial_genes]) as List + + args += [ + "var_qc_metrics": new_var_qc_metrics, + "obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction, + "var_gene_names": state.var_gene_names, + "var_name_mitochondrial_genes": state.var_name_mitochondrial_genes, + "mitochondrial_gene_regex": state.mitochondrial_gene_regex + ] + } + + if (state.var_name_ribosomal_genes) { + // Check if user has defined var columns to calculate metrics + def new_var_qc_metrics = state.var_qc_metrics != null ? state.var_qc_metrics : [] + assert new_var_qc_metrics instanceof List + // Add the ribosomal genes var column to the columns to calculate statistics for if set. + new_var_qc_metrics = ((new_var_qc_metrics as Set) + [state.var_name_ribosomal_genes]) as List + + args += [ + "var_qc_metrics": new_var_qc_metrics, + "obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction, + "var_gene_names": state.var_gene_names, + "var_name_ribosomal_genes": state.var_name_ribosomal_genes, + "ribosomal_gene_regex": state.ribosomal_gene_regex + ] + } + + return args + }, + toState: ["input": "output"] + ) + | delimit_fraction.run( + runIf: {id, state -> state.var_name_mitochondrial_genes}, + fromState: {id, state -> + [ + "input": state.input, + "obs_name_filter": "filter_mitochondrial", + "min_fraction": state.min_fraction_mito, + "max_fraction": state.max_fraction_mito, + "obs_fraction_column": state.obs_name_mitochondrial_fraction, + ] + }, + toState: ["input": "output"] + ) + | delimit_fraction.run( + runIf: {id, state -> state.var_name_ribosomal_genes}, + fromState: {id, state -> + [ + "input": state.input, + "obs_name_filter": "filter_ribosomal", + "min_fraction": state.min_fraction_ribo, + "max_fraction": state.max_fraction_ribo, + "obs_fraction_column": state.obs_name_ribosomal_fraction, + ] + }, + toState: ["input": "output"] + ) + // cell filtering + | filter_with_counts.run( + key: "rna_filter_with_counts", + fromState: { id, state -> + [ + "input": state.input, + "layer": state.layer, + "obs_name_filter": "filter_with_counts", + "var_name_filter": "filter_with_counts", + "min_counts": state.min_counts, + "max_counts": state.max_counts, + "min_genes_per_cell": state.min_genes_per_cell, + "max_genes_per_cell": state.max_genes_per_cell, + "min_cells_per_gene": state.min_cells_per_gene, + ] + }, + toState: ["input": "output"] + ) + | do_filter.run( + key: "rna_do_filter", + fromState: {id, state -> + // do_filter does not need a layer argument because it filters all layers + // from a modality. + def stateMapping = [ + input: state.input, + var_filter: ["filter_with_counts"], + // If scrublet is skipped, the output should be set to the workflow output + output: state.workflow_output + ] + def obs_filter = ["filter_with_counts"] + if (state.var_name_mitochondrial_genes) { + obs_filter += ["filter_mitochondrial"] + } + if (state.var_name_ribosomal_genes) { + obs_filter += ["filter_ribosomal"] + } + stateMapping += ["obs_filter": obs_filter] + return stateMapping + }, + toState: ["input": "output"] + ) + // doublet calling + | filter_with_scrublet.run( + runIf: { id, state -> + !state.skip_scrublet_doublet_detection + }, + fromState: [ + "input": "input", + "layer": "layer", + "output": "workflow_output" + ], + args: [output_compression: "gzip"], + toState: ["input": "output"] + ) + | setState(["output": "input"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/rna/rna_singlesample/nextflow.config b/target/nextflow/workflows/rna/rna_singlesample/nextflow.config new file mode 100644 index 00000000..77392b28 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/rna/rna_singlesample' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'main' + description = 'Processing unimodal single-sample RNA transcriptomics data.' + author = 'Dries De Maeyer, Robrecht Cannoodt, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/rna/rna_singlesample/nextflow_labels.config b/target/nextflow/workflows/rna/rna_singlesample/nextflow_labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/nextflow_labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/rna/rna_singlesample/nextflow_schema.json b/target/nextflow/workflows/rna/rna_singlesample/nextflow_schema.json new file mode 100644 index 00000000..876b7f59 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/nextflow_schema.json @@ -0,0 +1,177 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "rna_singlesample", + "description": "Processing unimodal single-sample RNA transcriptomics data.", + "type": "object", + "$defs": { + "input": { + "title": "Input", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "ID of the sample.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"foo\"`. " + }, + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the sample.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"dataset.h5mu\"`. " + }, + "layer": { + "type": "string", + "description": "Input layer to start from", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "output": { + "title": "Output", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "Destination path to the output.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ", + "default": "$id.$key.output.h5mu" + } + } + }, + "filtering options": { + "title": "Filtering options", + "type": "object", + "description": "No description", + "properties": { + "min_counts": { + "type": "integer", + "description": "Minimum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_counts": { + "type": "integer", + "description": "Maximum number of counts captured per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `5000000`. " + }, + "min_genes_per_cell": { + "type": "integer", + "description": "Minimum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `200`. " + }, + "max_genes_per_cell": { + "type": "integer", + "description": "Maximum of non-zero values per cell.", + "help_text": "Type: `integer`, multiple: `False`, example: `1500000`. " + }, + "min_cells_per_gene": { + "type": "integer", + "description": "Minimum of non-zero values per gene.", + "help_text": "Type: `integer`, multiple: `False`, example: `3`. " + }, + "min_fraction_mito": { + "type": "number", + "description": "Minimum fraction of UMIs that are mitochondrial", + "help_text": "Type: `double`, multiple: `False`, example: `0.0`. " + }, + "max_fraction_mito": { + "type": "number", + "description": "Maximum fraction of UMIs that are mitochondrial", + "help_text": "Type: `double`, multiple: `False`, example: `0.2`. " + }, + "min_fraction_ribo": { + "type": "number", + "description": "Minimum fraction of UMIs that are ribosomal", + "help_text": "Type: `double`, multiple: `False`, example: `0.0`. " + }, + "max_fraction_ribo": { + "type": "number", + "description": "Maximum fraction of UMIs that are ribosomal", + "help_text": "Type: `double`, multiple: `False`, example: `0.2`. " + }, + "skip_scrublet_doublet_detection": { + "type": "boolean", + "description": "Skip the scrublet doublet detection step.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "mitochondrial & ribosomal gene detection": { + "title": "Mitochondrial & Ribosomal Gene Detection", + "type": "object", + "description": "No description", + "properties": { + "var_gene_names": { + "type": "string", + "description": ".var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively", + "help_text": "Type: `string`, multiple: `False`, example: `\"gene_symbol\"`. " + }, + "var_name_mitochondrial_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the mitochondrial genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_mitochondrial_fraction": { + "type": "string", + "description": "When specified, write the fraction of counts originating from mitochondrial genes \n(based on --mitochondrial_gene_regex) to an .obs column with the specified name.\nRequires --var_name_mitochondrial_genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "mitochondrial_gene_regex": { + "type": "string", + "description": "Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[mM][tT]-\"`. ", + "default": "^[mM][tT]-" + }, + "var_name_ribosomal_genes": { + "type": "string", + "description": "In which .var slot to store a boolean array corresponding the ribosomal genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "obs_name_ribosomal_fraction": { + "type": "string", + "description": "When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n", + "help_text": "Type: `string`, multiple: `False`. " + }, + "ribosomal_gene_regex": { + "type": "string", + "description": "Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n", + "help_text": "Type: `string`, multiple: `False`, default: `\"^[Mm]?[Rr][Pp][LlSs]\"`. ", + "default": "^[Mm]?[Rr][Pp][LlSs]" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input" + }, + { + "$ref": "#/$defs/output" + }, + { + "$ref": "#/$defs/filtering options" + }, + { + "$ref": "#/$defs/mitochondrial & ribosomal gene detection" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/rna/rna_singlesample/utils/errorstrat_ignore.config b/target/nextflow/workflows/rna/rna_singlesample/utils/errorstrat_ignore.config new file mode 100644 index 00000000..6b4b0293 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/utils/errorstrat_ignore.config @@ -0,0 +1 @@ +process.errorStrategy = 'ignore' \ No newline at end of file diff --git a/target/nextflow/workflows/rna/rna_singlesample/utils/integration_tests.config b/target/nextflow/workflows/rna/rna_singlesample/utils/integration_tests.config new file mode 100644 index 00000000..59d5b092 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/utils/integration_tests.config @@ -0,0 +1,36 @@ +profiles { + + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/rna/rna_singlesample/utils/labels.config b/target/nextflow/workflows/rna/rna_singlesample/utils/labels.config new file mode 100644 index 00000000..94570ec6 --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/utils/labels.config @@ -0,0 +1,48 @@ +process { + // Default resources for components that hardly do any processing + memory = { 2.GB * task.attempt } + cpus = 1 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + + // The memory a task is assinged increases with each attempt + // uncomment the line below and adjust the value to set a global upper limit on the memory. + // resourceLimits = [ memory: 240.Gb ] + + // CPU resources + withLabel: singlecpu { cpus = 1 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 10 } + withLabel: highcpu { cpus = 20 } + + // Memory resources + withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } } + withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } } + withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } } + withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } } + + // Disk space + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } + + // NOTE: The above labels intentionally do not have an effect by default. + // The user should set the disk space requirements by adding the following + // to the compute environment: + // + // withLabel: lowdisk { disk = { 20.GB * task.attempt } } + // withLabel: middisk { disk = { 100.GB * task.attempt } } + // withLabel: highdisk { disk = { 200.GB * task.attempt } } + // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } } +} diff --git a/target/nextflow/workflows/rna/rna_singlesample/utils/labels_ci.config b/target/nextflow/workflows/rna/rna_singlesample/utils/labels_ci.config new file mode 100644 index 00000000..aee866db --- /dev/null +++ b/target/nextflow/workflows/rna/rna_singlesample/utils/labels_ci.config @@ -0,0 +1,33 @@ +process { + withLabel: lowmem { memory = 13.Gb } + withLabel: lowcpu { cpus = 4 } + withLabel: midmem { memory = 13.Gb } + withLabel: midcpu { cpus = 4 } + withLabel: highmem { memory = 13.Gb } + withLabel: highcpu { cpus = 4 } + withLabel: veryhighmem { memory = 13.Gb } + withLabel: lowdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: middisk { + disk = {process.disk ? process.disk : null} + } + withLabel: highdisk { + disk = {process.disk ? process.disk : null} + } + withLabel: veryhighdisk { + disk = {process.disk ? process.disk : null} + } +} + +env.NUMBA_CACHE_DIR = '/tmp' + +trace { + enabled = true + overwrite = true +} +dag { + overwrite = true +} + +process.maxForks = 1